Uploading new version and adding pending files
fhalamos committed Sep 14, 2020
1 parent 4e29e43 commit e2c208d
Showing 30 changed files with 7,194 additions and 301 deletions.
17 changes: 9 additions & 8 deletions README.md
@@ -17,14 +17,17 @@ Once the PIIs are identified, users have the opportunity to say what they would

### Files included

#### Main files
* app_frontend.py: App GUI script using tkinter.
* PII_data_processor.py: App backend; reads data files, identifies PIIs, and creates new de-identified data files.
* find_piis_in_unstructured_text.py: Script used by PII_data_processor.py to detect PIIs in unstructured text.

### Other utility files
* restricted_words.py: Script to get restricted words for PII identification.
* app_frontend.py: App frontend, built with Python's tkinter.
* constant_strings.py: Declares strings used across the app.
* query_google_answer_boxes.py: Script to query locations and their populations.
* dist folder: Contains the .exe file for execution.

In addition, a small app to find PIIs in unstructured text is offered:
* find_piis_in_unstructured_text_backend.py
* find_piis_in_unstructured_text_frontend.py
* hook-spacy.py: PyInstaller hook file needed when creating the .exe
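
For reference, here is a minimal sketch of how the de-identification backend can be driven from a script. Function and constant names are taken from `PII_data_processor.py` and `find_piis_in_unstructured_text.py` as added in this commit; the dataset path is illustrative.

```python
from PII_data_processor import import_file
from find_piis_in_unstructured_text import find_piis
from constant_strings import DATASET, LABEL_DICT, SPANISH
import restricted_words as restricted_words_list

# Illustrative path; point this at a real survey export
status, content = import_file("my_survey_data.dta")
if status:
    dataset = content[DATASET]
    label_dict = content[LABEL_DICT]
    # Skip SurveyCTO metadata columns when scanning for PIIs
    columns = [c for c in dataset.columns
               if c not in restricted_words_list.get_surveycto_restricted_vars()]
    piis = find_piis(dataset, label_dict, columns, SPANISH)
    print(piis)
```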

### Help and Support

@@ -47,6 +50,4 @@ J-PAL: PII-Scan. 2017. https://github.com/J-PAL/PII-Scan
The PII script is [MIT Licensed](https://github.com/PovertyAction/PII_detection/blob/master/LICENSE).

### To create a .exe from the source files
`pyinstaller --onefile --windowed --icon=app.ico --add-data="app.ico;." --add-data="ipa_logo.jpg;." app_frontend.py`

The second command additionally passes `--additional-hooks-dir=.` (so PyInstaller picks up hook-spacy.py) and `--hiddenimport srsly.msgpack.util`, which are needed to bundle the spaCy dependencies:

`pyinstaller --onefile --windowed --icon=app.ico --add-data="app.ico;." --add-data="ipa_logo.jpg;." --additional-hooks-dir=. --hiddenimport srsly.msgpack.util app_frontend.py`
2 changes: 1 addition & 1 deletion app_frontend.py
@@ -13,7 +13,7 @@

intro_text = "This script is meant to assist in the detection of PII (personally identifiable information) and subsequent removal from a dataset. This is an alpha program, not fully tested yet."
intro_text_p2 = "You will first load a dataset that might contain PII variables. The system will try to identify the PII candidates. Please indicate if you would like to Drop, Encode or Keep them to then generate a new de-identified dataset."#, built without access to datasets containing PII on which to test or train it. Please help improve the program by filling out the survey on your experience using it (Help -> Provide Feedback)."
app_title = "IPA's PII Detector - v0.2.11"
app_title = "IPA's PII Detector - v0.2.12"

window_width = 1086
window_height = 666
Binary file not shown.
251 changes: 251 additions & 0 deletions find_piis_in_unstructured_text.py
@@ -0,0 +1,251 @@
from PII_data_processor import column_has_sufficiently_sparse_strings, clean_column, import_file, export
from constant_strings import *
import restricted_words as restricted_words_list
import query_google_answer_boxes as google
import requests
from secret_keys import get_forebears_api_key
import json
from datetime import datetime
import spacy

def get_stopwords(languages=None):

from os import listdir
from os.path import isfile, join

stopwords_path = './stopwords/'
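    #Each file in ./stopwords/ is assumed to be named after a language (e.g. 'english')
    #and to contain one stopword per line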

#If no language selected, get all stopwords
    if languages is None:
stopwords_files = [join(stopwords_path, f) for f in listdir(stopwords_path) if isfile(join(stopwords_path, f))]
else: #Select only stopwords files for given languages
stopwords_files = [join(stopwords_path, language) for language in languages if isfile(join(stopwords_path, language))]

stopwords_list = []
for file_path in stopwords_files:
with open(file_path, 'r', encoding="utf-8") as reader:
stopwords = reader.read().split('\n')
stopwords_list.extend(stopwords)

return list(set(stopwords_list))

def remove_stopwords(strings_list, languages=['english','spanish']):
stop_words = get_stopwords(languages)
    strings_list = [s for s in list(strings_list) if s not in stop_words]
return strings_list

def find_phone_numbers_in_list_strings(list_strings):
    import re

    #Matches formats like "5551234567", "555-123-4567", "(555) 123-4567" or "123-4567"
    phone_n_regex_str = r"(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})"
    phone_n_regex = re.compile(phone_n_regex_str)
    phone_numbers_found = list(filter(phone_n_regex.match, list_strings))

    return phone_numbers_found


def generate_names_parameter_for_api(list_names, option):
#According to https://forebears.io/onograph/documentation/api/location/batch
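    #For example (illustrative), list_names=['Maria'] with option='forename' produces:
    #  '[{"name":"Maria","type":"forename","limit":1}]'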

list_of_names_json=[]
for name in list_names:
list_of_names_json.append('{"name":"'+name+'","type":"'+option+'","limit":1}')

names_parameter = '['+','.join(list_of_names_json)+']'
return names_parameter

def get_names_from_json_response(response):

names_found = []

json_response = json.loads(response)
for result in json_response["results"]:
#Names that exist come with the field 'jurisdictions'
if('jurisdictions' in result):
names_found.append(result['name'])
# else:
# print(result['name']+" is not a name")

return names_found

def filter_based_type_of_word(list_strings, language):

if language == SPANISH:
nlp = spacy.load("es_core_news_sm")

else:
nlp = spacy.load("en_core_web_sm")


#Accepted types of words
#Reference https://spacy.io/api/annotation#pos-tagging
accepted_types = ['PROPN', 'X','PER','LOC','ORG','MISC','']

filtered_list = []
for string in list_strings:
doc = nlp(string)
for token in doc:
if token.pos_ in accepted_types:
filtered_list.append(token.text)

return list(set(filtered_list))

def find_names_in_list_string(list_potential_names):
'''
Uses https://forebears.io/onograph/documentation/api/location/batch to find names in list_potential_names
    If this approach seems to be slow or inaccurate, an alternative is to use spaCy:
        import spacy
        string = "my name is felipe"
        nlp = spacy.load("en_core_web_md")
        doc = nlp(string)
        for token in doc:
            if token.ent_type_ == 'PERSON':
                print(token.text + " is a name")
'''
API_KEY = get_forebears_api_key()

all_names_found = set()

#Api calls must query at most 1,000 names.
n = 1000
list_of_list_1000_potential_names = [list_potential_names[i:i + n] for i in range(0, len(list_potential_names), n)]

for list_1000_potential_names in list_of_list_1000_potential_names:
        #Need to do 2 API calls, one checking forenames and one checking surnames
for forename_or_surname in ['forename', 'surname']:
api_url = 'https://ono.4b.rs/v1/jurs?key='+API_KEY

names_parameter = generate_names_parameter_for_api(list_1000_potential_names, forename_or_surname)

response = requests.post(api_url, data={'names':names_parameter})

names_found = get_names_from_json_response(response.text)
for name in names_found:
all_names_found.add(name)

return list(all_names_found)


def get_list_unique_strings_in_dataset(dataset, columns_to_check):
#To make the list, we will go over all columns that have sparse strings
set_string_in_dataset = set()

#For every column in the dataset
for column_name in columns_to_check:
#If column contains strings
if(column_has_sufficiently_sparse_strings(dataset, column_name)):

#Clean column
column = clean_column(dataset[column_name])

for row in column:
#If row contains more than one word, add each word
if (' ' in row):
#For every word in the row
for word in row.split(" "):
#Add word to strings to check
set_string_in_dataset.add(word)
                #If row does not contain spaces, add the whole row (it's only one string)
else:
set_string_in_dataset.add(row)

return list(set_string_in_dataset)

def find_piis(dataset, label_dict, columns_to_check_not_filtered, language):

#Filter columns to those that have sparse entries
columns_to_check = []
for column_name in columns_to_check_not_filtered:
if column_has_sufficiently_sparse_strings(dataset, column_name):
columns_to_check.append(column_name)

print("columns_to_check")
print(columns_to_check)

#Do not check surveyCTO columns
#columns_to_check = [column for column in dataset.columns if column not in restricted_words_list.get_surveycto_restricted_vars()]

#First we will make a list of all strings that need to be checked
print("->Getting list of unique strings in dataset...")
strings_to_check = get_list_unique_strings_in_dataset(dataset, columns_to_check)

    #Remove strings with fewer than 3 characters - PIIs should be longer than that
print("->Removing strings with less than 3 characters")
strings_to_check = [s for s in strings_to_check if len(s)>2]

#Find all telephone numbers
print("-->Finding phone numbers")
phone_numbers_found = find_phone_numbers_in_list_strings(strings_to_check)
print("found "+str(len(phone_numbers_found)))

#Update strings_to_check
strings_to_check = [s for s in strings_to_check if s not in phone_numbers_found]

#Clean list of words, now that we have already found numbers
print("Length of list "+str(len(strings_to_check)))
print("->Removing stopwords")
strings_to_check = remove_stopwords(strings_to_check)
print("->Filtering based on word type")
strings_to_check = filter_based_type_of_word(strings_to_check, language)
print("Length of list "+str(len(strings_to_check)))

#Find all names
print("->Finding names")
names_found = find_names_in_list_string(strings_to_check)
print("found "+str(len(names_found)))
print(names_found)
#Update strings_to_check
strings_to_check = [s for s in strings_to_check if s not in names_found]

#Find all locations with pop less than 20,000
print("-->Finding locations with low population")
locations_with_low_population_found = google.get_locations_with_low_population(strings_to_check)
print("found "+str(len(locations_with_low_population_found)))
print(locations_with_low_population_found)

    piis_found = list(set(phone_numbers_found + names_found + locations_with_low_population_found))

    # #Replace the PIIs found in the dataset
    # print("->Replacing PIIs in new dataset")
    # now = datetime.now()
    # current_time = now.strftime("%H:%M:%S")
    # print("Current Time =", current_time)
    # deidentified_dataset = dataset.replace(piis_found, 'XXXX', regex=True)

    # #Save new dataframe
    # print("->Exporting new dataset")
    # now = datetime.now()
    # current_time = now.strftime("%H:%M:%S")
    # print("Current Time =", current_time)
    # new_file_path = export(deidentified_dataset, dataset_path)

    print("Task ready!")

    return piis_found


if __name__ == "__main__":

    dataset_path = r'X:\Box Sync\GRDS_Resources\Data Science\Test data\Raw\RECOVR_MEX_r1_Raw.dta'

reading_status, reading_content = import_file(dataset_path)

    if reading_status is False:
        print("Problem importing file")
        raise SystemExit

dataset = reading_content[DATASET]
label_dict = reading_content[LABEL_DICT]

columns_to_check = [c for c in dataset.columns if c not in restricted_words_list.get_surveycto_restricted_vars()]

    #The RECOVR_MEX test dataset is assumed to be in Spanish
    find_piis(dataset, label_dict, columns_to_check, SPANISH)

# print(find_names_in_list_string(['Felipe','nombrequenoexiste', 'George', 'Felipe', 'Enriqueta', 'dededede']))
