
Commit 9777b17
Merge pull request #28 from redhuntlabs/saudi-pii-pack
Saudi PII pack
0x4f53 authored Nov 23, 2023
2 parents 9b266f5 + 0ec2088 commit 9777b17
Showing 4 changed files with 65 additions and 3 deletions.
File renamed without changes.
54 changes: 54 additions & 0 deletions definitions.json
@@ -151,6 +151,60 @@
"<<<<"
]
},
"Resident Identity (Iqama)": {
"regex":null,
"region":"Saudi Arabia",
"keywords":[
"Kingdom",
"Saudi",
"Arabia",
"Ministry",
"Interior",
"Permit",
"Iqama",
"Residen",
"Identity"
]
},
"Saudi Driver's License": {
"regex":"\b[0-9]{10}\b",
"region":"Saudi Arabia",
"keywords":[
"Kingdom",
"Saudi",
"Arabia",
"Ministry",
"Interior",
"Driving",
"License"
]
},
"Saudi Arabian Visa": {
"regex":"(?:V<SAU)(?:[A-Z0-9<].+)",
"region":"Saudi Arabia",
"keywords":[
"Visa",
"Saudi Arabia",
"V<SAU",
"<<<<",
"Entries",
"Permitted",
"Work",
"Validity"
]
},
"Tawuniya Health Insurance": {
"regex":"\b[0-9]{5}\b",
"region":"Saudi Arabia",
"keywords":[
"Tawuniya",
"Policy",
"Holder",
"Number",
"Deductible",
"Approval"
]
},
"Nebraska Driver's License": {
"regex":"[A-Z]{1}[0-9]{9,11}",
"region":"United States",
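
For orientation, here is a minimal sketch of how a scanner could consume one of the new entries. The regex/region/keywords structure comes from the diff above; the loading and matching logic below is an illustrative assumption, not this repository's actual implementation.

import json
import re

# Illustrative sketch only: apply one definitions.json entry to a piece of text.
# The entry layout is from the diff above; this matching logic is assumed.
with open("definitions.json") as f:
    definitions = json.load(f)

def match_definition(name, text):
    entry = definitions[name]
    # Count keyword hits; entries like the Iqama set "regex": null and
    # rely on keyword density alone.
    hits = [kw for kw in entry["keywords"] if kw.lower() in text.lower()]
    ids = re.findall(entry["regex"], text) if entry["regex"] else []
    return {"region": entry["region"], "keyword_hits": hits, "identifiers": ids}

sample = "Kingdom of Saudi Arabia - Ministry of Interior Driving License No. 2345678901"
print(match_definition("Saudi Driver's License", sample))
# e.g. {'region': 'Saudi Arabia', 'keyword_hits': [...], 'identifiers': ['2345678901']}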
1 change: 1 addition & 0 deletions requirements.txt
@@ -17,3 +17,4 @@ nltk
 bs4
 requests
 geotext
+spacy
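
requirements.txt now pulls in spacy, though this diff does not show where it is called. If it is used the usual way for named-entity recognition, the pattern would look roughly like the sketch below; the model name and call sequence are assumptions, not code from this commit.

import spacy

# Assumed usage: this commit only adds the dependency, so the model name and
# this call pattern are illustrative. Fetch the model once with:
#   python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

doc = nlp("Ahmed renewed his Iqama at the Ministry of Interior in Riyadh.")
for ent in doc.ents:
    print(ent.text, ent.label_)  # e.g. "Riyadh GPE"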
13 changes: 10 additions & 3 deletions text_utils.py
@@ -94,9 +94,16 @@ def regional_pii(text):
     from nltk import word_tokenize, pos_tag, ne_chunk
     from nltk.corpus import stopwords

-    if not nltk.data.find('tokenizers/punkt'): nltk.download('punkt')
-    if not nltk.data.find('chunkers/maxent_ne_chunker'): nltk.download('maxent_ne_chunker')
-    if not nltk.data.find('corpora/words.zip'): nltk.download('words')
+    resources = ["punkt", "maxent_ne_chunker", "stopwords", "words", "averaged_perceptron_tagger"]
+
+    try:
+        # nltk.data.find() raises LookupError on its own when a resource
+        # is missing, so probing each path is sufficient.
+        for path in ["tokenizers/punkt", "chunkers/maxent_ne_chunker", "corpora/stopwords", "corpora/words", "taggers/averaged_perceptron_tagger"]:
+            nltk.data.find(path)
+    except LookupError:
+        for resource in resources:
+            nltk.download(resource)

     stop_words = set(stopwords.words('english'))

     words = word_tokenize(text)
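
The remainder of regional_pii is elided above. For reference, the resources being checked and downloaded feed NLTK's standard tokenize, tag, and chunk pipeline; the following is a self-contained sketch of that pipeline, not the function's actual body.

from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.corpus import stopwords

# Sketch of the pipeline the downloaded resources support; regional_pii's
# real body is elided in the diff above.
def named_entities(text):
    stop_words = set(stopwords.words("english"))
    words = [w for w in word_tokenize(text) if w.lower() not in stop_words]
    entities = []
    for subtree in ne_chunk(pos_tag(words)):  # needs tagger, chunker, words
        # Chunked entities are Tree nodes; plain tokens are (word, tag) tuples.
        if hasattr(subtree, "label"):
            entities.append((subtree.label(), " ".join(tok for tok, _ in subtree.leaves())))
    return entities

print(named_entities("Fatima lives in Jeddah, Saudi Arabia."))
# e.g. [('PERSON', 'Fatima'), ('GPE', 'Jeddah'), ('GPE', 'Saudi Arabia')]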
