
Commit 9777b17
Merge pull request #28 from redhuntlabs/saudi-pii-pack
Saudi PII pack
0x4f53 authored Nov 23, 2023
2 parents 9b266f5 + 0ec2088 commit 9777b17
Showing 4 changed files with 65 additions and 3 deletions.
File renamed without changes.
54 changes: 54 additions & 0 deletions definitions.json
@@ -151,6 +151,60 @@
"<<<<"
]
},
"Resident Identity (Iqama)": {
"regex":null,
"region":"Saudi Arabia",
"keywords":[
"Kingdom",
"Saudi",
"Arabia",
"Ministry",
"Interior",
"Permit",
"Iqama",
"Residen",
"Identity"
]
},
"Saudi Driver's License": {
"regex":"\b[0-9]{10}\b",
"region":"Saudi Arabia",
"keywords":[
"Kingdom",
"Saudi",
"Arabia",
"Ministry",
"Interior",
"Driving",
"License"
]
},
"Saudi Arabian Visa": {
"regex":"(?:V<SAU)(?:[A-Z0-9<].+)",
"region":"Saudi Arabia",
"keywords":[
"Visa",
"Saudi Arabia",
"V<SAU",
"<<<<",
"Entries",
"Permitted",
"Work",
"Validity"
]
},
"Tawuniya Health Insurance": {
"regex":"\b[0-9]{5}\b",
"region":"Saudi Arabia",
"keywords":[
"Tawuniya",
"Policy",
"Holder",
"Number",
"Deductible",
"Approval"
]
},
"Nebraska Driver's License": {
"regex":"[A-Z]{1}[0-9]{9,11}",
"region":"United States",
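
For orientation, here is a minimal sketch of how a scanner could consume one of the new entries. The regex/region/keywords structure comes from the diff above; the loading and matching logic below is an illustrative assumption, not this repository's actual implementation.

import json
import re

# Illustrative sketch only: apply one definitions.json entry to a piece of text.
# The entry layout is from the diff above; this matching logic is assumed.
with open("definitions.json") as f:
    definitions = json.load(f)

def match_definition(name, text):
    entry = definitions[name]
    # Count keyword hits; entries like the Iqama set "regex": null and
    # rely on keyword density alone.
    hits = [kw for kw in entry["keywords"] if kw.lower() in text.lower()]
    ids = re.findall(entry["regex"], text) if entry["regex"] else []
    return {"region": entry["region"], "keyword_hits": hits, "identifiers": ids}

sample = "Kingdom of Saudi Arabia - Ministry of Interior Driving License No. 2345678901"
print(match_definition("Saudi Driver's License", sample))
# e.g. {'region': 'Saudi Arabia', 'keyword_hits': [...], 'identifiers': ['2345678901']}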
1 change: 1 addition & 0 deletions requirements.txt
@@ -17,3 +17,4 @@ nltk
 bs4
 requests
 geotext
+spacy
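
requirements.txt now pulls in spacy, though this diff does not show where it is called. If it is used the usual way for named-entity recognition, the pattern would look roughly like the sketch below; the model name and call sequence are assumptions, not code from this commit.

import spacy

# Assumed usage: this commit only adds the dependency, so the model name and
# this call pattern are illustrative. Fetch the model once with:
#   python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

doc = nlp("Ahmed renewed his Iqama at the Ministry of Interior in Riyadh.")
for ent in doc.ents:
    print(ent.text, ent.label_)  # e.g. "Riyadh GPE"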
13 changes: 10 additions & 3 deletions text_utils.py
@@ -94,9 +94,16 @@ def regional_pii(text):
     from nltk import word_tokenize, pos_tag, ne_chunk
     from nltk.corpus import stopwords

-    if not nltk.data.find('tokenizers/punkt'): nltk.download('punkt')
-    if not nltk.data.find('chunkers/maxent_ne_chunker'): nltk.download('maxent_ne_chunker')
-    if not nltk.data.find('corpora/words.zip'): nltk.download('words')
+    resources = ["punkt", "maxent_ne_chunker", "stopwords", "words", "averaged_perceptron_tagger"]
+
+    try:
+        # nltk.data.find() raises LookupError on its own when a resource
+        # is missing, so probing each path is sufficient.
+        for path in ["tokenizers/punkt", "chunkers/maxent_ne_chunker", "corpora/stopwords", "corpora/words", "taggers/averaged_perceptron_tagger"]:
+            nltk.data.find(path)
+    except LookupError:
+        for resource in resources:
+            nltk.download(resource)

     stop_words = set(stopwords.words('english'))

     words = word_tokenize(text)
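
The remainder of regional_pii is elided above. For reference, the resources being checked and downloaded feed NLTK's standard tokenize, tag, and chunk pipeline; the following is a self-contained sketch of that pipeline, not the function's actual body.

from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.corpus import stopwords

# Sketch of the pipeline the downloaded resources support; regional_pii's
# real body is elided in the diff above.
def named_entities(text):
    stop_words = set(stopwords.words("english"))
    words = [w for w in word_tokenize(text) if w.lower() not in stop_words]
    entities = []
    for subtree in ne_chunk(pos_tag(words)):  # needs tagger, chunker, words
        # Chunked entities are Tree nodes; plain tokens are (word, tag) tuples.
        if hasattr(subtree, "label"):
            entities.append((subtree.label(), " ".join(tok for tok, _ in subtree.leaves())))
    return entities

print(named_entities("Fatima lives in Jeddah, Saudi Arabia."))
# e.g. [('PERSON', 'Fatima'), ('GPE', 'Jeddah'), ('GPE', 'Saudi Arabia')]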
