Skip to content

Commit

Permalink
Merge notebook-display into transformers-batching
Browse files Browse the repository at this point in the history
  • Loading branch information
fexfl committed Dec 23, 2024
2 parents b2317ce + a2df5be commit e62373d
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 11 deletions.
23 changes: 18 additions & 5 deletions mailcom/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,12 @@ def __init__(self):
# records the already replaced names in an email
self.used_first_names = {}

# records NEs in the last email
self.per_list = []
self.org_list = []
self.loc_list = []
self.misc_list = []

def set_sentence_batch_size(self, batch_size: int):
if batch_size == 0 or batch_size < -1:
raise ValueError(
Expand Down Expand Up @@ -105,6 +111,11 @@ def init_transformers(
def reset(self):
    """Clear the per-email state so a new email can be processed.

    Empties, in place, the mapping of already used first names and the
    lists of recorded named entities (PER, ORG, LOC, MISC). The container
    objects themselves are kept; only their contents are removed.
    """
    per_email_state = (
        # names already replaced in the current email
        self.used_first_names,
        # named entities recorded for the current email
        self.per_list,
        self.org_list,
        self.loc_list,
        self.misc_list,
    )
    for container in per_email_state:
        container.clear()

def get_sentences(self, input_text):
doc = self.nlp_spacy(input_text)
Expand All @@ -117,8 +128,8 @@ def get_ner(self, sentence):
ner = self.ner_recognizer(sentence)
return ner

def pseudonymize_per(self, new_sentence, nelist):
unique_ne_list = list(dict.fromkeys(nelist))
def pseudonymize_per(self, new_sentence):
unique_ne_list = list(dict.fromkeys(self.per_list))
for ne in unique_ne_list:
# choose the pseudonym
nm_list = self.used_first_names
Expand Down Expand Up @@ -150,7 +161,6 @@ def pseudonymize_per(self, new_sentence, nelist):
def pseudonymize_ne(self, ner, sentence):
# remove any named entities
entlist = []
nelist = []
new_sentence = sentence
for i in range(len(ner)):
entity = ner[i]
Expand All @@ -167,18 +177,21 @@ def pseudonymize_ne(self, ner, sentence):
# replace PER
if ent_string == "PER":
# add the name of this entity to list
nelist.append(ent_word)
self.per_list.append(ent_word)
# replace LOC
elif ent_string == "LOC":
new_sentence = new_sentence.replace(ent_word, "[location]")
self.loc_list.append(ent_word)
# replace ORG
elif ent_string == "ORG":
new_sentence = new_sentence.replace(ent_word, "[organization]")
self.org_list.append(ent_word)
# replace MISC
elif ent_string == "MISC":
new_sentence = new_sentence.replace(ent_word, "[misc]")
self.misc_list.append(ent_word)
# replace all unique PER now
new_sentence = self.pseudonymize_per(new_sentence, nelist)
new_sentence = self.pseudonymize_per(new_sentence)

newlist = [new_sentence]
return newlist
Expand Down
3 changes: 2 additions & 1 deletion mailcom/test/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,8 @@ def test_get_sentences_with_punctuation(get_default_fr):
def test_pseudonymize_per(get_default_fr):
sentence = "Francois and Agathe are friends."
nelist = ["Francois", "Agathe"]
pseudonymized_sentence = get_default_fr.pseudonymize_per(sentence, nelist)
get_default_fr.per_list = nelist
pseudonymized_sentence = get_default_fr.pseudonymize_per(sentence)
assert "Francois" not in pseudonymized_sentence
assert "Agathe" not in pseudonymized_sentence
assert any(
Expand Down
47 changes: 42 additions & 5 deletions notebook/demo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@
"source": [
"import mailcom.inout\n",
"import mailcom.parse\n",
"import pandas as pd"
"import pandas as pd\n",
"from IPython.display import display, HTML"
]
},
{
Expand All @@ -37,6 +38,35 @@
"Below, the input files are loaded from the given `input_dir` directory. You can provide relative or absolute paths to the directory that contains your `eml` or `html` files. All files of the `eml` or `html` file type in that directory will be considered input files."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# function for displaying the result using HTML\n",
"def highlight_ne(text, per_list, org_list, loc_list, misc_list):\n",
" # create a list of all entities with their positions\n",
" entities = []\n",
" for loc in loc_list:\n",
" entities.append((loc, \"green\"))\n",
" for org in org_list:\n",
" entities.append((org, \"blue\"))\n",
" for misc in misc_list:\n",
" entities.append((misc, \"yellow\"))\n",
" for per in per_list:\n",
" entities.append((per, \"red\"))\n",
" \n",
" # sort entities by their positions in the text in reverse order\n",
" entities.sort(key=lambda x: text.find(x[0]), reverse=True)\n",
" \n",
" # replace entities with highlighted spans\n",
" for entity, color in entities:\n",
" text = text.replace(entity, f\"<span style=\\\"background-color:{color}\\\">{entity}</span>\")\n",
" \n",
" return text"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down Expand Up @@ -86,11 +116,18 @@
" # after this function was called, the email metadata can be accessed via io.email_content\n",
" # the dict already has the entries content, date, attachments, attachment type\n",
" email_dict = io.email_content.copy()\n",
" text = io.get_html_text(text)\n",
" html_text = io.get_html_text(text)\n",
" email_dict[\"html_text\"] = html_text\n",
" if not text:\n",
" continue\n",
" # Test functionality of Pseudonymize class\n",
" output_text = ps.pseudonymize(text)\n",
" output_text = ps.pseudonymize(html_text)\n",
"\n",
" # display original text and highlight found and replaced NEs\n",
" highlighted_html = highlight_ne(html_text, ps.per_list, ps.org_list, ps.loc_list, ps.misc_list)\n",
" display(HTML(highlighted_html))\n",
"\n",
" # add pseudonymized text to dict\n",
" email_dict[\"pseudo_content\"] = output_text\n",
" out_list.append(email_dict)"
]
Expand Down Expand Up @@ -128,7 +165,7 @@
"# print results\n",
"for idx, mail in df.iterrows():\n",
" print(\"Email\", idx)\n",
" print(\"Original Text:\\n\", mail[\"content\"])\n",
" print(\"Original Text:\\n\", mail[\"html_text\"])\n",
" print(\"Pseudonymized Text:\\n\", mail[\"pseudo_content\"])\t"
]
},
Expand Down Expand Up @@ -156,7 +193,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
"version": "3.11.10"
}
},
"nbformat": 4,
Expand Down

0 comments on commit e62373d

Please sign in to comment.