Skip to content

Commit

Permalink
Added batching to pseudonymization process in notebooks
Browse files (browse the repository at this point in the history)
  • Loading branch information
fexfl committed Dec 26, 2024
1 parent f767951 commit 0c328ae
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 14 deletions.
20 changes: 13 additions & 7 deletions notebook/batching_performance.ipynb
Original file line number | Diff line number | Diff line change
Expand Up @@ -85,6 +85,7 @@
" ps = mailcom.parse.Pseudonymize()\n",
" ps.init_spacy(\"fr\")\n",
" ps.init_transformers()\n",
" ps.set_sentence_batch_size(batch_size)\n",
" # time stamp after model loading\n",
" t_model_loaded = time.time()\n",
"\n",
Expand All @@ -102,15 +103,20 @@
" # For performance analysis the process is split into its subprocesses here\n",
" ps.reset()\n",
" sentences = ps.get_sentences(text)\n",
" batches = [\n",
" sentences[n : n + ps.n_batch_sentences] # noqa\n",
" for n in range(0, len(sentences), ps.n_batch_sentences)\n",
" ]\n",
" ts_email_ppr_done = time.time()\n",
" pseudonymized_sentences = []\n",
" for sent in sentences:\n",
" sent = ps.pseudonymize_email_addresses(sent)\n",
" ner = ps.get_ner(sent)\n",
" ps_sent = \" \".join(ps.pseudonymize_ne(ner, sent)) if ner else sent\n",
" pseudonymized_batches = []\n",
" for batch in batches:\n",
" batch = ps.concatenate(batch)\n",
" batch = ps.pseudonymize_email_addresses(batch)\n",
" ner = ps.get_ner(batch)\n",
" ps_sent = \" \".join(ps.pseudonymize_ne(ner, batch)) if ner else batch\n",
" ps_sent = ps.pseudonymize_numbers(ps_sent)\n",
" pseudonymized_sentences.append(ps_sent)\n",
" output_text = ps.concatenate(pseudonymized_sentences)\n",
" pseudonymized_batches.append(ps_sent)\n",
" output_text = ps.concatenate(pseudonymized_batches)\n",
"\n",
" # add output to dict\n",
" email_dict[\"pseudo_content\"] = output_text\n",
Expand Down
19 changes: 12 additions & 7 deletions notebook/performance_demo.ipynb
Original file line number | Diff line number | Diff line change
Expand Up @@ -64,15 +64,20 @@
" # For performance analysis the process is split into its subprocesses here\n",
" ps.reset()\n",
" sentences = ps.get_sentences(text)\n",
" batches = [\n",
" sentences[n : n + ps.n_batch_sentences] # noqa\n",
" for n in range(0, len(sentences), ps.n_batch_sentences)\n",
" ]\n",
" ts_email_ppr_done = time.time()\n",
" pseudonymized_sentences = []\n",
" for sent in sentences:\n",
" sent = ps.pseudonymize_email_addresses(sent)\n",
" ner = ps.get_ner(sent)\n",
" ps_sent = \" \".join(ps.pseudonymize_ne(ner, sent)) if ner else sent\n",
" pseudonymized_batches = []\n",
" for batch in batches:\n",
" batch = ps.concatenate(batch)\n",
" batch = ps.pseudonymize_email_addresses(batch)\n",
" ner = ps.get_ner(batch)\n",
" ps_sent = \" \".join(ps.pseudonymize_ne(ner, batch)) if ner else batch\n",
" ps_sent = ps.pseudonymize_numbers(ps_sent)\n",
" pseudonymized_sentences.append(ps_sent)\n",
" output_text = ps.concatenate(pseudonymized_sentences)\n",
" pseudonymized_batches.append(ps_sent)\n",
" output_text = ps.concatenate(pseudonymized_batches)\n",
"\n",
" # add output to dict\n",
" email_dict[\"pseudo_content\"] = output_text\n",
Expand Down

0 comments on commit 0c328ae

Please sign in to comment.