-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Added basic structure for comparing batch performance
- Loading branch information
Showing
1 changed file
with
185 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,185 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import mailcom.inout\n", | ||
"import mailcom.parse\n", | ||
"import pandas as pd\n", | ||
"import time\n", | ||
"import datetime\n", | ||
"import matplotlib.pyplot as plt" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def performance_test(batch_size):\n", | ||
" print(\"-----------------------------------\")\n", | ||
" print(\"Starting performance test for batch size\", batch_size)\n", | ||
" # create t0 timestamp\n", | ||
" t0 = time.time()\n", | ||
"\n", | ||
" # import files from csv file\n", | ||
" email_list = pd.read_csv(\"../mailcom/test/data/mails_lb_sg_copy.csv\")\n", | ||
" #print(email_list)\n", | ||
"\n", | ||
" t_csv_read = time.time()\n", | ||
"\n", | ||
" # create pseudonymization object\n", | ||
" ps = mailcom.parse.Pseudonymize()\n", | ||
" ps.init_spacy(\"fr\")\n", | ||
" ps.init_transformers()\n", | ||
" # time stamp after model loading\n", | ||
" t_model_loaded = time.time()\n", | ||
"\n", | ||
" # loop over mails and pseudonymize them\n", | ||
" out_list = []\n", | ||
" ts_list = []\n", | ||
" for idx, row in email_list.iterrows():\n", | ||
" ts_email_start = time.time()\n", | ||
" text = row[\"message\"]\n", | ||
" email_dict = {\"content\": text}\n", | ||
" if not text:\n", | ||
" continue\n", | ||
" # Test functionality of Pseudonymize class\n", | ||
" # Pseudonymization is usually done using ps.pseudonymize\n", | ||
" # For performance analysis the process is split into its subprocesses here\n", | ||
" ps.reset()\n", | ||
" sentences = ps.get_sentences(text)\n", | ||
" ts_email_ppr_done = time.time()\n", | ||
" pseudonymized_sentences = []\n", | ||
" for sent in sentences:\n", | ||
" sent = ps.pseudonymize_email_addresses(sent)\n", | ||
" ner = ps.get_ner(sent)\n", | ||
" ps_sent = \" \".join(ps.pseudonymize_ne(ner, sent)) if ner else sent\n", | ||
" ps_sent = ps.pseudonymize_numbers(ps_sent)\n", | ||
" pseudonymized_sentences.append(ps_sent)\n", | ||
" output_text = ps.concatenate(pseudonymized_sentences)\n", | ||
"\n", | ||
" # add output to dict\n", | ||
" email_dict[\"pseudo_content\"] = output_text\n", | ||
" out_list.append(email_dict)\n", | ||
"\n", | ||
" # timestamp after this email\n", | ||
" ts_email_end = time.time()\n", | ||
" ts_list.append([ts_email_start, ts_email_ppr_done, ts_email_end])\n", | ||
"\n", | ||
" # write output to pandas df\n", | ||
" df = pd.DataFrame(out_list)\n", | ||
" # print(df)\n", | ||
"\n", | ||
" # display timestamps\n", | ||
"\n", | ||
" # bar plot for each individual email\n", | ||
" # processing times\n", | ||
" idx_list = [row[0] for row in email_list.iterrows()]\n", | ||
" email_duration_list = [ts[2] - ts[1] for ts in ts_list]\n", | ||
" email_ppr_list = [ts[1] - ts[0] for ts in ts_list]\n", | ||
" email_total_list = [ts[2] - ts[0] for ts in ts_list]\n", | ||
" email_bar_height = {\n", | ||
" \"Pre-Processing\": email_ppr_list,\n", | ||
" \"Pseudonymization\": email_duration_list\n", | ||
" }\n", | ||
" bt = [0 for idx in idx_list]\n", | ||
"\n", | ||
" plt.figure(figsize=(10,4), dpi=80)\n", | ||
"\n", | ||
" # plot 1\n", | ||
" plt.subplot(1, 2, 1)\n", | ||
" for key, height in email_bar_height.items():\n", | ||
" plt.bar(idx_list, height, 0.5, label=key, bottom=bt)\n", | ||
" bt = [bi + hi for (bi,hi) in zip(bt, height)]\n", | ||
" #plt.yscale(\"log\")\n", | ||
" plt.xlabel(\"Email\")\n", | ||
" plt.ylabel(\"t [s]\")\n", | ||
" plt.title(\"Computation times for emails, model loading and file reading\")\n", | ||
" plt.legend()\n", | ||
"\n", | ||
" # plot for model loading and file reading, as well as average email time\n", | ||
" # processing times\n", | ||
" bar_x = [\"CSV Reading\", \"Model Loading\", \"Average Email Time\"]\n", | ||
" average_email_time = sum(email_total_list) / len(email_total_list)\n", | ||
" bar_y = [t_csv_read - t0, t_model_loaded - t0, average_email_time]\n", | ||
" plt.ylabel(\"t [s]\")\n", | ||
"\n", | ||
" # plot 2\n", | ||
" plt.subplot(1, 2, 2)\n", | ||
" plt.bar(bar_x, bar_y, 0.5)\n", | ||
"\n", | ||
" # Total time\n", | ||
" print(\"Total time:\", (datetime.datetime.fromtimestamp(ts_list[len(ts_list)-1][2] - t_model_loaded).strftime('%M:%S')))\n", | ||
"\n", | ||
" # plt.savefig(\"out/mailcom_batching_performance_n_\" + str(batch_size) + datetime.datetime.fromtimestamp(t0).strftime('%H%M%S') + \".png\")\n", | ||
" print(\"-----------------------------------\")\n", | ||
"\n", | ||
" return average_email_time" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"batching_sizes = [-1, -1, 1, 2, 3, 4, 6, 8, 10] # first run is ignored since there seem to be some inconsistencies when loading for the first time\n", | ||
"# batching_sizes = [1]\n", | ||
"n_samples = 3\n", | ||
"\n", | ||
"av_email_times_for_batches = []\n", | ||
"for bs in batching_sizes:\n", | ||
" average_email_time = 0\n", | ||
" for _ in range(n_samples):\n", | ||
" t = performance_test(bs)\n", | ||
" average_email_time += t\n", | ||
" av_email_times_for_batches.append(average_email_time/n_samples)\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"plt.bar(batching_sizes[1:], av_email_times_for_batches[1:], 0.5)\n", | ||
"plt.xlabel(\"n batches\")\n", | ||
"plt.ylabel(\"Average Email Time [s]\")\n", | ||
"plt.title(\"Average email time for different batch sizes\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "mailcom", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.11.10" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |