
Commit

Added basic structure for comparing batch performance
fexfl committed Dec 20, 2024
1 parent fc68d93 commit b2317ce
Showing 1 changed file with 185 additions and 0 deletions.
185 changes: 185 additions & 0 deletions notebook/batching_performance.ipynb
@@ -0,0 +1,185 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import mailcom.inout\n",
"import mailcom.parse\n",
"import pandas as pd\n",
"import time\n",
"import datetime\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def performance_test(batch_size):\n",
" print(\"-----------------------------------\")\n",
" print(\"Starting performance test for batch size\", batch_size)\n",
" # create t0 timestamp\n",
" t0 = time.time()\n",
"\n",
" # import files from csv file\n",
" email_list = pd.read_csv(\"../mailcom/test/data/mails_lb_sg_copy.csv\")\n",
" #print(email_list)\n",
"\n",
" t_csv_read = time.time()\n",
"\n",
" # create pseudonymization object\n",
" ps = mailcom.parse.Pseudonymize()\n",
" ps.init_spacy(\"fr\")\n",
" ps.init_transformers()\n",
" # time stamp after model loading\n",
" t_model_loaded = time.time()\n",
"\n",
" # loop over mails and pseudonymize them\n",
" out_list = []\n",
" ts_list = []\n",
" for idx, row in email_list.iterrows():\n",
" ts_email_start = time.time()\n",
" text = row[\"message\"]\n",
" email_dict = {\"content\": text}\n",
" if not text:\n",
" continue\n",
" # Test functionality of Pseudonymize class\n",
" # Pseudonymization is usually done using ps.pseudonymize\n",
" # For performance analysis the process is split into its subprocesses here\n",
" ps.reset()\n",
" sentences = ps.get_sentences(text)\n",
" ts_email_ppr_done = time.time()\n",
" pseudonymized_sentences = []\n",
" for sent in sentences:\n",
" sent = ps.pseudonymize_email_addresses(sent)\n",
" ner = ps.get_ner(sent)\n",
" ps_sent = \" \".join(ps.pseudonymize_ne(ner, sent)) if ner else sent\n",
" ps_sent = ps.pseudonymize_numbers(ps_sent)\n",
" pseudonymized_sentences.append(ps_sent)\n",
" output_text = ps.concatenate(pseudonymized_sentences)\n",
"\n",
" # add output to dict\n",
" email_dict[\"pseudo_content\"] = output_text\n",
" out_list.append(email_dict)\n",
"\n",
" # timestamp after this email\n",
" ts_email_end = time.time()\n",
" ts_list.append([ts_email_start, ts_email_ppr_done, ts_email_end])\n",
"\n",
" # write output to pandas df\n",
" df = pd.DataFrame(out_list)\n",
" # print(df)\n",
"\n",
" # display timestamps\n",
"\n",
" # bar plot for each individual email\n",
" # processing times\n",
" idx_list = [row[0] for row in email_list.iterrows()]\n",
" email_duration_list = [ts[2] - ts[1] for ts in ts_list]\n",
" email_ppr_list = [ts[1] - ts[0] for ts in ts_list]\n",
" email_total_list = [ts[2] - ts[0] for ts in ts_list]\n",
" email_bar_height = {\n",
" \"Pre-Processing\": email_ppr_list,\n",
" \"Pseudonymization\": email_duration_list\n",
" }\n",
" bt = [0 for idx in idx_list]\n",
"\n",
" plt.figure(figsize=(10,4), dpi=80)\n",
"\n",
" # plot 1\n",
" plt.subplot(1, 2, 1)\n",
" for key, height in email_bar_height.items():\n",
" plt.bar(idx_list, height, 0.5, label=key, bottom=bt)\n",
" bt = [bi + hi for (bi,hi) in zip(bt, height)]\n",
" #plt.yscale(\"log\")\n",
" plt.xlabel(\"Email\")\n",
" plt.ylabel(\"t [s]\")\n",
" plt.title(\"Computation times for emails, model loading and file reading\")\n",
" plt.legend()\n",
"\n",
" # plot for model loading and file reading, as well as average email time\n",
" # processing times\n",
" bar_x = [\"CSV Reading\", \"Model Loading\", \"Average Email Time\"]\n",
" average_email_time = sum(email_total_list) / len(email_total_list)\n",
" bar_y = [t_csv_read - t0, t_model_loaded - t0, average_email_time]\n",
" plt.ylabel(\"t [s]\")\n",
"\n",
" # plot 2\n",
" plt.subplot(1, 2, 2)\n",
" plt.bar(bar_x, bar_y, 0.5)\n",
"\n",
" # Total time\n",
" print(\"Total time:\", (datetime.datetime.fromtimestamp(ts_list[len(ts_list)-1][2] - t_model_loaded).strftime('%M:%S')))\n",
"\n",
" # plt.savefig(\"out/mailcom_batching_performance_n_\" + str(batch_size) + datetime.datetime.fromtimestamp(t0).strftime('%H%M%S') + \".png\")\n",
" print(\"-----------------------------------\")\n",
"\n",
" return average_email_time"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"batching_sizes = [-1, -1, 1, 2, 3, 4, 6, 8, 10] # first run is ignored since there seem to be some inconsistencies when loading for the first time\n",
"# batching_sizes = [1]\n",
"n_samples = 3\n",
"\n",
"av_email_times_for_batches = []\n",
"for bs in batching_sizes:\n",
" average_email_time = 0\n",
" for _ in range(n_samples):\n",
" t = performance_test(bs)\n",
" average_email_time += t\n",
" av_email_times_for_batches.append(average_email_time/n_samples)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"plt.bar(batching_sizes[1:], av_email_times_for_batches[1:], 0.5)\n",
"plt.xlabel(\"n batches\")\n",
"plt.ylabel(\"Average Email Time [s]\")\n",
"plt.title(\"Average email time for different batch sizes\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "mailcom",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.10"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
