Added basic structure for comparing batch performance

ssciwr · Dec 20, 2024 · b2317ce · b2317ce
1 parent fc68d93
commit b2317ce
Showing 1 changed file with 185 additions and 0 deletions.
diff --git a/notebook/batching_performance.ipynb b/notebook/batching_performance.ipynb
@@ -0,0 +1,185 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import mailcom.inout\n",
+    "import mailcom.parse\n",
+    "import pandas as pd\n",
+    "import time\n",
+    "import datetime\n",
+    "import matplotlib.pyplot as plt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def performance_test(batch_size):\n",
+    "    print(\"-----------------------------------\")\n",
+    "    print(\"Starting performance test for batch size\", batch_size)\n",
+    "    # create t0 timestamp\n",
+    "    t0 = time.time()\n",
+    "\n",
+    "    # import files from csv file\n",
+    "    email_list = pd.read_csv(\"../mailcom/test/data/mails_lb_sg_copy.csv\")\n",
+    "    #print(email_list)\n",
+    "\n",
+    "    t_csv_read = time.time()\n",
+    "\n",
+    "    # create pseudonymization object\n",
+    "    ps = mailcom.parse.Pseudonymize()\n",
+    "    ps.init_spacy(\"fr\")\n",
+    "    ps.init_transformers()\n",
+    "    # time stamp after model loading\n",
+    "    t_model_loaded = time.time()\n",
+    "\n",
+    "    # loop over mails and pseudonymize them\n",
+    "    out_list = []\n",
+    "    ts_list = []\n",
+    "    for idx, row in email_list.iterrows():\n",
+    "        ts_email_start = time.time()\n",
+    "        text = row[\"message\"]\n",
+    "        email_dict = {\"content\": text}\n",
+    "        if not text:\n",
+    "            continue\n",
+    "        # Test functionality of Pseudonymize class\n",
+    "        # Pseudonymization is usually done using ps.pseudonymize\n",
+    "        # For performance analysis the process is split into its subprocesses here\n",
+    "        ps.reset()\n",
+    "        sentences = ps.get_sentences(text)\n",
+    "        ts_email_ppr_done = time.time()\n",
+    "        pseudonymized_sentences = []\n",
+    "        for sent in sentences:\n",
+    "            sent = ps.pseudonymize_email_addresses(sent)\n",
+    "            ner = ps.get_ner(sent)\n",
+    "            ps_sent = \" \".join(ps.pseudonymize_ne(ner, sent)) if ner else sent\n",
+    "            ps_sent = ps.pseudonymize_numbers(ps_sent)\n",
+    "            pseudonymized_sentences.append(ps_sent)\n",
+    "        output_text = ps.concatenate(pseudonymized_sentences)\n",
+    "\n",
+    "        # add output to dict\n",
+    "        email_dict[\"pseudo_content\"] = output_text\n",
+    "        out_list.append(email_dict)\n",
+    "\n",
+    "        # timestamp after this email\n",
+    "        ts_email_end = time.time()\n",
+    "        ts_list.append([ts_email_start, ts_email_ppr_done, ts_email_end])\n",
+    "\n",
+    "    # write output to pandas df\n",
+    "    df = pd.DataFrame(out_list)\n",
+    "    # print(df)\n",
+    "\n",
+    "    # display timestamps\n",
+    "\n",
+    "    # bar plot for each individual email\n",
+    "    # processing times\n",
+    "    idx_list = [row[0] for row in email_list.iterrows()]\n",
+    "    email_duration_list = [ts[2] - ts[1] for ts in ts_list]\n",
+    "    email_ppr_list = [ts[1] - ts[0] for ts in ts_list]\n",
+    "    email_total_list = [ts[2] - ts[0] for ts in ts_list]\n",
+    "    email_bar_height = {\n",
+    "        \"Pre-Processing\": email_ppr_list,\n",
+    "        \"Pseudonymization\": email_duration_list\n",
+    "    }\n",
+    "    bt = [0 for idx in idx_list]\n",
+    "\n",
+    "    plt.figure(figsize=(10,4), dpi=80)\n",
+    "\n",
+    "    # plot 1\n",
+    "    plt.subplot(1, 2, 1)\n",
+    "    for key, height in email_bar_height.items():\n",
+    "        plt.bar(idx_list, height, 0.5, label=key, bottom=bt)\n",
+    "        bt = [bi + hi for (bi,hi) in zip(bt, height)]\n",
+    "    #plt.yscale(\"log\")\n",
+    "    plt.xlabel(\"Email\")\n",
+    "    plt.ylabel(\"t [s]\")\n",
+    "    plt.title(\"Computation times for emails, model loading and file reading\")\n",
+    "    plt.legend()\n",
+    "\n",
+    "    # plot for model loading and file reading, as well as average email time\n",
+    "    # processing times\n",
+    "    bar_x = [\"CSV Reading\", \"Model Loading\", \"Average Email Time\"]\n",
+    "    average_email_time = sum(email_total_list) / len(email_total_list)\n",
+    "    bar_y = [t_csv_read - t0, t_model_loaded - t0, average_email_time]\n",
+    "    plt.ylabel(\"t [s]\")\n",
+    "\n",
+    "    # plot 2\n",
+    "    plt.subplot(1, 2, 2)\n",
+    "    plt.bar(bar_x, bar_y, 0.5)\n",
+    "\n",
+    "    # Total time\n",
+    "    print(\"Total time:\", (datetime.datetime.fromtimestamp(ts_list[len(ts_list)-1][2] - t_model_loaded).strftime('%M:%S')))\n",
+    "\n",
+    "    # plt.savefig(\"out/mailcom_batching_performance_n_\" + str(batch_size) + datetime.datetime.fromtimestamp(t0).strftime('%H%M%S') + \".png\")\n",
+    "    print(\"-----------------------------------\")\n",
+    "\n",
+    "    return average_email_time"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "batching_sizes = [-1, -1, 1, 2, 3, 4, 6, 8, 10] # first run is ignored since there seem to be some inconsistencies when loading for the first time\n",
+    "# batching_sizes = [1]\n",
+    "n_samples = 3\n",
+    "\n",
+    "av_email_times_for_batches = []\n",
+    "for bs in batching_sizes:\n",
+    "    average_email_time = 0\n",
+    "    for _ in range(n_samples):\n",
+    "        t = performance_test(bs)\n",
+    "        average_email_time += t\n",
+    "    av_email_times_for_batches.append(average_email_time/n_samples)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "plt.bar(batching_sizes[1:], av_email_times_for_batches[1:], 0.5)\n",
+    "plt.xlabel(\"n batches\")\n",
+    "plt.ylabel(\"Average Email Time [s]\")\n",
+    "plt.title(\"Average email time for different batch sizes\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "mailcom",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}