From 67921f4423860e18d7fe1d02245dae7b9e481225 Mon Sep 17 00:00:00 2001
From: Andreh <57905931+Thoron86@users.noreply.github.com>
Date: Mon, 24 Jun 2024 22:08:36 +0000
Subject: [PATCH] Lemmatizer

---
 Group-8-Retrieval-System/lemmatizer.ipynb | 476 ++++++++++++++++++++++
 1 file changed, 476 insertions(+)
 create mode 100644 Group-8-Retrieval-System/lemmatizer.ipynb

diff --git a/Group-8-Retrieval-System/lemmatizer.ipynb b/Group-8-Retrieval-System/lemmatizer.ipynb
new file mode 100644
index 0000000..2e99793
--- /dev/null
+++ b/Group-8-Retrieval-System/lemmatizer.ipynb
@@ -0,0 +1,476 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# You only need to execute this cell if you are using Google Colab.\n",
+    "# If you use GitHub Codespaces, everything is already installed.\n",
+    "!pip3 install tira ir-datasets python-terrier nltk"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Imports\n",
+    "from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run\n",
+    "from tira.rest_api_client import Client\n",
+    "\n",
+    "import nltk\n",
+    "from nltk.stem import WordNetLemmatizer\n",
+    "\n",
+    "import pyterrier as pt\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "[nltk_data] Downloading package wordnet to /root/nltk_data...\n",
+      "[nltk_data]   Package wordnet is already up-to-date!\n",
+      "[nltk_data] Downloading package omw-1.4 to /root/nltk_data...\n",
+      "[nltk_data]   Package omw-1.4 is already up-to-date!\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "True"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Ensure necessary NLTK resources are downloaded\n",
+    "nltk.download('wordnet')\n",
+    "nltk.download('omw-1.4')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ensure_pyterrier_is_loaded()\n",
+    "tira = Client()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize the WordNetLemmatizer\n",
+    "lemmatizer = WordNetLemmatizer()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Function to lemmatize text\n",
+    "# def lemmatize_text(text):\n",
+    "#     return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')\n",
+    "index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', pt_dataset)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the topics (queries); they are lemmatized in the following cell\n",
+    "topics = pt_dataset.get_topics('text')\n",
+    "# print('Columns in topics DataFrame:', topics.columns)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "lemmatized_topics = []\n",
+    "for idx, row in topics.iterrows():\n",
+    "    lemmatized_query = ' '.join([lemmatizer.lemmatize(word) for word in nltk.word_tokenize(row['query'])])\n",
+    "    lemmatized_topics.append({'qid': row['qid'], 'query': lemmatized_query})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Using column for text: qid\n"
+     ]
+    }
+   ],
+   "source": [
+    "# text_column = topics.columns[0]\n",
+    "# print('Using column for text:', text_column)\n",
+    "# topics[text_column] = topics[text_column].apply(lemmatize_text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# BM25 Retrieval Model\n",
+    "bm25 = pt.BatchRetrieve(index, wmodel=\"BM25\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_lemmatized_topics = pd.DataFrame(lemmatized_topics)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "run = bm25(df_lemmatized_topics)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The run file is normalized outside the TIRA sandbox, I will store it at \"../runs\".\n",
+      "Done. run file is stored under \"../runs/run.txt\".\n"
+     ]
+    }
+   ],
+   "source": [
+    "persist_and_normalize_run(run, system_name='bm25-lemmatizer', default_output='../runs')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>name</th>\n",
+       "      <th>map</th>\n",
+       "      <th>recip_rank</th>\n",
+       "      <th>P_1000</th>\n",
+       "      <th>map +</th>\n",
+       "      <th>map -</th>\n",
+       "      <th>map p-value</th>\n",
+       "      <th>recip_rank +</th>\n",
+       "      <th>recip_rank -</th>\n",
+       "      <th>recip_rank p-value</th>\n",
+       "      <th>P_1000 +</th>\n",
+       "      <th>P_1000 -</th>\n",
+       "      <th>P_1000 p-value</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>BM25</td>\n",
+       "      <td>0.262311</td>\n",
+       "      <td>0.579877</td>\n",
+       "      <td>0.016191</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   name       map  recip_rank    P_1000 map + map - map p-value recip_rank +  \\\n",
+       "0  BM25  0.262311    0.579877  0.016191  None  None        None         None   \n",
+       "\n",
+       "  recip_rank - recip_rank p-value P_1000 + P_1000 - P_1000 p-value  \n",
+       "0         None               None     None     None           None  "
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Local test: evaluates BM25 on the original topics (not df_lemmatized_topics)\n",
+    "pt.Experiment(\n",
+    "    [bm25], \n",
+    "    topics, \n",
+    "    pt_dataset.get_qrels(), \n",
+    "    eval_metrics=['P_1000', 'map', 'recip_rank'],\n",
+    "    names=['BM25'],\n",
+    "    baseline=0\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>qid</th>\n",
+       "      <th>docid</th>\n",
+       "      <th>docno</th>\n",
+       "      <th>rank</th>\n",
+       "      <th>score</th>\n",
+       "      <th>query</th>\n",
+       "      <th>system</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1</td>\n",
+       "      <td>94858</td>\n",
+       "      <td>2004.cikm_conference-2004.47</td>\n",
+       "      <td>0</td>\n",
+       "      <td>15.681777</td>\n",
+       "      <td>retrieval system improving effectiveness</td>\n",
+       "      <td>bm25-lemmatizer</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>125137</td>\n",
+       "      <td>1989.ipm_journal-ir0volumeA25A4.2</td>\n",
+       "      <td>1</td>\n",
+       "      <td>15.047380</td>\n",
+       "      <td>retrieval system improving effectiveness</td>\n",
+       "      <td>bm25-lemmatizer</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1</td>\n",
+       "      <td>125817</td>\n",
+       "      <td>2005.ipm_journal-ir0volumeA41A5.11</td>\n",
+       "      <td>2</td>\n",
+       "      <td>14.144223</td>\n",
+       "      <td>retrieval system improving effectiveness</td>\n",
+       "      <td>bm25-lemmatizer</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>1</td>\n",
+       "      <td>5868</td>\n",
+       "      <td>W05-0704</td>\n",
+       "      <td>3</td>\n",
+       "      <td>14.025748</td>\n",
+       "      <td>retrieval system improving effectiveness</td>\n",
+       "      <td>bm25-lemmatizer</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>1</td>\n",
+       "      <td>84876</td>\n",
+       "      <td>2016.ntcir_conference-2016.90</td>\n",
+       "      <td>4</td>\n",
+       "      <td>13.947994</td>\n",
+       "      <td>retrieval system improving effectiveness</td>\n",
+       "      <td>bm25-lemmatizer</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>1</td>\n",
+       "      <td>82472</td>\n",
+       "      <td>1998.sigirconf_conference-98.15</td>\n",
+       "      <td>5</td>\n",
+       "      <td>13.901647</td>\n",
+       "      <td>retrieval system improving effectiveness</td>\n",
+       "      <td>bm25-lemmatizer</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>1</td>\n",
+       "      <td>94415</td>\n",
+       "      <td>2008.cikm_conference-2008.183</td>\n",
+       "      <td>6</td>\n",
+       "      <td>13.808208</td>\n",
+       "      <td>retrieval system improving effectiveness</td>\n",
+       "      <td>bm25-lemmatizer</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>1</td>\n",
+       "      <td>17496</td>\n",
+       "      <td>O01-2005</td>\n",
+       "      <td>7</td>\n",
+       "      <td>13.749449</td>\n",
+       "      <td>retrieval system improving effectiveness</td>\n",
+       "      <td>bm25-lemmatizer</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>1</td>\n",
+       "      <td>82490</td>\n",
+       "      <td>1998.sigirconf_conference-98.33</td>\n",
+       "      <td>8</td>\n",
+       "      <td>13.735541</td>\n",
+       "      <td>retrieval system improving effectiveness</td>\n",
+       "      <td>bm25-lemmatizer</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>9</th>\n",
+       "      <td>1</td>\n",
+       "      <td>124801</td>\n",
+       "      <td>2006.ipm_journal-ir0volumeA42A3.2</td>\n",
+       "      <td>9</td>\n",
+       "      <td>13.569263</td>\n",
+       "      <td>retrieval system improving effectiveness</td>\n",
+       "      <td>bm25-lemmatizer</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   qid   docid                               docno  rank      score  \\\n",
+       "0    1   94858        2004.cikm_conference-2004.47     0  15.681777   \n",
+       "1    1  125137   1989.ipm_journal-ir0volumeA25A4.2     1  15.047380   \n",
+       "2    1  125817  2005.ipm_journal-ir0volumeA41A5.11     2  14.144223   \n",
+       "3    1    5868                            W05-0704     3  14.025748   \n",
+       "4    1   84876       2016.ntcir_conference-2016.90     4  13.947994   \n",
+       "5    1   82472     1998.sigirconf_conference-98.15     5  13.901647   \n",
+       "6    1   94415       2008.cikm_conference-2008.183     6  13.808208   \n",
+       "7    1   17496                            O01-2005     7  13.749449   \n",
+       "8    1   82490     1998.sigirconf_conference-98.33     8  13.735541   \n",
+       "9    1  124801   2006.ipm_journal-ir0volumeA42A3.2     9  13.569263   \n",
+       "\n",
+       "                                      query           system  \n",
+       "0  retrieval system improving effectiveness  bm25-lemmatizer  \n",
+       "1  retrieval system improving effectiveness  bm25-lemmatizer  \n",
+       "2  retrieval system improving effectiveness  bm25-lemmatizer  \n",
+       "3  retrieval system improving effectiveness  bm25-lemmatizer  \n",
+       "4  retrieval system improving effectiveness  bm25-lemmatizer  \n",
+       "5  retrieval system improving effectiveness  bm25-lemmatizer  \n",
+       "6  retrieval system improving effectiveness  bm25-lemmatizer  \n",
+       "7  retrieval system improving effectiveness  bm25-lemmatizer  \n",
+       "8  retrieval system improving effectiveness  bm25-lemmatizer  \n",
+       "9  retrieval system improving effectiveness  bm25-lemmatizer  "
+      ]
+     },
+     "execution_count": 35,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "run.head(10)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}