From 67921f4423860e18d7fe1d02245dae7b9e481225 Mon Sep 17 00:00:00 2001
From: Andreh <57905931+Thoron86@users.noreply.github.com>
Date: Mon, 24 Jun 2024 22:08:36 +0000
Subject: [PATCH] Lemmatizer
---
Group-8-Retrieval-System/lemmatizer.ipynb | 476 ++++++++++++++++++++++
1 file changed, 476 insertions(+)
create mode 100644 Group-8-Retrieval-System/lemmatizer.ipynb
diff --git a/Group-8-Retrieval-System/lemmatizer.ipynb b/Group-8-Retrieval-System/lemmatizer.ipynb
new file mode 100644
index 0000000..2e99793
--- /dev/null
+++ b/Group-8-Retrieval-System/lemmatizer.ipynb
@@ -0,0 +1,476 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# You only need to execute this cell if you are using Google Colab.\n",
+ "# If you use GitHub Codespaces, everything is already installed.\n",
+ "!pip3 install tira ir-datasets python-terrier nltk"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Imports\n",
+ "from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run\n",
+ "from tira.rest_api_client import Client\n",
+ "\n",
+ "import nltk\n",
+ "from nltk.stem import WordNetLemmatizer\n",
+ "\n",
+ "import pyterrier as pt\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package wordnet to /root/nltk_data...\n",
+ "[nltk_data] Package wordnet is already up-to-date!\n",
+ "[nltk_data] Downloading package omw-1.4 to /root/nltk_data...\n",
+ "[nltk_data] Package omw-1.4 is already up-to-date!\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "True"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Ensure necessary NLTK resources are downloaded\n",
+ "nltk.download('wordnet')\n",
+ "nltk.download('omw-1.4')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "ensure_pyterrier_is_loaded()\n",
+ "tira = Client()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Initialize the WordNetLemmatizer\n",
+ "lemmatizer = WordNetLemmatizer()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Function to lemmatize text\n",
+ "# def lemmatize_text(text):\n",
+ "# return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')\n",
+ "index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', pt_dataset)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the topics; their queries are lemmatized in the next cell\n",
+ "topics = pt_dataset.get_topics('text')\n",
+ "# print('Columns in topics DataFrame:', topics.columns)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "lemmatized_topics = []\n",
+ "for idx, row in topics.iterrows():\n",
+ " lemmatized_query = ' '.join([lemmatizer.lemmatize(word) for word in nltk.word_tokenize(row['query'])])\n",
+ " lemmatized_topics.append({'qid': row['qid'], 'query': lemmatized_query})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Using column for text: qid\n"
+ ]
+ }
+ ],
+ "source": [
+ "# text_column = topics.columns[0]\n",
+ "# print('Using column for text:', text_column)\n",
+ "# topics[text_column] = topics[text_column].apply(lemmatize_text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# BM25 Retrieval Model\n",
+ "bm25 = pt.BatchRetrieve(index, wmodel=\"BM25\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_lemmatized_topics = pd.DataFrame(lemmatized_topics)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "run = bm25(df_lemmatized_topics)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The run file is normalized outside the TIRA sandbox, I will store it at \"../runs\".\n",
+ "Done. run file is stored under \"../runs/run.txt\".\n"
+ ]
+ }
+ ],
+ "source": [
+ "persist_and_normalize_run(run, system_name='bm25-lemmatizer', default_output='../runs')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " name | \n",
+ " map | \n",
+ " recip_rank | \n",
+ " P_1000 | \n",
+ " map + | \n",
+ " map - | \n",
+ " map p-value | \n",
+ " recip_rank + | \n",
+ " recip_rank - | \n",
+ " recip_rank p-value | \n",
+ " P_1000 + | \n",
+ " P_1000 - | \n",
+ " P_1000 p-value | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " BM25 | \n",
+ " 0.262311 | \n",
+ " 0.579877 | \n",
+ " 0.016191 | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ " None | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " name map recip_rank P_1000 map + map - map p-value recip_rank + \\\n",
+ "0 BM25 0.262311 0.579877 0.016191 None None None None \n",
+ "\n",
+ " recip_rank - recip_rank p-value P_1000 + P_1000 - P_1000 p-value \n",
+ "0 None None None None None "
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Local test (note: this evaluates BM25 on the original `topics`, not on the lemmatized queries)\n",
+ "pt.Experiment(\n",
+ " [bm25], \n",
+ " topics, \n",
+ " pt_dataset.get_qrels(), \n",
+ " eval_metrics=['P_1000', 'map', 'recip_rank'],\n",
+ " names=['BM25'],\n",
+ " baseline=0\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " qid | \n",
+ " docid | \n",
+ " docno | \n",
+ " rank | \n",
+ " score | \n",
+ " query | \n",
+ " system | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 94858 | \n",
+ " 2004.cikm_conference-2004.47 | \n",
+ " 0 | \n",
+ " 15.681777 | \n",
+ " retrieval system improving effectiveness | \n",
+ " bm25-lemmatizer | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 125137 | \n",
+ " 1989.ipm_journal-ir0volumeA25A4.2 | \n",
+ " 1 | \n",
+ " 15.047380 | \n",
+ " retrieval system improving effectiveness | \n",
+ " bm25-lemmatizer | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 125817 | \n",
+ " 2005.ipm_journal-ir0volumeA41A5.11 | \n",
+ " 2 | \n",
+ " 14.144223 | \n",
+ " retrieval system improving effectiveness | \n",
+ " bm25-lemmatizer | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 5868 | \n",
+ " W05-0704 | \n",
+ " 3 | \n",
+ " 14.025748 | \n",
+ " retrieval system improving effectiveness | \n",
+ " bm25-lemmatizer | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 84876 | \n",
+ " 2016.ntcir_conference-2016.90 | \n",
+ " 4 | \n",
+ " 13.947994 | \n",
+ " retrieval system improving effectiveness | \n",
+ " bm25-lemmatizer | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 1 | \n",
+ " 82472 | \n",
+ " 1998.sigirconf_conference-98.15 | \n",
+ " 5 | \n",
+ " 13.901647 | \n",
+ " retrieval system improving effectiveness | \n",
+ " bm25-lemmatizer | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 1 | \n",
+ " 94415 | \n",
+ " 2008.cikm_conference-2008.183 | \n",
+ " 6 | \n",
+ " 13.808208 | \n",
+ " retrieval system improving effectiveness | \n",
+ " bm25-lemmatizer | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 1 | \n",
+ " 17496 | \n",
+ " O01-2005 | \n",
+ " 7 | \n",
+ " 13.749449 | \n",
+ " retrieval system improving effectiveness | \n",
+ " bm25-lemmatizer | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 1 | \n",
+ " 82490 | \n",
+ " 1998.sigirconf_conference-98.33 | \n",
+ " 8 | \n",
+ " 13.735541 | \n",
+ " retrieval system improving effectiveness | \n",
+ " bm25-lemmatizer | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 1 | \n",
+ " 124801 | \n",
+ " 2006.ipm_journal-ir0volumeA42A3.2 | \n",
+ " 9 | \n",
+ " 13.569263 | \n",
+ " retrieval system improving effectiveness | \n",
+ " bm25-lemmatizer | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " qid docid docno rank score \\\n",
+ "0 1 94858 2004.cikm_conference-2004.47 0 15.681777 \n",
+ "1 1 125137 1989.ipm_journal-ir0volumeA25A4.2 1 15.047380 \n",
+ "2 1 125817 2005.ipm_journal-ir0volumeA41A5.11 2 14.144223 \n",
+ "3 1 5868 W05-0704 3 14.025748 \n",
+ "4 1 84876 2016.ntcir_conference-2016.90 4 13.947994 \n",
+ "5 1 82472 1998.sigirconf_conference-98.15 5 13.901647 \n",
+ "6 1 94415 2008.cikm_conference-2008.183 6 13.808208 \n",
+ "7 1 17496 O01-2005 7 13.749449 \n",
+ "8 1 82490 1998.sigirconf_conference-98.33 8 13.735541 \n",
+ "9 1 124801 2006.ipm_journal-ir0volumeA42A3.2 9 13.569263 \n",
+ "\n",
+ " query system \n",
+ "0 retrieval system improving effectiveness bm25-lemmatizer \n",
+ "1 retrieval system improving effectiveness bm25-lemmatizer \n",
+ "2 retrieval system improving effectiveness bm25-lemmatizer \n",
+ "3 retrieval system improving effectiveness bm25-lemmatizer \n",
+ "4 retrieval system improving effectiveness bm25-lemmatizer \n",
+ "5 retrieval system improving effectiveness bm25-lemmatizer \n",
+ "6 retrieval system improving effectiveness bm25-lemmatizer \n",
+ "7 retrieval system improving effectiveness bm25-lemmatizer \n",
+ "8 retrieval system improving effectiveness bm25-lemmatizer \n",
+ "9 retrieval system improving effectiveness bm25-lemmatizer "
+ ]
+ },
+ "execution_count": 35,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "run.head(10)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}