From 67921f4423860e18d7fe1d02245dae7b9e481225 Mon Sep 17 00:00:00 2001 From: Andreh <57905931+Thoron86@users.noreply.github.com> Date: Mon, 24 Jun 2024 22:08:36 +0000 Subject: [PATCH] Lemmatizer --- Group-8-Retrieval-System/lemmatizer.ipynb | 476 ++++++++++++++++++++++ 1 file changed, 476 insertions(+) create mode 100644 Group-8-Retrieval-System/lemmatizer.ipynb diff --git a/Group-8-Retrieval-System/lemmatizer.ipynb b/Group-8-Retrieval-System/lemmatizer.ipynb new file mode 100644 index 0000000..2e99793 --- /dev/null +++ b/Group-8-Retrieval-System/lemmatizer.ipynb @@ -0,0 +1,476 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# You only need to execute this cell if you are using Google Colab.\n", + "# If you use GitHub Codespaces, everything is already installed.\n", + "!pip3 install tira ir-datasets python-terrier nltk" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# Imports\n", + "from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run\n", + "from tira.rest_api_client import Client\n", + "\n", + "import nltk\n", + "from nltk.stem import WordNetLemmatizer\n", + "\n", + "import pyterrier as pt\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n", + "[nltk_data] Downloading package omw-1.4 to /root/nltk_data...\n", + "[nltk_data] Package omw-1.4 is already up-to-date!\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Ensure necessary NLTK resources are downloaded\n", + "nltk.download('wordnet')\n", + 
"nltk.download('omw-1.4')" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "ensure_pyterrier_is_loaded()\n", + "tira = Client()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize the WordNetLemmatizer\n", + "lemmatizer = WordNetLemmatizer()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Function to lemmatize text\n", + "# def lemmatize_text(text):\n", + "# return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')\n", + "index = tira.pt.index('ir-lab-sose-2024/tira-ir-starter/Index (tira-ir-starter-pyterrier)', pt_dataset)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "# Preprocess topics by lemmatizing\n", + "topics = pt_dataset.get_topics('text')\n", + "# print('Columns in topics DataFrame:', topics.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [], + "source": [ + "lemmatized_topics = []\n", + "for idx, row in topics.iterrows():\n", + " lemmatized_query = ' '.join([lemmatizer.lemmatize(word) for word in nltk.word_tokenize(row['query'])])\n", + " lemmatized_topics.append({'qid': row['qid'], 'query': lemmatized_query})" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using column for text: qid\n" + ] + } + ], + "source": [ + "# text_column = topics.columns[0]\n", + "# print('Using column for text:', text_column)\n", + "# topics[text_column] = topics[text_column].apply(lemmatize_text)" + ] + }, + { + "cell_type": "code", 
+ "execution_count": 29, + "metadata": {}, + "outputs": [], + "source": [ + "# BM25 Retrieval Model\n", + "bm25 = pt.BatchRetrieve(index, wmodel=\"BM25\")" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "df_lemmatized_topics = pd.DataFrame(lemmatized_topics)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "run = bm25(df_lemmatized_topics)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The run file is normalized outside the TIRA sandbox, I will store it at \"../runs\".\n", + "Done. run file is stored under \"../runs/run.txt\".\n" + ] + } + ], + "source": [ + "persist_and_normalize_run(run, system_name='bm25-lemmatizer', default_output='../runs')" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
namemaprecip_rankP_1000map +map -map p-valuerecip_rank +recip_rank -recip_rank p-valueP_1000 +P_1000 -P_1000 p-value
0BM250.2623110.5798770.016191NoneNoneNoneNoneNoneNoneNoneNoneNone
\n", + "
" + ], + "text/plain": [ + " name map recip_rank P_1000 map + map - map p-value recip_rank + \\\n", + "0 BM25 0.262311 0.579877 0.016191 None None None None \n", + "\n", + " recip_rank - recip_rank p-value P_1000 + P_1000 - P_1000 p-value \n", + "0 None None None None None " + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Localtest\n", + "pt.Experiment(\n", + " [bm25], \n", + " topics, \n", + " pt_dataset.get_qrels(), \n", + " eval_metrics=['P_1000', 'map', 'recip_rank'],\n", + " names=['BM25'],\n", + " baseline=0\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
qiddociddocnorankscorequerysystem
01948582004.cikm_conference-2004.47015.681777retrieval system improving effectivenessbm25-lemmatizer
111251371989.ipm_journal-ir0volumeA25A4.2115.047380retrieval system improving effectivenessbm25-lemmatizer
211258172005.ipm_journal-ir0volumeA41A5.11214.144223retrieval system improving effectivenessbm25-lemmatizer
315868W05-0704314.025748retrieval system improving effectivenessbm25-lemmatizer
41848762016.ntcir_conference-2016.90413.947994retrieval system improving effectivenessbm25-lemmatizer
51824721998.sigirconf_conference-98.15513.901647retrieval system improving effectivenessbm25-lemmatizer
61944152008.cikm_conference-2008.183613.808208retrieval system improving effectivenessbm25-lemmatizer
7117496O01-2005713.749449retrieval system improving effectivenessbm25-lemmatizer
81824901998.sigirconf_conference-98.33813.735541retrieval system improving effectivenessbm25-lemmatizer
911248012006.ipm_journal-ir0volumeA42A3.2913.569263retrieval system improving effectivenessbm25-lemmatizer
\n", + "
" + ], + "text/plain": [ + " qid docid docno rank score \\\n", + "0 1 94858 2004.cikm_conference-2004.47 0 15.681777 \n", + "1 1 125137 1989.ipm_journal-ir0volumeA25A4.2 1 15.047380 \n", + "2 1 125817 2005.ipm_journal-ir0volumeA41A5.11 2 14.144223 \n", + "3 1 5868 W05-0704 3 14.025748 \n", + "4 1 84876 2016.ntcir_conference-2016.90 4 13.947994 \n", + "5 1 82472 1998.sigirconf_conference-98.15 5 13.901647 \n", + "6 1 94415 2008.cikm_conference-2008.183 6 13.808208 \n", + "7 1 17496 O01-2005 7 13.749449 \n", + "8 1 82490 1998.sigirconf_conference-98.33 8 13.735541 \n", + "9 1 124801 2006.ipm_journal-ir0volumeA42A3.2 9 13.569263 \n", + "\n", + " query system \n", + "0 retrieval system improving effectiveness bm25-lemmatizer \n", + "1 retrieval system improving effectiveness bm25-lemmatizer \n", + "2 retrieval system improving effectiveness bm25-lemmatizer \n", + "3 retrieval system improving effectiveness bm25-lemmatizer \n", + "4 retrieval system improving effectiveness bm25-lemmatizer \n", + "5 retrieval system improving effectiveness bm25-lemmatizer \n", + "6 retrieval system improving effectiveness bm25-lemmatizer \n", + "7 retrieval system improving effectiveness bm25-lemmatizer \n", + "8 retrieval system improving effectiveness bm25-lemmatizer \n", + "9 retrieval system improving effectiveness bm25-lemmatizer " + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "run.head(10)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}