
Commit

Update Advanced_Retrieval_System
Christian A committed Jun 25, 2024
1 parent ab03764 commit 59d5d6e
Showing 11 changed files with 3,351 additions and 3,528 deletions.
233 changes: 28 additions & 205 deletions Group-8-Retrieval-System/Advanced_Retrieval_System.ipynb

Large diffs are not rendered by default.

266 changes: 133 additions & 133 deletions Group-8-Retrieval-System/DontWORK/Group-8-PorterStemmer.ipynb
@@ -1,133 +1,133 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# You only need to execute this cell if you are using Google Golab.\n",
"# If you use GitHub Codespaces, everything is already installed.\n",
"# !pip3 install tira ir-datasets python-terrier\n",
"!wget https://files.webis.de/software/pyterrier-plugins/custom-terrier-token-processing-1.0-SNAPSHOT-jar-with-dependencies.jar -O /root/.pyterrier/custom-terrier-token-processing-0.0.1.jar"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Imports\n",
"from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run\n",
"from tira.rest_api_client import Client\n",
"import pyterrier as pt\n",
"import pandas as pd\n",
"pd.set_option('display.max_colwidth', 0)\n",
"\n",
"if not pt.started():\n",
" pt.init(boot_packages=['mam10eks:custom-terrier-token-processing:0.0.1'])\n",
" from jnius import autoclass\n",
"\n",
"\n",
"# Create a REST client to the TIRA platform for retrieving the pre-indexed data.\n",
"ensure_pyterrier_is_loaded()\n",
"tira = Client()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Index erstellen und ausführen des Batch\n",
"#PorterStemmer\n",
"pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')\n",
"\n",
"#Build Index\n",
"indexer = pt.IterDictIndexer(\"/tmp/index\", overwrite=True, stemmer='PorterStemmer')\n",
"index_ref = indexer.index(pt_dataset.get_corpus_iter())\n",
"\n",
"#Load Index\n",
"index = pt.IndexFactory.of(index_ref)\n",
"\n",
"bm25 = pt.BatchRetrieve(index, wmodel=\"BM25\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# #Localtest\n",
"# pt.Experiment(\n",
"# [bm25], \n",
"# pt_dataset.get_topics(), \n",
"# pt_dataset.get_qrels(), \n",
"# eval_metrics=['P_1000', 'map', 'recip_rank']\n",
"# # names=['BM25'],\n",
"# # baseline=0\n",
"# )\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('First, we have a short look at the first three topics:')\n",
"\n",
"pt_dataset.get_topics('text').head(3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('Now we do the retrieval...')\n",
"run = bm25(pt_dataset.get_topics('text'))\n",
"\n",
"print('Done. Here are the first 10 entries of the run')\n",
"run.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#The output of a prototypical retrieval system is a run file. \n",
"#This run file can later (optimally in a different notebook) be statistically evaluated.\n",
"persist_and_normalize_run(run, system_name='bm25-baseline', default_output='../runs')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
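
The last cell notes that the persisted run can later be evaluated in a separate notebook. Below is a minimal sketch of that follow-up step, assuming the run was written to ../runs/run.txt (the exact file name produced by persist_and_normalize_run may differ) and that the same training topics and qrels are used:

import pyterrier as pt

if not pt.started():
    pt.init()

# Same dataset as in the retrieval notebook.
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

# Read the persisted TREC-style run back into a results DataFrame
# (the path is an assumption, see the note above).
run = pt.io.read_results('../runs/run.txt')

# pt.Experiment also accepts pre-computed result DataFrames, so the stored
# run can be scored directly against the training qrels.
pt.Experiment(
    [run],
    pt_dataset.get_topics('text'),
    pt_dataset.get_qrels(),
    eval_metrics=['P_1000', 'map', 'recip_rank'],
    names=['bm25-baseline'],
)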