-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Christian A
committed
Jun 25, 2024
1 parent
ab03764
commit 59d5d6e
Showing
11 changed files
with
3,351 additions
and
3,528 deletions.
There are no files selected for viewing
233 changes: 28 additions & 205 deletions
233
Group-8-Retrieval-System/Advanced_Retrieval_System.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
266 changes: 133 additions & 133 deletions
266
Group-8-Retrieval-System/DontWORK/Group-8-PorterStemmer.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,133 +1,133 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# You only need to execute this cell if you are using Google Colab.\n", | ||
"# If you use GitHub Codespaces, everything is already installed.\n", | ||
"# !pip3 install tira ir-datasets python-terrier\n", | ||
"!wget https://files.webis.de/software/pyterrier-plugins/custom-terrier-token-processing-1.0-SNAPSHOT-jar-with-dependencies.jar -O /root/.pyterrier/custom-terrier-token-processing-0.0.1.jar" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Imports\n", | ||
"from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run\n", | ||
"from tira.rest_api_client import Client\n", | ||
"import pyterrier as pt\n", | ||
"import pandas as pd\n", | ||
"pd.set_option('display.max_colwidth', 0)\n", | ||
"\n", | ||
"if not pt.started():\n", | ||
" pt.init(boot_packages=['mam10eks:custom-terrier-token-processing:0.0.1'])\n", | ||
" from jnius import autoclass\n", | ||
"\n", | ||
"\n", | ||
"# Create a REST client to the TIRA platform for retrieving the pre-indexed data.\n", | ||
"ensure_pyterrier_is_loaded()\n", | ||
"tira = Client()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#Index erstellen und ausführen des Batch\n", | ||
"#PorterStemmer\n", | ||
"pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')\n", | ||
"\n", | ||
"#Build Index\n", | ||
"indexer = pt.IterDictIndexer(\"/tmp/index\", overwrite=True, stemmer='PorterStemmer')\n", | ||
"index_ref = indexer.index(pt_dataset.get_corpus_iter())\n", | ||
"\n", | ||
"#Load Index\n", | ||
"index = pt.IndexFactory.of(index_ref)\n", | ||
"\n", | ||
"bm25 = pt.BatchRetrieve(index, wmodel=\"BM25\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# #Localtest\n", | ||
"# pt.Experiment(\n", | ||
"# [bm25], \n", | ||
"# pt_dataset.get_topics(), \n", | ||
"# pt_dataset.get_qrels(), \n", | ||
"# eval_metrics=['P_1000', 'map', 'recip_rank']\n", | ||
"# # names=['BM25'],\n", | ||
"# # baseline=0\n", | ||
"# )\n", | ||
"\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"print('First, we have a short look at the first three topics:')\n", | ||
"\n", | ||
"pt_dataset.get_topics('text').head(3)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"print('Now we do the retrieval...')\n", | ||
"run = bm25(pt_dataset.get_topics('text'))\n", | ||
"\n", | ||
"print('Done. Here are the first 10 entries of the run')\n", | ||
"run.head(10)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#The output of a prototypical retrieval system is a run file. \n", | ||
"#This run file can later (optimally in a different notebook) be statistically evaluated.\n", | ||
"persist_and_normalize_run(run, system_name='bm25-baseline', default_output='../runs')" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.12" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# You only need to execute this cell if you are using Google Colab.\n", | ||
"# If you use GitHub Codespaces, everything is already installed.\n", | ||
"# !pip3 install tira ir-datasets python-terrier\n", | ||
"!wget https://files.webis.de/software/pyterrier-plugins/custom-terrier-token-processing-1.0-SNAPSHOT-jar-with-dependencies.jar -O /root/.pyterrier/custom-terrier-token-processing-0.0.1.jar" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Imports\n", | ||
"from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run\n", | ||
"from tira.rest_api_client import Client\n", | ||
"import pyterrier as pt\n", | ||
"import pandas as pd\n", | ||
"pd.set_option('display.max_colwidth', 0)\n", | ||
"\n", | ||
"if not pt.started():\n", | ||
" pt.init(boot_packages=['mam10eks:custom-terrier-token-processing:0.0.1'])\n", | ||
" from jnius import autoclass\n", | ||
"\n", | ||
"\n", | ||
"# Create a REST client to the TIRA platform for retrieving the pre-indexed data.\n", | ||
"ensure_pyterrier_is_loaded()\n", | ||
"tira = Client()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#Index erstellen und ausführen des Batch\n", | ||
"#PorterStemmer\n", | ||
"pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')\n", | ||
"\n", | ||
"#Build Index\n", | ||
"indexer = pt.IterDictIndexer(\"/tmp/index\", overwrite=True, stemmer='PorterStemmer')\n", | ||
"index_ref = indexer.index(pt_dataset.get_corpus_iter())\n", | ||
"\n", | ||
"#Load Index\n", | ||
"index = pt.IndexFactory.of(index_ref)\n", | ||
"\n", | ||
"bm25 = pt.BatchRetrieve(index, wmodel=\"BM25\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# #Localtest\n", | ||
"# pt.Experiment(\n", | ||
"# [bm25], \n", | ||
"# pt_dataset.get_topics(), \n", | ||
"# pt_dataset.get_qrels(), \n", | ||
"# eval_metrics=['P_1000', 'map', 'recip_rank']\n", | ||
"# # names=['BM25'],\n", | ||
"# # baseline=0\n", | ||
"# )\n", | ||
"\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"print('First, we have a short look at the first three topics:')\n", | ||
"\n", | ||
"pt_dataset.get_topics('text').head(3)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"print('Now we do the retrieval...')\n", | ||
"run = bm25(pt_dataset.get_topics('text'))\n", | ||
"\n", | ||
"print('Done. Here are the first 10 entries of the run')\n", | ||
"run.head(10)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"#The output of a prototypical retrieval system is a run file. \n", | ||
"#This run file can later (optimally in a different notebook) be statistically evaluated.\n", | ||
"persist_and_normalize_run(run, system_name='bm25-baseline', default_output='../runs')" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.12" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 2 | ||
} |
Oops, something went wrong.