
Commit

Update Advanced_Retrieval_System
Christian A committed Jun 25, 2024
1 parent ab03764 commit 59d5d6e
Showing 11 changed files with 3,351 additions and 3,528 deletions.
233 changes: 28 additions & 205 deletions Group-8-Retrieval-System/Advanced_Retrieval_System.ipynb

Large diffs are not rendered by default.

266 changes: 133 additions & 133 deletions Group-8-Retrieval-System/DontWORK/Group-8-PorterStemmer.ipynb
@@ -1,133 +1,133 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# You only need to execute this cell if you are using Google Golab.\n",
"# If you use GitHub Codespaces, everything is already installed.\n",
"# !pip3 install tira ir-datasets python-terrier\n",
"!wget https://files.webis.de/software/pyterrier-plugins/custom-terrier-token-processing-1.0-SNAPSHOT-jar-with-dependencies.jar -O /root/.pyterrier/custom-terrier-token-processing-0.0.1.jar"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Imports\n",
"from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run\n",
"from tira.rest_api_client import Client\n",
"import pyterrier as pt\n",
"import pandas as pd\n",
"pd.set_option('display.max_colwidth', 0)\n",
"\n",
"if not pt.started():\n",
" pt.init(boot_packages=['mam10eks:custom-terrier-token-processing:0.0.1'])\n",
" from jnius import autoclass\n",
"\n",
"\n",
"# Create a REST client to the TIRA platform for retrieving the pre-indexed data.\n",
"ensure_pyterrier_is_loaded()\n",
"tira = Client()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#Index erstellen und ausführen des Batch\n",
"#PorterStemmer\n",
"pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')\n",
"\n",
"#Build Index\n",
"indexer = pt.IterDictIndexer(\"/tmp/index\", overwrite=True, stemmer='PorterStemmer')\n",
"index_ref = indexer.index(pt_dataset.get_corpus_iter())\n",
"\n",
"#Load Index\n",
"index = pt.IndexFactory.of(index_ref)\n",
"\n",
"bm25 = pt.BatchRetrieve(index, wmodel=\"BM25\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# #Localtest\n",
"# pt.Experiment(\n",
"# [bm25], \n",
"# pt_dataset.get_topics(), \n",
"# pt_dataset.get_qrels(), \n",
"# eval_metrics=['P_1000', 'map', 'recip_rank']\n",
"# # names=['BM25'],\n",
"# # baseline=0\n",
"# )\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('First, we have a short look at the first three topics:')\n",
"\n",
"pt_dataset.get_topics('text').head(3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print('Now we do the retrieval...')\n",
"run = bm25(pt_dataset.get_topics('text'))\n",
"\n",
"print('Done. Here are the first 10 entries of the run')\n",
"run.head(10)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#The output of a prototypical retrieval system is a run file. \n",
"#This run file can later (optimally in a different notebook) be statistically evaluated.\n",
"persist_and_normalize_run(run, system_name='bm25-baseline', default_output='../runs')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
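
The last cell notes that the persisted run can later be evaluated in a separate notebook. Below is a minimal sketch of that follow-up step, assuming the run was written to ../runs/run.txt (the exact file name produced by persist_and_normalize_run may differ) and that the same training topics and qrels are used:

import pyterrier as pt

if not pt.started():
    pt.init()

# Same dataset as in the retrieval notebook.
pt_dataset = pt.get_dataset('irds:ir-lab-sose-2024/ir-acl-anthology-20240504-training')

# Read the persisted TREC-style run back into a results DataFrame
# (the path is an assumption, see the note above).
run = pt.io.read_results('../runs/run.txt')

# pt.Experiment also accepts pre-computed result DataFrames, so the stored
# run can be scored directly against the training qrels.
pt.Experiment(
    [run],
    pt_dataset.get_topics('text'),
    pt_dataset.get_qrels(),
    eval_metrics=['P_1000', 'map', 'recip_rank'],
    names=['bm25-baseline'],
)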