precommit

GoogleCloudPlatform · Apr 17, 2024 · e637a7a · e637a7a
1 parent 4cbbebf
commit e637a7a
Showing 1 changed file with 105 additions and 92 deletions.
diff --git a/notebooks/vertex_genai/solutions/vertex_llm_evaluation.ipynb b/notebooks/vertex_genai/solutions/vertex_llm_evaluation.ipynb
@@ -165,89 +165,6 @@
     "aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "id": "NRkfTNeaHbZd",
-    "tags": []
-   },
-   "source": [
-    "### Define helper functions"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The functions below will allow us to display AutoSxS judgments in a more readable way within the notebook.\n",
-    "They are here only for cosmetic reasons. "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 56,
-   "metadata": {
-    "id": "ivbHUDiEHd2Q",
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "def print_autosxs_judgments(df, n=3):\n",
-    "    \"\"\"Print AutoSxS judgments in the notebook\"\"\"\n",
-    "\n",
-    "    style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n",
-    "    df = df.sample(n=n)\n",
-    "\n",
-    "    for index, row in df.iterrows():\n",
-    "        if row[\"confidence\"] >= 0.5:\n",
-    "            display(\n",
-    "                HTML(\n",
-    "                    f\"<h2>Document:</h2> <div style='{style}'>{row['document']}</div>\"\n",
-    "                )\n",
-    "            )\n",
-    "            display(\n",
-    "                HTML(\n",
-    "                    f\"<h2>Response A:</h2> <div style='{style}'>{row['response_a']}</div>\"\n",
-    "                )\n",
-    "            )\n",
-    "            display(\n",
-    "                HTML(\n",
-    "                    f\"<h2>Response B:</h2> <div style='{style}'>{row['response_b']}</div>\"\n",
-    "                )\n",
-    "            )\n",
-    "            display(\n",
-    "                HTML(\n",
-    "                    f\"<h2>Explanation:</h2> <div style='{style}'>{row['explanation']}</div>\"\n",
-    "                )\n",
-    "            )\n",
-    "            display(\n",
-    "                HTML(\n",
-    "                    f\"<h2>Confidence score:</h2> <div style='{style}'>{row['confidence']}</div>\"\n",
-    "                )\n",
-    "            )\n",
-    "            display(HTML(\"<hr>\"))\n",
-    "\n",
-    "\n",
-    "def print_aggregated_metrics(scores):\n",
-    "    \"\"\"Print AutoSxS aggregated metrics\"\"\"\n",
-    "\n",
-    "    score_b = round(win_rate_metrics[\"autosxs_model_b_win_rate\"] * 100)\n",
-    "    display(\n",
-    "        HTML(\n",
-    "            f\"<h3>AutoSxS Autorater prefers {score_b}% of time Model B over Model A </h3>\"\n",
-    "        )\n",
-    "    )\n",
-    "\n",
-    "\n",
-    "def print_human_preference_metrics(metrics):\n",
-    "    \"\"\"Print AutoSxS Human-preference alignment metrics\"\"\"\n",
-    "    display(\n",
-    "        HTML(\n",
-    "            f\"<h3>AutoSxS Autorater prefers {score_b}% of time Model B over Model A </h3>\"\n",
-    "        )\n",
-    "    )"
-   ]
-  },
   {
    "cell_type": "markdown",
    "metadata": {
@@ -553,6 +470,13 @@
     "judgments_df = pd.read_json(judgments_uri, lines=True)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The next cell defines a helper function that displays a random sample of AutoSxS judgments in a readable format within the notebook."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": 107,
@@ -646,6 +570,42 @@
     }
    ],
    "source": [
+    "def print_autosxs_judgments(df, n=3):\n",
+    "    \"\"\"Display up to `n` randomly sampled AutoSxS judgments as HTML.\n",
+    "\n",
+    "    Only rows whose `confidence` is at least 0.5 are eligible, so `n`\n",
+    "    judgments are shown whenever that many confident rows exist.\n",
+    "\n",
+    "    Args:\n",
+    "        df: DataFrame with `document`, `response_a`, `response_b`,\n",
+    "            `explanation` and `confidence` columns.\n",
+    "        n: Maximum number of judgments to display.\n",
+    "    \"\"\"\n",
+    "    from html import escape  # local import keeps this cell self-contained\n",
+    "\n",
+    "    style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n",
+    "    sections = [\n",
+    "        (\"Document\", \"document\"),\n",
+    "        (\"Response A\", \"response_a\"),\n",
+    "        (\"Response B\", \"response_b\"),\n",
+    "        (\"Explanation\", \"explanation\"),\n",
+    "        (\"Confidence score\", \"confidence\"),\n",
+    "    ]\n",
+    "\n",
+    "    # Filter before sampling so low-confidence rows do not use up the sample,\n",
+    "    # and cap the sample size so a small frame does not make `sample` raise.\n",
+    "    confident = df[df[\"confidence\"] >= 0.5]\n",
+    "    sampled = confident.sample(n=min(n, len(confident)))\n",
+    "\n",
+    "    for _, row in sampled.iterrows():\n",
+    "        for title, column in sections:\n",
+    "            # Escape the cell text so markup in model output is shown literally.\n",
+    "            text = escape(str(row[column]))\n",
+    "            block = f\"<h2>{title}:</h2> <div style='{style}'>{text}</div>\"\n",
+    "            display(HTML(block))\n",
+    "        display(HTML(\"<hr>\"))\n",
+    "\n",
+    "\n",
     "print_autosxs_judgments(judgments_df)"
    ]
   },
@@ -691,7 +651,33 @@
     "\n",
     "win_rate_metrics = MessageToDict(details.outputs[\"autosxs_metrics\"]._pb)[\n",
     "    \"artifacts\"\n",
-    "][0][\"metadata\"]\n",
+    "][0][\"metadata\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The next cell contains a helper function to print AutoSxS aggregated metrics nicely in the notebook."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def print_aggregated_metrics(scores):\n",
+    "    \"\"\"Display Model B's AutoSxS win rate from `scores` as an HTML heading.\"\"\"\n",
+    "    # Read the passed-in mapping, not the notebook-level `win_rate_metrics`.\n",
+    "    score_b = round(scores[\"autosxs_model_b_win_rate\"] * 100)\n",
+    "    display(\n",
+    "        HTML(\n",
+    "            f\"<h3>AutoSxS Autorater prefers {score_b}% of time Model B over Model A </h3>\"\n",
+    "        )\n",
+    "    )\n",
+    "\n",
+    "\n",
     "print_aggregated_metrics(win_rate_metrics)"
    ]
   },
@@ -963,7 +949,31 @@
     "        \"artifacts\"\n",
     "    ][0][\"metadata\"].items()\n",
     "    if \"win_rate\" in k\n",
-    "}\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The next cell contains a helper function to print AutoSxS alignment metrics nicely in the notebook."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def print_human_preference_metrics(metrics):\n",
+    "    \"\"\"Display Model B's AutoSxS win rate from `metrics` as an HTML heading.\"\"\"\n",
+    "    # Derive the percentage from `metrics`; no `score_b` exists in this scope.\n",
+    "    # Assumes `metrics` has an \"autosxs_model_b_win_rate\" entry - TODO confirm.\n",
+    "    score_b = round(metrics[\"autosxs_model_b_win_rate\"] * 100)\n",
+    "    heading = f\"<h3>AutoSxS Autorater prefers {score_b}% of time Model B over Model A </h3>\"\n",
+    "    display(HTML(heading))\n",
+    "\n",
+    "\n",
     "pprint.pprint(human_aligned_metrics)"
    ]
   },
@@ -981,6 +991,16 @@
     "Otherwise, you can delete the individual resources you created in this tutorial."
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Acknowledgement \n",
+    "\n",
+    "This notebook is adapted from a [tutorial](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_gemini_with_autosxs.ipynb)\n",
+    "written by Ivan Nardini."
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -999,13 +1019,6 @@
     "See the License for the specific language governing permissions and\n",
     "limitations under the License."
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {