From e637a7aae3e6948416dc1eff13eef66d2432678e Mon Sep 17 00:00:00 2001 From: BenoitDherin Date: Wed, 17 Apr 2024 00:51:12 +0000 Subject: [PATCH] precommit --- .../solutions/vertex_llm_evaluation.ipynb | 197 ++++++++++-------- 1 file changed, 105 insertions(+), 92 deletions(-) diff --git a/notebooks/vertex_genai/solutions/vertex_llm_evaluation.ipynb b/notebooks/vertex_genai/solutions/vertex_llm_evaluation.ipynb index 38a0bc68..37a257db 100644 --- a/notebooks/vertex_genai/solutions/vertex_llm_evaluation.ipynb +++ b/notebooks/vertex_genai/solutions/vertex_llm_evaluation.ipynb @@ -165,89 +165,6 @@ "aiplatform.init(project=PROJECT_ID, location=REGION, staging_bucket=BUCKET_URI)" ] }, - { - "cell_type": "markdown", - "metadata": { - "id": "NRkfTNeaHbZd", - "tags": [] - }, - "source": [ - "### Define helper functions" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The functions below will allow us to display AutoSxS judgments in a more readable way within the notebook.\n", - "They are here only for cosmetic reasons. " - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "metadata": { - "id": "ivbHUDiEHd2Q", - "tags": [] - }, - "outputs": [], - "source": [ - "def print_autosxs_judgments(df, n=3):\n", - " \"\"\"Print AutoSxS judgments in the notebook\"\"\"\n", - "\n", - " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", - " df = df.sample(n=n)\n", - "\n", - " for index, row in df.iterrows():\n", - " if row[\"confidence\"] >= 0.5:\n", - " display(\n", - " HTML(\n", - " f\"

Document:

{row['document']}
\"\n", - " )\n", - " )\n", - " display(\n", - " HTML(\n", - " f\"

Response A:

{row['response_a']}
\"\n", - " )\n", - " )\n", - " display(\n", - " HTML(\n", - " f\"

Response B:

{row['response_b']}
\"\n", - " )\n", - " )\n", - " display(\n", - " HTML(\n", - " f\"

Explanation:

{row['explanation']}
\"\n", - " )\n", - " )\n", - " display(\n", - " HTML(\n", - " f\"

Confidence score:

{row['confidence']}
\"\n", - " )\n", - " )\n", - " display(HTML(\"
\"))\n", - "\n", - "\n", - "def print_aggregated_metrics(scores):\n", - " \"\"\"Print AutoSxS aggregated metrics\"\"\"\n", - "\n", - " score_b = round(win_rate_metrics[\"autosxs_model_b_win_rate\"] * 100)\n", - " display(\n", - " HTML(\n", - " f\"

AutoSxS Autorater prefers {score_b}% of time Model B over Model A

\"\n", - " )\n", - " )\n", - "\n", - "\n", - "def print_human_preference_metrics(metrics):\n", - " \"\"\"Print AutoSxS Human-preference alignment metrics\"\"\"\n", - " display(\n", - " HTML(\n", - " f\"

AutoSxS Autorater prefers {score_b}% of time Model B over Model A

\"\n", - " )\n", - " )" - ] - }, { "cell_type": "markdown", "metadata": { @@ -553,6 +470,13 @@ "judgments_df = pd.read_json(judgments_uri, lines=True)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next cell contains a helper function to print a sample from AutoSxS judgments nicely in the notebook." + ] + }, { "cell_type": "code", "execution_count": 107, @@ -646,6 +570,42 @@ } ], "source": [ + "def print_autosxs_judgments(df, n=3):\n", + " \"\"\"Print AutoSxS judgments in the notebook\"\"\"\n", + "\n", + " style = \"white-space: pre-wrap; width: 800px; overflow-x: auto;\"\n", + " df = df.sample(n=n)\n", + "\n", + " for index, row in df.iterrows():\n", + " if row[\"confidence\"] >= 0.5:\n", + " display(\n", + " HTML(\n", + " f\"

Document:

{row['document']}
\"\n", + " )\n", + " )\n", + " display(\n", + " HTML(\n", + " f\"

Response A:

{row['response_a']}
\"\n", + " )\n", + " )\n", + " display(\n", + " HTML(\n", + " f\"

Response B:

{row['response_b']}
\"\n", + " )\n", + " )\n", + " display(\n", + " HTML(\n", + " f\"

Explanation:

{row['explanation']}
\"\n", + " )\n", + " )\n", + " display(\n", + " HTML(\n", + " f\"

Confidence score:

{row['confidence']}
\"\n", + " )\n", + " )\n", + " display(HTML(\"
\"))\n", + "\n", + "\n", "print_autosxs_judgments(judgments_df)" ] }, @@ -691,7 +651,33 @@ "\n", "win_rate_metrics = MessageToDict(details.outputs[\"autosxs_metrics\"]._pb)[\n", " \"artifacts\"\n", - "][0][\"metadata\"]\n", + "][0][\"metadata\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next cell contains a helper function to print AutoSxS aggregated metrics nicely in the notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def print_aggregated_metrics(scores):\n", + " \"\"\"Print AutoSxS aggregated metrics\"\"\"\n", + "\n", + " score_b = round(win_rate_metrics[\"autosxs_model_b_win_rate\"] * 100)\n", + " display(\n", + " HTML(\n", + " f\"

AutoSxS Autorater prefers {score_b}% of time Model B over Model A

\"\n", + " )\n", + " )\n", + "\n", + "\n", "print_aggregated_metrics(win_rate_metrics)" ] }, @@ -963,7 +949,31 @@ " \"artifacts\"\n", " ][0][\"metadata\"].items()\n", " if \"win_rate\" in k\n", - "}\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The next cell contains a helper function to print AutoSxS alignment metrics nicely in the notebook." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def print_human_preference_metrics(metrics):\n", + " \"\"\"Print AutoSxS Human-preference alignment metrics\"\"\"\n", + " display(\n", + " HTML(\n", + " f\"

AutoSxS Autorater prefers {score_b}% of time Model B over Model A

\"\n", + " )\n", + " )\n", + "\n", + "\n", "pprint.pprint(human_aligned_metrics)" ] }, @@ -981,6 +991,16 @@ "Otherwise, you can delete the individual resources you created in this tutorial." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Acknowledgement \n", + "\n", + "This notebook is adapted from a [tutorial](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/evaluation/evaluate_gemini_with_autosxs.ipynb)\n", + "written by Ivan Nardini." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -999,13 +1019,6 @@ "See the License for the specific language governing permissions and\n", "limitations under the License." ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {