From efa7eee00f1b0dbfc476a29dba53b8cf49291b2d Mon Sep 17 00:00:00 2001
From: Roger Yang <80478925+RogerHYang@users.noreply.github.com>
Date: Wed, 29 Jan 2025 14:07:51 -0800
Subject: [PATCH] fix(client): exclude empty list of tools in sdk helper functions (#6203)

---
 .../client/helpers/sdk/anthropic/messages.py  |   4 +-
 .../phoenix/client/helpers/sdk/openai/chat.py |   4 +-
 .../internal/prompts/hallucination_eval.ipynb | 261 ++++++++++++++++++
 3 files changed, 265 insertions(+), 4 deletions(-)
 create mode 100644 tutorials/internal/prompts/hallucination_eval.ipynb

diff --git a/packages/phoenix-client/src/phoenix/client/helpers/sdk/anthropic/messages.py b/packages/phoenix-client/src/phoenix/client/helpers/sdk/anthropic/messages.py
index 0dd8ac1993..4163b885ff 100644
--- a/packages/phoenix-client/src/phoenix/client/helpers/sdk/anthropic/messages.py
+++ b/packages/phoenix-client/src/phoenix/client/helpers/sdk/anthropic/messages.py
@@ -115,8 +115,8 @@ def _to_model_kwargs(
         except (ValueError, TypeError):
             pass
 
-    if obj.tools:
-        ans["tools"] = list(_to_tools(obj.tools))
+    if obj.tools and (tools := list(_to_tools(obj.tools))):
+        ans["tools"] = tools
     if (tool_choice := parameters.get("tool_choice")) is not None:
         if tool_choice == "any":
             ans["tool_choice"] = {"type": "any"}
diff --git a/packages/phoenix-client/src/phoenix/client/helpers/sdk/openai/chat.py b/packages/phoenix-client/src/phoenix/client/helpers/sdk/openai/chat.py
index c2ca2a91d2..12892b7928 100644
--- a/packages/phoenix-client/src/phoenix/client/helpers/sdk/openai/chat.py
+++ b/packages/phoenix-client/src/phoenix/client/helpers/sdk/openai/chat.py
@@ -78,8 +78,8 @@ def _to_model_kwargs(
     ans: _ModelKwargs = {
         "model": obj.model_name,
     }
-    if obj.tools:
-        ans["tools"] = list(_to_tools(obj.tools))
+    if obj.tools and (tools := list(_to_tools(obj.tools))):
+        ans["tools"] = tools
     parameters = obj.invocation_parameters or {}
     if (v := parameters.get("temperature")) is not None:
         try:
diff --git a/tutorials/internal/prompts/hallucination_eval.ipynb b/tutorials/internal/prompts/hallucination_eval.ipynb
new file mode 100644
index 0000000000..6528cf6c10
--- /dev/null
+++ b/tutorials/internal/prompts/hallucination_eval.ipynb
@@ -0,0 +1,261 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "initial_id",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "from secrets import token_hex\n",
+    "\n",
+    "import groq\n",
+    "import nest_asyncio\n",
+    "import openai\n",
+    "import pandas as pd\n",
+    "from dotenv import load_dotenv\n",
+    "from openinference.instrumentation.groq import GroqInstrumentor\n",
+    "from openinference.instrumentation.openai import OpenAIInstrumentor\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "\n",
+    "import phoenix as px\n",
+    "from phoenix.client import Client\n",
+    "from phoenix.client.utils import to_chat_messages_and_kwargs\n",
+    "from phoenix.experiments import run_experiment\n",
+    "from phoenix.otel import register\n",
+    "\n",
+    "nest_asyncio.apply()\n",
+    "if (env_file := Path.home() / \".env\").exists():\n",
+    "    load_dotenv(env_file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e7662ee0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tracer_provider = register()\n",
+    "OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)\n",
+    "GroqInstrumentor().instrument(tracer_provider=tracer_provider)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1ef4c16985598a41",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "url = \"https://raw.githubusercontent.com/RUCAIBox/HaluEval/refs/heads/main/data/qa_data.json\"\n",
+    "qa = pd.read_json(url, lines=True)\n",
+    "qa.sample(5).iloc[:, ::-1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a25ad645",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "k = qa.iloc[:, :2]\n",
+    "df = pd.concat(\n",
+    "    [\n",
+    "        pd.concat([k, qa.iloc[:, 2].rename(\"answer\")], axis=1).assign(true_label=\"factual\"),\n",
+    "        pd.concat([k, qa.iloc[:, 3].rename(\"answer\")], axis=1).assign(true_label=\"hallucinated\"),\n",
+    "    ]\n",
+    ")\n",
+    "df = df.sample(10, random_state=42).reset_index(drop=True).iloc[:, ::-1]\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "88dc3cc5",
+   "metadata": {},
+   "source": [
+    "# Get Prompt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "14b09dc0",
+   "metadata": {},
+   "source": [
+    "https://github.com/Arize-ai/phoenix/blob/390cfaa42c5b2c28d3f9f83fbf7c694b8c2beeff/packages/phoenix-evals/src/phoenix/evals/default_templates.py#L56"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "593ec84e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompt = Client().prompts.get(prompt_identifier=\"hallu-eval\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cb0e0cba",
+   "metadata": {},
+   "source": [
+    "# GPT 4o Mini"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d0c38fc6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def openai_eval(input):\n",
+    "    messages, kwargs = to_chat_messages_and_kwargs(prompt, variables=dict(input))\n",
+    "    response = openai.OpenAI().chat.completions.create(messages=messages, **kwargs)\n",
+    "    return {\"label\": response.choices[0].message.content}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4b137880",
+   "metadata": {},
+   "source": [
+    "### DataFrame Apply"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bd9bdca3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gpt_result = pd.concat([pd.json_normalize(df.apply(openai_eval, axis=1)), df.true_label], axis=1)\n",
+    "print(f\"Accuracy: {accuracy_score(gpt_result.true_label, gpt_result.label) * 100:.0f}%\")\n",
+    "gpt_result"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d0ea3a27",
+   "metadata": {},
+   "source": [
+    "# Upload Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1cdbed73",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds = px.Client().upload_dataset(\n",
+    "    dataframe=df,\n",
+    "    dataset_name=\"hallu-eval-\" + token_hex(),\n",
+    "    input_keys=[\"question\", \"knowledge\", \"answer\"],\n",
+    "    output_keys=[\"true_label\"],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0dd97ebe",
+   "metadata": {},
+   "source": [
+    "# Run Experiment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ef97ad99",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "run_experiment(ds, openai_eval)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2634281b",
+   "metadata": {},
+   "source": [
+    "# DeepSeek via Groq"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "afc4b171",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "async def groq_eval(input, model=\"deepseek-r1-distill-llama-70b\"):\n",
+    "    messages, *_ = to_chat_messages_and_kwargs(prompt, variables=dict(input))\n",
+    "    response = await groq.AsyncGroq().chat.completions.create(messages=messages, model=model)\n",
+    "    return {\"label\": response.choices[0].message.content}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cb856e29",
+   "metadata": {},
+   "source": [
+    "### Run Experiment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ff731996",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "exp = run_experiment(ds, groq_eval)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ea96b80b",
+   "metadata": {},
+   "source": [
+    "### Extract the Last Word to Calculate Accuracy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c66d9f03",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "labels = pd.json_normalize(exp.as_dataframe().output).label.str.split(\"\\n\").str[-1]\n",
+    "result = pd.concat([labels, df.true_label], axis=1)\n",
+    "print(f\"Accuracy: {accuracy_score(result.true_label, result.label) * 100:.0f}%\")\n",
+    "result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b24e8ee1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.concat([gpt_result.label.rename(\"gpt\"), result.rename({\"label\": \"deepseek\"}, axis=1)], axis=1)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}