Skip to content

Commit

Permalink
fix(client): exclude empty list of tools in sdk helper functions (#6203)
Browse files Browse the repository at this point in the history
  • Loading branch information
RogerHYang authored Jan 29, 2025
1 parent ad42f06 commit efa7eee
Show file tree
Hide file tree
Showing 3 changed files with 265 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,8 @@ def _to_model_kwargs(
except (ValueError, TypeError):
pass

if obj.tools:
ans["tools"] = list(_to_tools(obj.tools))
if obj.tools and (tools := list(_to_tools(obj.tools))):
ans["tools"] = tools
if (tool_choice := parameters.get("tool_choice")) is not None:
if tool_choice == "any":
ans["tool_choice"] = {"type": "any"}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -78,8 +78,8 @@ def _to_model_kwargs(
ans: _ModelKwargs = {
"model": obj.model_name,
}
if obj.tools:
ans["tools"] = list(_to_tools(obj.tools))
if obj.tools and (tools := list(_to_tools(obj.tools))):
ans["tools"] = tools
parameters = obj.invocation_parameters or {}
if (v := parameters.get("temperature")) is not None:
try:
Expand Down
261 changes: 261 additions & 0 deletions tutorials/internal/prompts/hallucination_eval.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,261 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "initial_id",
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "from secrets import token_hex\n",
    "\n",
    "import groq\n",
    "import nest_asyncio\n",
    "import openai\n",
    "import pandas as pd\n",
    "from dotenv import load_dotenv\n",
    "from openinference.instrumentation.groq import GroqInstrumentor\n",
    "from openinference.instrumentation.openai import OpenAIInstrumentor\n",
    "from sklearn.metrics import accuracy_score\n",
    "\n",
    "import phoenix as px\n",
    "from phoenix.client import Client\n",
    "from phoenix.client.utils import to_chat_messages_and_kwargs\n",
    "from phoenix.experiments import run_experiment\n",
    "from phoenix.otel import register\n",
    "\n",
    "nest_asyncio.apply()\n",
    "# Load API keys from the home-directory .env file when present.\n",
    "env_file = Path.home() / \".env\"\n",
    "if env_file.exists():\n",
    "    load_dotenv(env_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7662ee0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Instrument both LLM clients so calls are traced to Phoenix.\n",
    "provider = register()\n",
    "OpenAIInstrumentor().instrument(tracer_provider=provider)\n",
    "GroqInstrumentor().instrument(tracer_provider=provider)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1ef4c16985598a41",
   "metadata": {},
   "outputs": [],
   "source": [
    "url = \"https://raw.githubusercontent.com/RUCAIBox/HaluEval/refs/heads/main/data/qa_data.json\"\n",
    "qa_data = pd.read_json(url, lines=True)\n",
    "qa_data.sample(5).iloc[:, ::-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a25ad645",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the factual and hallucinated answer columns into one labeled frame.\n",
    "key_cols = qa_data.iloc[:, :2]\n",
    "factual = pd.concat([key_cols, qa_data.iloc[:, 2].rename(\"answer\")], axis=1).assign(true_label=\"factual\")\n",
    "hallucinated = pd.concat([key_cols, qa_data.iloc[:, 3].rename(\"answer\")], axis=1).assign(true_label=\"hallucinated\")\n",
    "df = pd.concat([factual, hallucinated])\n",
    "df = df.sample(10, random_state=42).reset_index(drop=True).iloc[:, ::-1]\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "88dc3cc5",
   "metadata": {},
   "source": [
    "# Get Prompt"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "14b09dc0",
   "metadata": {},
   "source": [
    "https://github.com/Arize-ai/phoenix/blob/390cfaa42c5b2c28d3f9f83fbf7c694b8c2beeff/packages/phoenix-evals/src/phoenix/evals/default_templates.py#L56"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "593ec84e",
   "metadata": {},
   "outputs": [],
   "source": [
    "phoenix_client = Client()\n",
    "prompt = phoenix_client.prompts.get(prompt_identifier=\"hallu-eval\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cb0e0cba",
   "metadata": {},
   "source": [
    "# GPT 4o Mini"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0c38fc6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def openai_eval(input):\n",
    "    # Render the stored prompt with this row's variables, then call OpenAI.\n",
    "    messages, kwargs = to_chat_messages_and_kwargs(prompt, variables=dict(input))\n",
    "    client = openai.OpenAI()\n",
    "    response = client.chat.completions.create(messages=messages, **kwargs)\n",
    "    return {\"label\": response.choices[0].message.content}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4b137880",
   "metadata": {},
   "source": [
    "### DataFrame Apply"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bd9bdca3",
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions = pd.json_normalize(df.apply(openai_eval, axis=1))\n",
    "gpt_result = pd.concat([predictions, df.true_label], axis=1)\n",
    "accuracy = accuracy_score(gpt_result.true_label, gpt_result.label)\n",
    "print(f\"Accuracy: {accuracy * 100:.0f}%\")\n",
    "gpt_result"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d0ea3a27",
   "metadata": {},
   "source": [
    "# Upload Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1cdbed73",
   "metadata": {},
   "outputs": [],
   "source": [
    "ds = px.Client().upload_dataset(\n",
    "    dataset_name=\"hallu-eval-\" + token_hex(),\n",
    "    dataframe=df,\n",
    "    input_keys=[\"question\", \"knowledge\", \"answer\"],\n",
    "    output_keys=[\"true_label\"],\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0dd97ebe",
   "metadata": {},
   "source": [
    "# Run Experiment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ef97ad99",
   "metadata": {},
   "outputs": [],
   "source": [
    "run_experiment(ds, openai_eval)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2634281b",
   "metadata": {},
   "source": [
    "# DeepSeek via Groq"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "afc4b171",
   "metadata": {},
   "outputs": [],
   "source": [
    "async def groq_eval(input, model=\"deepseek-r1-distill-llama-70b\"):\n",
    "    # Keep only the rendered messages; the prompt's model kwargs are\n",
    "    # intentionally discarded in favor of the model argument.\n",
    "    messages = to_chat_messages_and_kwargs(prompt, variables=dict(input))[0]\n",
    "    client = groq.AsyncGroq()\n",
    "    response = await client.chat.completions.create(messages=messages, model=model)\n",
    "    return {\"label\": response.choices[0].message.content}"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cb856e29",
   "metadata": {},
   "source": [
    "### Run Experiment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ff731996",
   "metadata": {},
   "outputs": [],
   "source": [
    "exp = run_experiment(ds, groq_eval)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ea96b80b",
   "metadata": {},
   "source": [
    "### Extract the Last Word to Calculate Accuracy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c66d9f03",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reasoning models prepend chain-of-thought; the verdict is the final line.\n",
    "outputs = pd.json_normalize(exp.as_dataframe().output)\n",
    "labels = outputs.label.str.split(\"\\n\").str[-1]\n",
    "result = pd.concat([labels, df.true_label], axis=1)\n",
    "accuracy = accuracy_score(result.true_label, result.label)\n",
    "print(f\"Accuracy: {accuracy * 100:.0f}%\")\n",
    "result"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b24e8ee1",
   "metadata": {},
   "outputs": [],
   "source": [
    "side_by_side = [gpt_result.label.rename(\"gpt\"), result.rename({\"label\": \"deepseek\"}, axis=1)]\n",
    "pd.concat(side_by_side, axis=1)"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}

0 comments on commit efa7eee

Please sign in to comment.