-
Notifications
You must be signed in to change notification settings - Fork 340
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix(client): exclude empty list of tools in sdk helper functions (#6203)
- Loading branch information
1 parent
ad42f06
commit efa7eee
Showing
3 changed files
with
265 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,261 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "initial_id", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from pathlib import Path\n", | ||
"from secrets import token_hex\n", | ||
"\n", | ||
"import groq\n", | ||
"import nest_asyncio\n", | ||
"import openai\n", | ||
"import pandas as pd\n", | ||
"from dotenv import load_dotenv\n", | ||
"from openinference.instrumentation.groq import GroqInstrumentor\n", | ||
"from openinference.instrumentation.openai import OpenAIInstrumentor\n", | ||
"from sklearn.metrics import accuracy_score\n", | ||
"\n", | ||
"import phoenix as px\n", | ||
"from phoenix.client import Client\n", | ||
"from phoenix.client.utils import to_chat_messages_and_kwargs\n", | ||
"from phoenix.experiments import run_experiment\n", | ||
"from phoenix.otel import register\n", | ||
"\n", | ||
"nest_asyncio.apply()\n", | ||
"if (env_file := Path.home() / \".env\").exists():\n", | ||
" load_dotenv(env_file)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "e7662ee0", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Send OpenAI and Groq LLM spans to the Phoenix collector for tracing.\n", | ||
"tracer_provider = register()\n", | ||
"OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)\n", | ||
"GroqInstrumentor().instrument(tracer_provider=tracer_provider)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "1ef4c16985598a41", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# HaluEval QA split: JSON-lines where each record carries a question,\n", | ||
"# supporting knowledge, and both a factual and a hallucinated answer.\n", | ||
"url = \"https://raw.githubusercontent.com/RUCAIBox/HaluEval/refs/heads/main/data/qa_data.json\"\n", | ||
"qa = pd.read_json(url, lines=True)\n", | ||
"qa.sample(5).iloc[:, ::-1]" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "a25ad645", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Build a balanced eval frame: each question appears twice, once with its\n", | ||
"# factual answer and once with its hallucinated answer, labeled accordingly.\n", | ||
"k = qa.iloc[:, :2]\n", | ||
"df = pd.concat(\n", | ||
"    [\n", | ||
"        pd.concat([k, qa.iloc[:, 2].rename(\"answer\")], axis=1).assign(true_label=\"factual\"),\n", | ||
"        pd.concat([k, qa.iloc[:, 3].rename(\"answer\")], axis=1).assign(true_label=\"hallucinated\"),\n", | ||
"    ]\n", | ||
")\n", | ||
"# Small fixed sample keeps the demo fast; the seed makes it reproducible.\n", | ||
"df = df.sample(10, random_state=42).reset_index(drop=True).iloc[:, ::-1]\n", | ||
"df" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "88dc3cc5", | ||
"metadata": {}, | ||
"source": [ | ||
"# Get Prompt" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "14b09dc0", | ||
"metadata": {}, | ||
"source": [ | ||
"https://github.com/Arize-ai/phoenix/blob/390cfaa42c5b2c28d3f9f83fbf7c694b8c2beeff/packages/phoenix-evals/src/phoenix/evals/default_templates.py#L56" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "593ec84e", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Fetch the saved hallucination-eval prompt from Phoenix by identifier.\n", | ||
"prompt = Client().prompts.get(prompt_identifier=\"hallu-eval\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "cb0e0cba", | ||
"metadata": {}, | ||
"source": [ | ||
"# GPT 4o Mini" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "d0c38fc6", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def openai_eval(input):\n", | ||
"    # Render the stored prompt with this row's variables; `kwargs` carries the\n", | ||
"    # prompt's invocation parameters (model, etc.) through to the OpenAI call.\n", | ||
"    messages, kwargs = to_chat_messages_and_kwargs(prompt, variables=dict(input))\n", | ||
"    response = openai.OpenAI().chat.completions.create(messages=messages, **kwargs)\n", | ||
"    # Return the model's verdict text under \"label\" for downstream scoring.\n", | ||
"    return {\"label\": response.choices[0].message.content}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "4b137880", | ||
"metadata": {}, | ||
"source": [ | ||
"### DataFrame Apply" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "bd9bdca3", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Row-wise eval over the sample, then score predicted vs. true labels.\n", | ||
"gpt_result = pd.concat([pd.json_normalize(df.apply(openai_eval, axis=1)), df.true_label], axis=1)\n", | ||
"print(f\"Accuracy: {accuracy_score(gpt_result.true_label, gpt_result.label) * 100:.0f}%\")\n", | ||
"gpt_result" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "d0ea3a27", | ||
"metadata": {}, | ||
"source": [ | ||
"# Upload Dataset" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "1cdbed73", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Upload the frame to Phoenix; the random hex suffix avoids dataset-name\n", | ||
"# collisions when the notebook is re-run.\n", | ||
"ds = px.Client().upload_dataset(\n", | ||
"    dataframe=df,\n", | ||
"    dataset_name=\"hallu-eval-\" + token_hex(),\n", | ||
"    input_keys=[\"question\", \"knowledge\", \"answer\"],\n", | ||
"    output_keys=[\"true_label\"],\n", | ||
")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "0dd97ebe", | ||
"metadata": {}, | ||
"source": [ | ||
"# Run Experiment" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "ef97ad99", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Run the GPT eval as a Phoenix experiment over the uploaded dataset.\n", | ||
"run_experiment(ds, openai_eval)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "2634281b", | ||
"metadata": {}, | ||
"source": [ | ||
"# DeepSeek via Groq" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "afc4b171", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"async def groq_eval(input, model=\"deepseek-r1-distill-llama-70b\"):\n", | ||
"    # Keep only the rendered messages; the prompt's own invocation kwargs\n", | ||
"    # (including its model) are discarded so the Groq `model` arg applies.\n", | ||
"    messages, *_ = to_chat_messages_and_kwargs(prompt, variables=dict(input))\n", | ||
"    response = await groq.AsyncGroq().chat.completions.create(messages=messages, model=model)\n", | ||
"    return {\"label\": response.choices[0].message.content}" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "cb856e29", | ||
"metadata": {}, | ||
"source": [ | ||
"### Run Experiment" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "ff731996", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Keep the experiment handle so outputs can be pulled back as a DataFrame.\n", | ||
"exp = run_experiment(ds, groq_eval)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "ea96b80b", | ||
"metadata": {}, | ||
"source": [ | ||
"### Extract the Last Word to Calculate Accuracy" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "c66d9f03", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# DeepSeek-R1 emits its reasoning before the verdict, so keep only the\n", | ||
"# final line of each output as the predicted label.\n", | ||
"labels = pd.json_normalize(exp.as_dataframe().output).label.str.split(\"\\n\").str[-1]\n", | ||
"result = pd.concat([labels, df.true_label], axis=1)\n", | ||
"print(f\"Accuracy: {accuracy_score(result.true_label, result.label) * 100:.0f}%\")\n", | ||
"result" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "b24e8ee1", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"# Side-by-side comparison of both models' labels against the ground truth.\n", | ||
"pd.concat([gpt_result.label.rename(\"gpt\"), result.rename({\"label\": \"deepseek\"}, axis=1)], axis=1)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"language_info": { | ||
"name": "python" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |