From efa7eee00f1b0dbfc476a29dba53b8cf49291b2d Mon Sep 17 00:00:00 2001
From: Roger Yang <80478925+RogerHYang@users.noreply.github.com>
Date: Wed, 29 Jan 2025 14:07:51 -0800
Subject: [PATCH] fix(client): exclude empty list of tools in sdk helper functions (#6203)

---
 .../client/helpers/sdk/anthropic/messages.py  |   4 +-
 .../phoenix/client/helpers/sdk/openai/chat.py |   4 +-
 .../internal/prompts/hallucination_eval.ipynb | 261 ++++++++++++++++++
 3 files changed, 265 insertions(+), 4 deletions(-)
 create mode 100644 tutorials/internal/prompts/hallucination_eval.ipynb

diff --git a/packages/phoenix-client/src/phoenix/client/helpers/sdk/anthropic/messages.py b/packages/phoenix-client/src/phoenix/client/helpers/sdk/anthropic/messages.py
index 0dd8ac1993..4163b885ff 100644
--- a/packages/phoenix-client/src/phoenix/client/helpers/sdk/anthropic/messages.py
+++ b/packages/phoenix-client/src/phoenix/client/helpers/sdk/anthropic/messages.py
@@ -115,8 +115,8 @@ def _to_model_kwargs(
         except (ValueError, TypeError):
             pass
 
-    if obj.tools:
-        ans["tools"] = list(_to_tools(obj.tools))
+    if obj.tools and (tools := list(_to_tools(obj.tools))):
+        ans["tools"] = tools
     if (tool_choice := parameters.get("tool_choice")) is not None:
         if tool_choice == "any":
             ans["tool_choice"] = {"type": "any"}
diff --git a/packages/phoenix-client/src/phoenix/client/helpers/sdk/openai/chat.py b/packages/phoenix-client/src/phoenix/client/helpers/sdk/openai/chat.py
index c2ca2a91d2..12892b7928 100644
--- a/packages/phoenix-client/src/phoenix/client/helpers/sdk/openai/chat.py
+++ b/packages/phoenix-client/src/phoenix/client/helpers/sdk/openai/chat.py
@@ -78,8 +78,8 @@ def _to_model_kwargs(
     ans: _ModelKwargs = {
         "model": obj.model_name,
     }
-    if obj.tools:
-        ans["tools"] = list(_to_tools(obj.tools))
+    if obj.tools and (tools := list(_to_tools(obj.tools))):
+        ans["tools"] = tools
     parameters = obj.invocation_parameters or {}
     if (v := parameters.get("temperature")) is not None:
         try:
diff --git a/tutorials/internal/prompts/hallucination_eval.ipynb b/tutorials/internal/prompts/hallucination_eval.ipynb
new file mode 100644
index 0000000000..6528cf6c10
--- /dev/null
+++ b/tutorials/internal/prompts/hallucination_eval.ipynb
@@ -0,0 +1,261 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "initial_id",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "from secrets import token_hex\n",
+    "\n",
+    "import groq\n",
+    "import nest_asyncio\n",
+    "import openai\n",
+    "import pandas as pd\n",
+    "from dotenv import load_dotenv\n",
+    "from openinference.instrumentation.groq import GroqInstrumentor\n",
+    "from openinference.instrumentation.openai import OpenAIInstrumentor\n",
+    "from sklearn.metrics import accuracy_score\n",
+    "\n",
+    "import phoenix as px\n",
+    "from phoenix.client import Client\n",
+    "from phoenix.client.utils import to_chat_messages_and_kwargs\n",
+    "from phoenix.experiments import run_experiment\n",
+    "from phoenix.otel import register\n",
+    "\n",
+    "nest_asyncio.apply()\n",
+    "if (env_file := Path.home() / \".env\").exists():\n",
+    "    load_dotenv(env_file)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e7662ee0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tracer_provider = register()\n",
+    "OpenAIInstrumentor().instrument(tracer_provider=tracer_provider)\n",
+    "GroqInstrumentor().instrument(tracer_provider=tracer_provider)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1ef4c16985598a41",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "url = \"https://raw.githubusercontent.com/RUCAIBox/HaluEval/refs/heads/main/data/qa_data.json\"\n",
+    "qa = pd.read_json(url, lines=True)\n",
+    "qa.sample(5).iloc[:, ::-1]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a25ad645",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "k = qa.iloc[:, :2]\n",
+    "df = pd.concat(\n",
+    "    [\n",
+    "        pd.concat([k, qa.iloc[:, 2].rename(\"answer\")], axis=1).assign(true_label=\"factual\"),\n",
+    "        pd.concat([k, qa.iloc[:, 3].rename(\"answer\")], axis=1).assign(true_label=\"hallucinated\"),\n",
+    "    ]\n",
+    ")\n",
+    "df = df.sample(10, random_state=42).reset_index(drop=True).iloc[:, ::-1]\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "88dc3cc5",
+   "metadata": {},
+   "source": [
+    "# Get Prompt"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "14b09dc0",
+   "metadata": {},
+   "source": [
+    "https://github.com/Arize-ai/phoenix/blob/390cfaa42c5b2c28d3f9f83fbf7c694b8c2beeff/packages/phoenix-evals/src/phoenix/evals/default_templates.py#L56"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "593ec84e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prompt = Client().prompts.get(prompt_identifier=\"hallu-eval\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cb0e0cba",
+   "metadata": {},
+   "source": [
+    "# GPT 4o Mini"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d0c38fc6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def openai_eval(input):\n",
+    "    messages, kwargs = to_chat_messages_and_kwargs(prompt, variables=dict(input))\n",
+    "    response = openai.OpenAI().chat.completions.create(messages=messages, **kwargs)\n",
+    "    return {\"label\": response.choices[0].message.content}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4b137880",
+   "metadata": {},
+   "source": [
+    "### DataFrame Apply"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bd9bdca3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gpt_result = pd.concat([pd.json_normalize(df.apply(openai_eval, axis=1)), df.true_label], axis=1)\n",
+    "print(f\"Accuracy: {accuracy_score(gpt_result.true_label, gpt_result.label) * 100:.0f}%\")\n",
+    "gpt_result"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d0ea3a27",
+   "metadata": {},
+   "source": [
+    "# Upload Dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1cdbed73",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds = px.Client().upload_dataset(\n",
+    "    dataframe=df,\n",
+    "    dataset_name=\"hallu-eval-\" + token_hex(),\n",
+    "    input_keys=[\"question\", \"knowledge\", \"answer\"],\n",
+    "    output_keys=[\"true_label\"],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0dd97ebe",
+   "metadata": {},
+   "source": [
+    "# Run Experiment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ef97ad99",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "run_experiment(ds, openai_eval)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2634281b",
+   "metadata": {},
+   "source": [
+    "# DeepSeek via Groq"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "afc4b171",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "async def groq_eval(input, model=\"deepseek-r1-distill-llama-70b\"):\n",
+    "    messages, *_ = to_chat_messages_and_kwargs(prompt, variables=dict(input))\n",
+    "    response = await groq.AsyncGroq().chat.completions.create(messages=messages, model=model)\n",
+    "    return {\"label\": response.choices[0].message.content}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cb856e29",
+   "metadata": {},
+   "source": [
+    "### Run Experiment"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ff731996",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "exp = run_experiment(ds, groq_eval)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ea96b80b",
+   "metadata": {},
+   "source": [
+    "### Extract the Last Word to Calculate Accuracy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c66d9f03",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "labels = pd.json_normalize(exp.as_dataframe().output).label.str.split(\"\\n\").str[-1]\n",
+    "result = pd.concat([labels, df.true_label], axis=1)\n",
+    "print(f\"Accuracy: {accuracy_score(result.true_label, result.label) * 100:.0f}%\")\n",
+    "result"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b24e8ee1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.concat([gpt_result.label.rename(\"gpt\"), result.rename({\"label\": \"deepseek\"}, axis=1)], axis=1)"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}