update how to run eval and consensus
dannylee1020 committed Dec 20, 2024
1 parent 752d398 commit 64bff24
Showing 4 changed files with 121 additions and 143 deletions.
6 changes: 3 additions & 3 deletions openpo/client.py
@@ -3,7 +3,7 @@
from typing import Any, Dict, List, Optional

from .internal.error import AuthenticationError, ProviderError
from .internal.response import ChatCompletionOutput, ChatCompletionStreamOutput
from .internal.response import ChatCompletionOutput
from .resources.batch.batch import Batch
from .resources.eval.eval import Evaluation
from .resources.provider import Anthropic, HuggingFace, OpenAI, OpenRouter
@@ -73,7 +73,7 @@ def completions(
models: List[str],
messages: List[Dict[str, Any]],
params: Optional[Dict[str, Any]] = None,
) -> List[ChatCompletionOutput | ChatCompletionStreamOutput]:
) -> List[ChatCompletionOutput]:
"""Generate completions using the specified LLM provider.
Args:
@@ -112,7 +112,7 @@ def completions(
return responses

@property
def eval(self):
def evaluate(self):
return self._eval

@property
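For context, a minimal usage sketch of the renamed property. It assumes the client class is named OpenPO, is exported from the package root, and picks up provider API keys from the environment; none of that is visible in this diff.

from openpo import OpenPO  # assumed import path; the client class is not part of this diff

client = OpenPO()

# The Evaluation resource that used to be reached as client.eval is now
# exposed as client.evaluate; the underlying resource object is unchanged.
evaluator = client.evaluate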
90 changes: 50 additions & 40 deletions openpo/resources/batch/batch.py
@@ -16,15 +16,15 @@ def __init__(self, client):

def eval(
self,
model: str,
models: List[str],
questions: List[str],
responses: List[List[str]],
prompt: Optional[str] = None,
):
"""Use single LLM-as-a-judge method to evaluate responses for building preference data.
"""Use input model as a judge to evaluate responses.
Args:
model (str): Model identifier to use as a judge. Follows provider/model-identifier format.
models (List[str]): List of model identifiers to use as judges. Follows provider/model-identifier format.
questions (List[str]): Questions for each response pair.
responses (List[List[str]]): Pairwise responses to evaluate.
prompt (str): Optional custom prompt for judge model to follow.
@@ -37,21 +37,26 @@ def eval(
ValueError: If the model format is invalid or provider is not supported.
"""
try:
provider = self.client._get_model_provider(model)
model_id = self.client._get_model_id(model)

if provider not in ["openai", "anthropic"]:
raise ProviderError(provider, "Provider not supported for evaluation")

llm = self.client._get_provider_instance(provider=provider)
res = llm.generate_batch(
model=model_id,
questions=questions,
responses=responses,
prompt=prompt if prompt else None,
)
result = []
for m in models:
provider = self.client._get_model_provider(m)
model_id = self.client._get_model_id(m)

if provider not in ["openai", "anthropic"]:
raise ProviderError(
provider, "Provider not supported for evaluation"
)

return res
llm = self.client._get_provider_instance(provider=provider)
res = llm.generate_batch(
model=model_id,
questions=questions,
responses=responses,
prompt=prompt if prompt else None,
)

result.append(res)
return result
except (AuthenticationError, ValueError) as e:
raise e
except Exception as e:
@@ -87,17 +92,17 @@ def load_batch(self, filename: str, provider: str):

def get_consensus(
self,
batch_openai: List,
batch_anthropic: List,
):
"""Get consensus between OpenAI and Anthropic batch results.
batch_A: List,
batch_B: List,
) -> List[Dict]:
"""Reach consensus between two batch results.
Args:
batch_openai (List): List of batch results from OpenAI
batch_anthropic (List): List of batch results from Anthropic
batch_A (List): List of batch results to compare
batch_B (List): List of batch results to compare
Returns:
List: List of evaluation results where both providers agreed on the rank
List[Dict]: List of evaluation results where both providers agree on the rank.
Raises:
Exception: If there's an error processing the batch results
@@ -107,30 +112,35 @@ def get_consensus(
# Only requires a single pass over the batch data to reach consensus.
res = []
check = {}
for r in batch_openai:
try:
for r in batch_A:
# OpenAI batch results are dicts; Anthropic batch results are response objects
if isinstance(r, dict):
custom_id = r["custom_id"]
content = json.loads(
r["response"]["body"]["choices"][0]["message"]["content"]
)
check[custom_id] = content["evaluation"][0]["rank"]
except (KeyError, json.JSONDecodeError) as e:
continue # Skip malformed entries
else:
custom_id = r.custom_id
content = r.result.message.content[0].input

check[custom_id] = content["evaluation"][0]["rank"]

for r in batch_anthropic:
try:
for r in batch_B:
if isinstance(r, dict):
custom_id = r["custom_id"]
content = json.loads(
r["response"]["body"]["choices"][0]["message"]["content"]
)
else:
custom_id = r.custom_id
content = r.result.message.content[0].input

# Check if same custom_id exists in OpenAI results and ranks match
if (
custom_id in check
and check[custom_id] == content["evaluation"][0]["rank"]
):
record = {"q_index": custom_id} | content["evaluation"][0]
res.append(record)
except (KeyError, AttributeError) as e:
continue
if (
custom_id in check
and check[custom_id] == content["evaluation"][0]["rank"]
):
record = {"q_index": custom_id} | content["evaluation"][0]
res.append(record)

return res
except Exception as e:
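A sketch of the reworked batch flow under a few assumptions: the client class is named OpenPO, the Batch resource is exposed as client.batch, the model identifiers and filenames below are placeholders, and load_batch returns results in the shape get_consensus expects. Only the eval, load_batch, and get_consensus signatures are confirmed by this diff.

from openpo import OpenPO  # assumed import path

client = OpenPO()

questions = ["What causes the seasons on Earth?"]
responses = [["The tilt of the Earth's axis.", "The Earth's distance from the Sun."]]

# eval() now takes a list of judge models (openai/anthropic only) and
# submits one batch per judge instead of a single job.
batches = client.batch.eval(
    models=["openai/gpt-4o", "anthropic/claude-3-5-sonnet-20241022"],
    questions=questions,
    responses=responses,
)

# Once the provider-side batch jobs finish, load each result file and keep
# only the evaluations where both judges assigned the same rank.
batch_a = client.batch.load_batch(filename="openai_results.jsonl", provider="openai")
batch_b = client.batch.load_batch(filename="anthropic_results.jsonl", provider="anthropic")
consensus = client.batch.get_consensus(batch_A=batch_a, batch_B=batch_b)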
166 changes: 67 additions & 99 deletions openpo/resources/eval/eval.py
@@ -1,5 +1,5 @@
import json
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Union

from openpo.internal.error import AuthenticationError, ProviderError
from openpo.resources.provider import Anthropic, OpenAI
@@ -14,138 +14,106 @@ def _get_model_consensus(
res_a: List[Dict],
res_b: List[Dict],
) -> List[int]:

matching_indices = []
for i, (a, b) in enumerate(zip(res_a, res_b)):
if a.get("q_index") == b.get("q_index") and a["rank"] == b["rank"]:
matching_indices.append(a.get("q_index", i))

return matching_indices

def eval_single(
self,
model: str,
questions: List[str],
responses: List[List[str]],
prompt: Optional[str] = None,
):
"""Use single LLM-as-a-judge method to evaluate responses for building preference data.
Args:
model (str): Model identifier to use as a judge. Follows provider/model-identifier format.
questions (List(str)): Questions for each response pair.
responses (List[List[str]]): Pairwise responses to evaluate.
prompt (str): Optional custom prompt for judge model to follow.
Returns (Dict): The evaluation data for responses with preferred, rejected, confidence_score and reason.
def _validate_provider(self, provider: str) -> None:
if provider not in ["openai", "anthropic"]:
raise ProviderError(provider, "Provider not supported for evaluation")

Raises:
AuthenticationError: If required API keys are missing or invalid.
ProviderError: For provider-specific errors during evaluation.
ValueError: If the model format is invalid or provider is not supported.
"""
def _parse_response(self, response) -> List[Dict]:
try:
provider = self.client._get_model_provider(model)
model_id = self.client._get_model_id(model)

if provider not in ["openai", "anthropic"]:
raise ProviderError(provider, "Provider not supported for evaluation")

llm = self.client._get_provider_instance(provider=provider)
res = llm.generate(
model=model_id,
questions=questions,
responses=responses,
prompt=prompt if prompt else None,
)

if provider == "anthropic":
result = res.content[0].input["evaluation"]
result = json.loads(res.choices[0].message.content)["evaluation"]

return {"evaluation": result}
except (AuthenticationError, ValueError) as e:
raise e
if "chatcmpl" in response.id:
return json.loads(response.choices[0].message.content)["evaluation"]
return response.content[0].input["evaluation"]
except Exception as e:
raise ProviderError(
provider=provider, message=f"Error during evaluation: {str(e)}"
)
raise Exception(f"Error parsing model responses: {e}")

def eval_multi(
def eval(
self,
models: List[str],
questions: List[str],
responses: List[List],
responses: List[List[str]],
prompt: Optional[str] = None,
):
"""Use multiple LLMs as a judge for model consensus to evaluate responses for building preference data.
) -> List[Dict]:
"""Evaluate responses using either single or multiple LLMs as judges.
Args:
models (List): List of models to use as a judge. Follows provider/model-identifier format.
questions (List(str)): Questions for each response pair.
models (List[str]): List of model identifiers to use as judges. Follows provider/model-identifier format.
questions (List[str]): Questions for each response pair.
responses (List[List[str]]): Pairwise responses to evaluate.
prompt (str): Optional custom prompt for judge model to follow.
Returns (Dict): The evaluation data for responses that all models agree on.
- preference: Evaluation data on the input responses.
- q_index: Indices of the questions on which the models reached consensus.
Returns:
List[Dict]: The evaluation data for the responses, including preferred, rejected, confidence_score and reason.
Raises:
AuthenticationError: If required API keys are missing or invalid.
ProviderError: For provider-specific errors during evaluation.
ValueError: If the model format is invalid or required models are missing.
"""
try:
judge_a = self.client._get_provider_instance("anthropic")
judge_o = self.client._get_provider_instance("openai")

a_model = ""
o_model = ""

eval_res = []
for m in models:
provider = self.client._get_model_provider(m)
if provider == "anthropic":
a_model = self.client._get_model_id(m)
elif provider == "openai":
o_model = self.client._get_model_id(m)
else:
raise ProviderError(
provider, "Provider not supported for evaluation"
)

if not a_model or not o_model:
raise ValueError("Both Anthropic and OpenAI models must be provided")

res_a = judge_a.generate(
model=a_model,
questions=questions,
responses=responses,
prompt=prompt if prompt else None,
)
parsed_res_a = res_a.content[0].input["evaluation"]
model_id = self.client._get_model_id(m)

res_o = judge_o.generate(
model=o_model,
questions=questions,
responses=responses,
prompt=prompt if prompt else None,
)
parsed_res_o = json.loads(res_o.choices[0].message.content)["evaluation"]
self._validate_provider(provider)

idx = self._get_model_consensus(
parsed_res_a,
parsed_res_o,
)
llm = self.client._get_provider_instance(provider=provider)
res = llm.generate(
model=model_id,
questions=questions,
responses=responses,
prompt=prompt if prompt else None,
)
eval_res.append(res)

return eval_res

return {
"evaluation": [parsed_res_o[i] for i in idx],
"q_index": idx,
}
except (AuthenticationError, ValueError) as e:
raise e
except Exception as e:
raise ProviderError(
provider="eval-multi",
message=f"Error during multi-model evaluation: {str(e)}",
provider="", message=f"Error during evaluation: {str(e)}"
)

def get_consensus(self, eval_A: List, eval_B: List):
"""Reach consensus between two evaluation results
Args:
eval_A (List): List of evaluation results to compare
eval_B (List): List of evaluation results to compare
Returns:
List: List of evaluation results where both providers agreed on the rank
Raises:
Exception: If there's an error processing the evaluation results
"""
try:
parsed_a = self._parse_response(
response=eval_A,
)
parsed_b = self._parse_response(
response=eval_B,
)

res = []
check = {}

for e in parsed_a:
q_index = e["q_index"]
check[q_index] = e["rank"]

for e in parsed_b:
q_index = e["q_index"]
if q_index in check and check[q_index] == e["rank"]:
res.append(e)
return res
except Exception as e:
raise Exception(f"Error processing responses for consensus: {str(e)}")
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "openpo"
version = "0.6.0"
version = "0.6.1"
description = "Build high quality synthetic datasets with AI feedback from 200+ LLMs"
authors = ["Daniel Lee <[email protected]>"]
license = "Apache-2.0"
