Commit
Showing 2 changed files with 143 additions and 0 deletions.
automation-api/yival_experiments/custom_configuration/claude_evaluator.py (125 additions, 0 deletions)
@@ -0,0 +1,125 @@
""" | ||
ClaudeEvaluator is an evaluator that uses Anthropic's Claude model for evaluations. | ||
The evaluator interfaces with Claude via litellm to present tasks and interpret | ||
the model's responses to determine the quality or correctness of a given | ||
experiment result. | ||
""" | ||
import copy | ||
import logging | ||
|
||
import litellm | ||
from claude_evaluator_config import ClaudeEvaluatorConfig | ||
from evaluator_common import ( | ||
CLASSIFY_STR, | ||
calculate_choice_score, | ||
choices_to_string, | ||
completion_with_backpff, | ||
extract_choice_from_response, | ||
format_template, | ||
) | ||
from yival.evaluators.base_evaluator import BaseEvaluator | ||
from yival.schemas.evaluator_config import ( | ||
EvaluatorOutput, | ||
EvaluatorType, | ||
MethodCalculationMethod, | ||
MetricCalculatorConfig, | ||
) | ||
from yival.schemas.experiment_config import ( | ||
ExperimentResult, | ||
InputData, | ||
MultimodalOutput, | ||
) | ||
|
||
logging.basicConfig(level=logging.INFO) | ||
logger = logging.getLogger(__name__) | ||


class ClaudeEvaluator(BaseEvaluator):
    """Evaluator using Claude for evaluation."""

    default_config = ClaudeEvaluatorConfig(name="claude_evaluator")  # type: ignore

    def __init__(self, config: ClaudeEvaluatorConfig):
        super().__init__(config)
        self.config = config

    def evaluate(self, experiment_result: ExperimentResult) -> EvaluatorOutput:
        """Evaluate the experiment result using Claude."""
        # Expose the experiment input plus the model's raw output as template
        # variables for the evaluation prompt.
        format_dict = copy.deepcopy(experiment_result.input_data.content)
        format_dict["raw_output"] = experiment_result.raw_output.text_output

        prompt = format_template(self.config.prompt, format_dict)
        if isinstance(prompt, str):
            prompt = [{"role": "user", "content": prompt}]

        # Append the classification instructions and the allowed choices to
        # the last message.
        prompt[-1]["content"] += "\n\n" + CLASSIFY_STR.format(
            choices=choices_to_string(self.config.choices)
        )
        # Note: "backpff" matches the helper's name as exported by evaluator_common.
        response = completion_with_backpff(
            model=self.config.model_name,
            messages=prompt,
            temperature=0.0,
            n=1,
            max_tokens=2000,
            request_timeout=60,
            caching=True,
        )
        response_content = response["choices"][0]["message"]["content"]
        choice = extract_choice_from_response(response_content, self.config.choices)
        score = calculate_choice_score(choice, self.config.choice_scores)
        # Fall back to the raw choice label when no numeric score is configured.
        return EvaluatorOutput(
            name=self.config.name,
            result=score if score is not None else choice,
            display_name=self.config.display_name,
            metric_calculators=self.config.metric_calculators,
        )


BaseEvaluator.register_evaluator(
    "claude_evaluator", ClaudeEvaluator, ClaudeEvaluatorConfig
)


def main():
    """Main function to test the ClaudeEvaluator."""
    from example_evaluator_data import (
        choice_scores,
        choices,
        content,
        prompt,
        raw_output,
    )

    # Turn on verbose litellm logging while exercising the evaluator end to end.
    litellm.set_verbose = True

    evaluator_config = ClaudeEvaluatorConfig(
        name="claude_evaluator",
        display_name="correctness test",
        metric_calculators=[
            MetricCalculatorConfig(
                MethodCalculationMethod(MethodCalculationMethod.AVERAGE)
            )
        ],
        prompt=prompt,
        choices=choices,
        evaluator_type=EvaluatorType.INDIVIDUAL,
        choice_scores=choice_scores,
    )
    input_data_example = InputData(content=content)

    experiment_result_example = ExperimentResult(
        input_data=input_data_example,
        combination={"wrapper1": "var1", "wrapper2": "var2"},
        raw_output=MultimodalOutput(text_output=raw_output),
        latency=150.0,
        token_usage=50,
    )

    evaluator = ClaudeEvaluator(evaluator_config)
    result = evaluator.evaluate(experiment_result_example)
    print("Result: ", result.result)


if __name__ == "__main__":
    main()
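
main() above imports its test fixtures from an example_evaluator_data module that is not part of this commit. A minimal sketch of what such a module might look like, assuming a string template with {question} and {raw_output} placeholders and letter-grade choices on the 0-4 scale named in the config; every name and value below is illustrative, not taken from the commit:

# example_evaluator_data.py (hypothetical fixture module, not in this commit)
prompt = (
    "You are assessing a submitted answer to a question.\n"
    "[Question]: {question}\n"
    "[Submission]: {raw_output}\n"
    "Grade how correct the submission is."
)
choices = ["A", "B", "C", "D", "E"]
# Illustrative mapping of each choice to a score on the evaluator's 0-4 scale.
choice_scores = {"A": 4.0, "B": 3.0, "C": 2.0, "D": 1.0, "E": 0.0}
content = {"question": "What is the capital of France?"}
raw_output = "The capital of France is Paris."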
automation-api/yival_experiments/custom_configuration/claude_evaluator_config.py (18 additions, 0 deletions)
@@ -0,0 +1,18 @@
from dataclasses import asdict, dataclass, field
from typing import Any, Dict, List, Optional, Union

from yival.schemas.evaluator_config import EvaluatorConfig, EvaluatorType


@dataclass
class ClaudeEvaluatorConfig(EvaluatorConfig):
    evaluator_type: EvaluatorType = EvaluatorType.INDIVIDUAL
    prompt: Union[str, List[Dict[str, str]]] = ""
    choices: List[str] = field(default_factory=list)
    model_name: str = "claude-3-5-sonnet-20241022"
    description: str = "This is an evaluator that uses Anthropic's Claude model."
    scale_description: str = "0-4"
    choice_scores: Optional[Dict[str, float]] = None

    def asdict(self) -> Dict[str, Any]:
        # Inside the method body, "asdict" resolves to the module-level
        # dataclasses.asdict import, not to this method.
        return asdict(self)
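
The asdict() override makes the config easy to serialize. A quick usage sketch (the printed keys are abridged, and the exact base-class fields come from yival's EvaluatorConfig):

config = ClaudeEvaluatorConfig(name="claude_evaluator", choices=["A", "B"])
# Produces a plain dict suitable for logging or dumping to JSON/YAML, e.g.
# {"name": "claude_evaluator", "model_name": "claude-3-5-sonnet-20241022", ...}
print(config.asdict())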