diff --git a/automation-api/pyproject.toml b/automation-api/pyproject.toml
index dcc0548..cc210e5 100644
--- a/automation-api/pyproject.toml
+++ b/automation-api/pyproject.toml
@@ -69,6 +69,7 @@ langdetect = "^1.0.9"
 duckdb = "^0.10.2"
 duckdb-engine = "^0.12.0"
 jupysql = "^0.10.10"
+anthropic = {extras = ["vertex"], version = "^0.25.9"}
@@ -97,7 +98,6 @@ set_jupytext_as_default_viewer = "jupytext-config set-default-viewer"
 jupyter = 'jupyter-notebook'
 notebooks = ["install_kernel", "set_jupytext_as_default_viewer", "jupyter"]
 fetch_questions = "python yival_experiments/scripts/fetch_questions.py"
-generate_experiment_config = "python yival_experiments/scripts/generate_experiment_config.py"
 generate_result = "python yival_experiments/scripts/generate_result.py"
 start_redis = "docker run --rm -p 26379:6379 --name local-redis redis redis-server --save 60 1 --loglevel warning"
@@ -110,6 +110,17 @@ help = "Run a yival experiment with a given name."
   help = "Name of the experiment to run"
   options = ["-e", "--experiment"]
 
+[tool.poe.tasks.generate_experiment_config]
+shell = "python yival_experiments/scripts/generate_experiment_config.py --evaluator $evaluator"
+help = "Generate the experiment config, selecting which evaluator to use."
+
+  [[tool.poe.tasks.generate_experiment_config.args]]
+  name = "evaluator"
+  help = "Name of the evaluator"
+  options = ["-e", "--evaluator"]
+  default = "gpt4"
+
+
 
 [tool.poe]
 envfile = ".env"
diff --git a/automation-api/yival_experiments/custom_configuration/gpt4_evaluator_config.py b/automation-api/yival_experiments/custom_configuration/gpt4_evaluator_config.py
index 3fc2d01..aa9e0af 100644
--- a/automation-api/yival_experiments/custom_configuration/gpt4_evaluator_config.py
+++ b/automation-api/yival_experiments/custom_configuration/gpt4_evaluator_config.py
@@ -9,7 +9,7 @@ class GPT4EvaluatorConfig(EvaluatorConfig):
     evaluator_type: EvaluatorType = EvaluatorType.INDIVIDUAL
     prompt: Union[str, List[Dict[str, str]]] = ""
     choices: List[str] = field(default_factory=list)
-    model_name: str = "gpt-4"
+    model_name: str = "gpt-4o"
     description: str = "This is the description of the evaluator."
     scale_description: str = "0-4"
     choice_scores: Optional[Dict[str, float]] = None
diff --git a/automation-api/yival_experiments/custom_configuration/vertex_ai_evaluator.py b/automation-api/yival_experiments/custom_configuration/vertex_ai_evaluator.py
new file mode 100644
index 0000000..2eea798
--- /dev/null
+++ b/automation-api/yival_experiments/custom_configuration/vertex_ai_evaluator.py
@@ -0,0 +1,208 @@
+"""
+An evaluator that uses Vertex AI's prompt-based system for evaluations.
+
+The evaluator interfaces with the Vertex AI API to present tasks and interpret
+the model's responses to determine the quality or correctness of a given
+experiment result.
+"""
+# FIXME: this file is just about the same as gpt4_evaluator.py. We should generalize these files.
+import copy
+import logging
+import os
+import string
+from typing import Any, Dict, Iterable, List, Optional, Union
+
+# for exponential backoff
+import litellm
+from tenacity import before_sleep_log, retry, stop_after_attempt, wait_random
+from vertex_ai_evaluator_config import VertexAIEvaluatorConfig
+from yival.evaluators.base_evaluator import BaseEvaluator
+from yival.schemas.evaluator_config import (
+    EvaluatorOutput,
+    EvaluatorType,
+    MethodCalculationMethod,
+    MetricCalculatorConfig,
+)
+from yival.schemas.experiment_config import (
+    ExperimentResult,
+    InputData,
+    MultimodalOutput,
+)
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+# Claude Opus is only available in one location, so we hard-code it here.
+VERTEX_LOCATION = "us-east5"
+VERTEX_PROJECT = os.environ["VERTEXAI_PROJECT"]
+
+CLASSIFY_STR = """
+First, write out in a step by step manner your reasoning to be sure that your
+conclusion is correct.
+Avoid simply stating the correct answer at the outset.
+Then print only a single choice from {choices} (without quotes or punctuation)
+on its own line corresponding to the correct answer.
+At the end, repeat just the answer by itself on a new line.
+Reasoning:
+"""
+
+MATCH_FNS = {
+    "include": lambda x, y: float(x in y),
+    "exact": lambda x, y: float(x == y),
+    "endswith": lambda x, y: x.endswith(y),
+    "starts_or_endswith": lambda x, y: x.startswith(y) or x.endswith(y),
+}
+
+
+def extract_choice_from_response(response: str, choice_strings: Iterable[str]) -> str:
+    """Extracts the choice from the response string."""
+    lines = response.strip().split("\n")
+    for line in lines:
+        sanitized_line = "".join(
+            c for c in line if c not in string.punctuation
+        ).strip()
+        if not sanitized_line:
+            continue
+        for choice in choice_strings:
+            if MATCH_FNS["exact"](sanitized_line, choice):
+                return choice
+    return "invalid response"
+
+
+def calculate_choice_score(
+    choice: str, choice_scores: Optional[Dict[str, float]] = None
+) -> Optional[float]:
+    """Calculates the score for the given choice."""
+    if choice_scores is None:
+        return None
+    if choice == "invalid response":
+        return min(choice_scores.values())
+    return choice_scores.get(choice)
+
+
+def format_template(
+    template: Union[str, List[Dict[str, str]]], content: Dict[str, Any]
+) -> Union[str, List[Dict[str, str]]]:
+    """Formats a string or list template with the provided content."""
+    if isinstance(template, str):
+        try:
+            return template.format(**content)
+        except KeyError as e:
+            raise ValueError(f"Missing key {e} in content dictionary")
+
+    res = []
+    for t in template:
+        formatted_msg = copy.deepcopy(t)
+        try:
+            if "content" in formatted_msg:
+                formatted_msg["content"] = formatted_msg["content"].format(**content)
+        except KeyError as e:
+            raise ValueError(f"Missing key {e} in content dictionary")
+        res.append(formatted_msg)
+    return res
+
+
+@retry(
+    wait=wait_random(min=1, max=20),
+    stop=stop_after_attempt(100),
+    before_sleep=before_sleep_log(logger, logging.DEBUG),
+)
+def completion_with_backoff(**kwargs):
+    # response = openai.ChatCompletion.create(**kwargs)
+    response = litellm.completion(**kwargs)
+    return response
+
+
+def choices_to_string(choice_strings: Iterable[str]) -> str:
+    """Converts a list of choices into a formatted string."""
+    return " or ".join(f'"{choice}"' for choice in choice_strings)
+
+
+class VertexAIEvaluator(BaseEvaluator):
+    """Evaluator using VertexAI's prompt-based evaluation."""
+
+    default_config = VertexAIEvaluatorConfig(name="vertex_ai_evaluator")  # type: ignore
+
+    def __init__(self, config: VertexAIEvaluatorConfig):
+        super().__init__(config)
+        self.config = config
+
+    def evaluate(self, experiment_result: ExperimentResult) -> EvaluatorOutput:
+        """Evaluate the experiment result using Vertex AI's prompt-based evaluation."""
+        assert isinstance(self.config, VertexAIEvaluatorConfig)
+        format_dict = copy.deepcopy(experiment_result.input_data.content)
+        format_dict["raw_output"] = experiment_result.raw_output.text_output
+
+        prompt = format_template(self.config.prompt, format_dict)
+        if isinstance(prompt, str):
+            prompt = [{"role": "user", "content": prompt}]
+
+        prompt[-1]["content"] += "\n\n" + CLASSIFY_STR.format(
+            choices=choices_to_string(self.config.choices)
+        )
+        response = completion_with_backoff(
+            model=self.config.model_name,
+            messages=prompt,
+            temperature=0.0,
+            n=1,
+            max_tokens=1000,
+            request_timeout=60,
+            caching=True,
+            vertex_ai_location=VERTEX_LOCATION,
+            vertex_ai_project=VERTEX_PROJECT,
+        )
+        # response = openai.ChatCompletion.create(
+        #     model="gpt-4", messages=prompt, temperature=0.5)
+        response_content = response["choices"][0]["message"]["content"]
+        choice = extract_choice_from_response(response_content, self.config.choices)
+        score = calculate_choice_score(choice, self.config.choice_scores)
+        return EvaluatorOutput(
+            name=self.config.name,
+            result=score if score is not None else choice,
+            display_name=self.config.display_name,
+            metric_calculators=self.config.metric_calculators,
+        )
+
+
+BaseEvaluator.register_evaluator(
+    "vertex_ai_evaluator", VertexAIEvaluator, VertexAIEvaluatorConfig
+)
+
+
+def main():
+    """Main function to test the VertexAIEvaluator."""
+    evaluator_config = VertexAIEvaluatorConfig(
+        name="vertex_ai_evaluator",
+        display_name="math calculator",
+        metric_calculators=[
+            MetricCalculatorConfig(
+                MethodCalculationMethod(MethodCalculationMethod.AVERAGE)
+            )
+        ],
+        prompt="{problem}\n\n Is the answer '{raw_output}' correct? .",
+        choices=["Yes", "No"],
+        evaluator_type=EvaluatorType.INDIVIDUAL,
+        choice_scores={"Yes": 1.0, "No": 0},
+    )
+    input_data_example = InputData(
+        content={
+            "problem": "Calculate the area of a circle with radius 5.",
+            "method": "Using the formula for the area of a circle: pi*r^2",
+        }
+    )
+
+    experiment_result_example = ExperimentResult(
+        input_data=input_data_example,
+        combination={"wrapper1": "var1", "wrapper2": "var2"},
+        raw_output=MultimodalOutput(
+            text_output="The area of the circle is 78.54 square units."
+        ),
+        latency=150.0,
+        token_usage=50,
+    )
+
+    evaluator = VertexAIEvaluator(evaluator_config)
+    result = evaluator.evaluate(experiment_result_example)
+    print(result)
+
+
+if __name__ == "__main__":
+    main()
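A minimal sketch (not part of the patch) of how the answer-extraction helpers above behave, assuming it is run from the custom_configuration directory with VERTEXAI_PROJECT set so the module imports cleanly:

    from vertex_ai_evaluator import calculate_choice_score, extract_choice_from_response

    # CLASSIFY_STR asks the model to repeat the chosen option on its own line;
    # extract_choice_from_response scans each line for an exact match after
    # stripping punctuation.
    response = "The area is pi * 5**2, roughly 78.54, so the answer is correct.\nYes\n\nYes"
    choice = extract_choice_from_response(response, ["Yes", "No"])  # -> "Yes"
    score = calculate_choice_score(choice, {"Yes": 1.0, "No": 0.0})  # -> 1.0

    # A response that never states a bare choice maps to the lowest score.
    bad = extract_choice_from_response("I am not sure.", ["Yes", "No"])  # -> "invalid response"
    calculate_choice_score(bad, {"Yes": 1.0, "No": 0.0})  # -> 0.0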
diff --git a/automation-api/yival_experiments/custom_configuration/vertex_ai_evaluator_config.py b/automation-api/yival_experiments/custom_configuration/vertex_ai_evaluator_config.py
new file mode 100644
index 0000000..8e81ab5
--- /dev/null
+++ b/automation-api/yival_experiments/custom_configuration/vertex_ai_evaluator_config.py
@@ -0,0 +1,18 @@
+from dataclasses import asdict, dataclass, field
+from typing import Any, Dict, List, Optional, Union
+
+from yival.schemas.evaluator_config import EvaluatorConfig, EvaluatorType
+
+
+@dataclass
+class VertexAIEvaluatorConfig(EvaluatorConfig):
+    evaluator_type: EvaluatorType = EvaluatorType.INDIVIDUAL
+    prompt: Union[str, List[Dict[str, str]]] = ""
+    choices: List[str] = field(default_factory=list)
+    model_name: str = "vertex_ai/claude-3-opus@20240229"
+    description: str = "This is the description of the evaluator."
+    scale_description: str = "0-4"
+    choice_scores: Optional[Dict[str, float]] = None
+
+    def asdict(self) -> Dict[str, Any]:
+        return asdict(self)
diff --git a/automation-api/yival_experiments/experiment_defaults.yaml b/automation-api/yival_experiments/experiment_defaults.yaml
index a6345d3..6a58f61 100644
--- a/automation-api/yival_experiments/experiment_defaults.yaml
+++ b/automation-api/yival_experiments/experiment_defaults.yaml
@@ -13,6 +13,11 @@ custom_evaluators:
     class: ./custom_configuration/gpt4_evaluator.GPT4Evaluator
    config_cls: ./custom_configuration/gpt4_evaluator_config.GPT4EvaluatorConfig
 
+  vertex_ai_evaluator:
+    class: ./custom_configuration/vertex_ai_evaluator.VertexAIEvaluator
+    config_cls: ./custom_configuration/vertex_ai_evaluator_config.VertexAIEvaluatorConfig
+
+
 custom_function: model_compare.model_compare
 
 dataset:
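The YAML entry above only registers the evaluator class with yival; which evaluator an experiment actually uses is decided when the config is generated. With the script change below, each metric in the AI eval spreadsheet becomes an evaluator entry shaped roughly like this sketch (illustrative only; prompt, choices, and description come from the spreadsheet metric):

    {
        "evaluator_type": "individual",
        "metric_calculators": [{"method": "AVERAGE"}],
        "name": "vertex_ai_evaluator",
        "model_name": "vertex_ai/claude-3-opus@20240229",
        "prompt": "<metric prompt>",
        "choices": ["<choice 1>", "<choice 2>"],
        "description": "<metric description>",
    }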
diff --git a/automation-api/yival_experiments/scripts/generate_experiment_config.py b/automation-api/yival_experiments/scripts/generate_experiment_config.py
index 57582b7..51903a3 100644
--- a/automation-api/yival_experiments/scripts/generate_experiment_config.py
+++ b/automation-api/yival_experiments/scripts/generate_experiment_config.py
@@ -1,3 +1,4 @@
+import argparse
 import os
 from datetime import datetime
 from pathlib import Path
@@ -34,15 +35,23 @@ def str_presenter(dumper, data):
 latest_experiment_path = current_script_path / "../experiment_latest.yaml"
 
 
-def get_evaluators(ai_eval_sheet: AiEvalData):
+def get_evaluators(ai_eval_sheet: AiEvalData, evaluator_model="gpt4"):
     metrics = get_metrics(ai_eval_sheet)
     res = list()
+
+    if evaluator_model == "gpt4":
+        evaluator_name = "gpt4_evaluator"
+        model_name = "gpt-4o"
+    elif evaluator_model == "claude":
+        evaluator_name = "vertex_ai_evaluator"
+        model_name = "vertex_ai/claude-3-opus@20240229"
+
     for m in metrics:
         metric: Dict[str, Any] = dict()
         metric["evaluator_type"] = "individual"
         metric["metric_calculators"] = [{"method": "AVERAGE"}]
-        metric["name"] = "gpt4_evaluator"
-        metric["model_name"] = "gpt-4-0125-preview"
+        metric["name"] = evaluator_name
+        metric["model_name"] = model_name
         metric["prompt"] = m.prompt
         metric["choices"] = m.choices.split(", ")
         metric["description"] = m.description
@@ -93,14 +102,14 @@ def get_prompt_variations_yaml_dict(prompt_variations: List[PromptVariation]):
     return res
 
 
-def main():
+def main(evaluator_model):
     print("Reading AI eval spreadsheet")
     sheet = read_ai_eval_spreadsheet()
 
     # load default config
     config = yaml.load(open(base_configs_path, "r"), Loader=yaml.Loader)
     # metrics
-    config["evaluators"] = get_evaluators(sheet)
+    config["evaluators"] = get_evaluators(sheet, evaluator_model=evaluator_model)
     # model configs and prompt variations
     model_configs = get_model_configs(sheet)
     model_ids = {model.model_id for model, model_config in model_configs}
@@ -161,4 +170,21 @@
 
 
 if __name__ == "__main__":
-    main()
+    # Create the parser
+    parser = argparse.ArgumentParser(description="generate experiment config")
+
+    # Add the -e/--evaluator argument
+    parser.add_argument(
+        "-e",
+        "--evaluator",
+        type=str,
+        required=False,
+        default="gpt4",
+        help="Which evaluator to use: gpt4 or claude.",
+    )
+
+    # Parse the arguments
+    args = parser.parse_args()
+
+    # run main
+    main(args.evaluator)
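With the poe task and CLI flag above, generating a config that grades answers with Claude 3 Opus on Vertex AI would look like the sketch below; the gpt4 evaluator (now gpt-4o) remains the default, and running the Vertex AI evaluator afterwards requires VERTEXAI_PROJECT in the environment:

    # default: grade with the gpt4_evaluator (gpt-4o)
    poe generate_experiment_config
    # grade with Claude 3 Opus on Vertex AI
    poe generate_experiment_config --evaluator claude
    # equivalent direct invocation
    python yival_experiments/scripts/generate_experiment_config.py -e claude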