update how to run eval and consensus
dannylee1020 committed Dec 20, 2024
1 parent 752d398 commit 64bff24
Showing 4 changed files with 121 additions and 143 deletions.
6 changes: 3 additions & 3 deletions openpo/client.py
@@ -3,7 +3,7 @@
from typing import Any, Dict, List, Optional

from .internal.error import AuthenticationError, ProviderError
from .internal.response import ChatCompletionOutput, ChatCompletionStreamOutput
from .internal.response import ChatCompletionOutput
from .resources.batch.batch import Batch
from .resources.eval.eval import Evaluation
from .resources.provider import Anthropic, HuggingFace, OpenAI, OpenRouter
@@ -73,7 +73,7 @@ def completions(
models: List[str],
messages: List[Dict[str, Any]],
params: Optional[Dict[str, Any]] = None,
) -> List[ChatCompletionOutput | ChatCompletionStreamOutput]:
) -> List[ChatCompletionOutput]:
"""Generate completions using the specified LLM provider.
Args:
@@ -112,7 +112,7 @@ def completions(
return responses

@property
def eval(self):
def evaluate(self):
return self._eval

@property
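For context, a minimal usage sketch of the renamed property. It assumes the client class is named OpenPO, is exported from the package root, and picks up provider API keys from the environment; none of that is visible in this diff.

from openpo import OpenPO  # assumed import path; the client class is not part of this diff

client = OpenPO()

# The Evaluation resource that used to be reached as client.eval is now
# exposed as client.evaluate; the underlying resource object is unchanged.
evaluator = client.evaluate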
90 changes: 50 additions & 40 deletions openpo/resources/batch/batch.py
@@ -16,15 +16,15 @@ def __init__(self, client):

def eval(
self,
model: str,
models: List[str],
questions: List[str],
responses: List[List[str]],
prompt: Optional[str] = None,
):
"""Use single LLM-as-a-judge method to evaluate responses for building preference data.
"""Use input model as a judge to evaluate responses.
Args:
model (str): Model identifier to use as a judge. Follows provider/model-identifier format.
models (List[str]): List of model identifiers to use as judges. Follows provider/model-identifier format.
questions (List[str]): Questions for each response pair.
responses (List[List[str]]): Pairwise responses to evaluate.
prompt (str): Optional custom prompt for judge model to follow.
@@ -37,21 +37,26 @@ def eval(
ValueError: If the model format is invalid or provider is not supported.
"""
try:
provider = self.client._get_model_provider(model)
model_id = self.client._get_model_id(model)

if provider not in ["openai", "anthropic"]:
raise ProviderError(provider, "Provider not supported for evaluation")

llm = self.client._get_provider_instance(provider=provider)
res = llm.generate_batch(
model=model_id,
questions=questions,
responses=responses,
prompt=prompt if prompt else None,
)
result = []
for m in models:
provider = self.client._get_model_provider(m)
model_id = self.client._get_model_id(m)

if provider not in ["openai", "anthropic"]:
raise ProviderError(
provider, "Provider not supported for evaluation"
)

return res
llm = self.client._get_provider_instance(provider=provider)
res = llm.generate_batch(
model=model_id,
questions=questions,
responses=responses,
prompt=prompt if prompt else None,
)

result.append(res)
return result
except (AuthenticationError, ValueError) as e:
raise e
except Exception as e:
@@ -87,17 +92,17 @@ def load_batch(self, filename: str, provider: str):

def get_consensus(
self,
batch_openai: List,
batch_anthropic: List,
):
"""Get consensus between OpenAI and Anthropic batch results.
batch_A: List,
batch_B: List,
) -> List[Dict]:
"""Reach consensus between two batch results.
Args:
batch_openai (List): List of batch results from OpenAI
batch_anthropic (List): List of batch results from Anthropic
batch_A (List): List of batch results to compare
batch_B (List): List of batch results to compare
Returns:
List: List of evaluation results where both providers agreed on the rank
List[Dict]: List of evaluation results where both providers agree on the rank.
Raises:
Exception: If there's an error processing the batch results
@@ -107,30 +112,35 @@ def get_consensus(
# Only requires a single pass over the batch data to reach consensus.
res = []
check = {}
for r in batch_openai:
try:
for r in batch_A:
# OpenAI batch results are dicts; Anthropic batch results are response objects
if isinstance(r, dict):
custom_id = r["custom_id"]
content = json.loads(
r["response"]["body"]["choices"][0]["message"]["content"]
)
check[custom_id] = content["evaluation"][0]["rank"]
except (KeyError, json.JSONDecodeError) as e:
continue # Skip malformed entries
else:
custom_id = r.custom_id
content = r.result.message.content[0].input

check[custom_id] = content["evaluation"][0]["rank"]

for r in batch_anthropic:
try:
for r in batch_B:
if isinstance(r, dict):
custom_id = r["custom_id"]
content = json.loads(
r["response"]["body"]["choices"][0]["message"]["content"]
)
else:
custom_id = r.custom_id
content = r.result.message.content[0].input

# Check if same custom_id exists in OpenAI results and ranks match
if (
custom_id in check
and check[custom_id] == content["evaluation"][0]["rank"]
):
record = {"q_index": custom_id} | content["evaluation"][0]
res.append(record)
except (KeyError, AttributeError) as e:
continue
if (
custom_id in check
and check[custom_id] == content["evaluation"][0]["rank"]
):
record = {"q_index": custom_id} | content["evaluation"][0]
res.append(record)

return res
except Exception as e:
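A sketch of the reworked batch flow under a few assumptions: the client class is named OpenPO, the Batch resource is exposed as client.batch, the model identifiers and filenames below are placeholders, and load_batch returns results in the shape get_consensus expects. Only the eval, load_batch, and get_consensus signatures are confirmed by this diff.

from openpo import OpenPO  # assumed import path

client = OpenPO()

questions = ["What causes the seasons on Earth?"]
responses = [["The tilt of the Earth's axis.", "The Earth's distance from the Sun."]]

# eval() now takes a list of judge models (openai/anthropic only) and
# submits one batch per judge instead of a single job.
batches = client.batch.eval(
    models=["openai/gpt-4o", "anthropic/claude-3-5-sonnet-20241022"],
    questions=questions,
    responses=responses,
)

# Once the provider-side batch jobs finish, load each result file and keep
# only the evaluations where both judges assigned the same rank.
batch_a = client.batch.load_batch(filename="openai_results.jsonl", provider="openai")
batch_b = client.batch.load_batch(filename="anthropic_results.jsonl", provider="anthropic")
consensus = client.batch.get_consensus(batch_A=batch_a, batch_B=batch_b)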
166 changes: 67 additions & 99 deletions openpo/resources/eval/eval.py
@@ -1,5 +1,5 @@
import json
from typing import Dict, List, Optional
from typing import Dict, List, Optional, Union

from openpo.internal.error import AuthenticationError, ProviderError
from openpo.resources.provider import Anthropic, OpenAI
@@ -14,138 +14,106 @@ def _get_model_consensus(
res_a: List[Dict],
res_b: List[Dict],
) -> List[int]:

matching_indices = []
for i, (a, b) in enumerate(zip(res_a, res_b)):
if a.get("q_index") == b.get("q_index") and a["rank"] == b["rank"]:
matching_indices.append(a.get("q_index", i))

return matching_indices

def eval_single(
self,
model: str,
questions: List[str],
responses: List[List[str]],
prompt: Optional[str] = None,
):
"""Use single LLM-as-a-judge method to evaluate responses for building preference data.
Args:
model (str): Model identifier to use as a judge. Follows provider/model-identifier format.
questions (List(str)): Questions for each response pair.
responses (List[List[str]]): Pairwise responses to evaluate.
prompt (str): Optional custom prompt for judge model to follow.
Returns (Dict): The evaluation data for responses with preferred, rejected, confidence_score and reason.
def _validate_provider(self, provider: str) -> None:
if provider not in ["openai", "anthropic"]:
raise ProviderError(provider, "Provider not supported for evaluation")

Raises:
AuthenticationError: If required API keys are missing or invalid.
ProviderError: For provider-specific errors during evaluation.
ValueError: If the model format is invalid or provider is not supported.
"""
def _parse_response(self, response) -> List[Dict]:
try:
provider = self.client._get_model_provider(model)
model_id = self.client._get_model_id(model)

if provider not in ["openai", "anthropic"]:
raise ProviderError(provider, "Provider not supported for evaluation")

llm = self.client._get_provider_instance(provider=provider)
res = llm.generate(
model=model_id,
questions=questions,
responses=responses,
prompt=prompt if prompt else None,
)

if provider == "anthropic":
result = res.content[0].input["evaluation"]
result = json.loads(res.choices[0].message.content)["evaluation"]

return {"evaluation": result}
except (AuthenticationError, ValueError) as e:
raise e
if "chatcmpl" in response.id:
return json.loads(response.choices[0].message.content)["evaluation"]
return response.content[0].input["evaluation"]
except Exception as e:
raise ProviderError(
provider=provider, message=f"Error during evaluation: {str(e)}"
)
raise Exception(f"Error parsing model responses: {e}")

def eval_multi(
def eval(
self,
models: List[str],
questions: List[str],
responses: List[List],
responses: List[List[str]],
prompt: Optional[str] = None,
):
"""Use multiple LLMs as a judge for model consensus to evaluate responses for building preference data.
) -> List[Dict]:
"""Evaluate responses using either single or multiple LLMs as judges.
Args:
models (List): List of models to use as a judge. Follows provider/model-identifier format.
questions (List(str)): Questions for each response pair.
models (List[str]): List of model identifiers to use as judges. Follows provider/model-identifier format.
questions (List[str]): Questions for each response pair.
responses (List[List[str]]): Pairwise responses to evaluate.
prompt (str): Optional custom prompt for judge model to follow.
Returns (Dict): The evaluation data for responses that all models agree on.
- preference: Evaluation data on the input responses.
- q_index: Indices of the questions on which the models reached consensus.
Returns:
List[Dict]: The evaluation data for the responses, including preferred, rejected, confidence_score and reason.
Raises:
AuthenticationError: If required API keys are missing or invalid.
ProviderError: For provider-specific errors during evaluation.
ValueError: If the model format is invalid or required models are missing.
"""
try:
judge_a = self.client._get_provider_instance("anthropic")
judge_o = self.client._get_provider_instance("openai")

a_model = ""
o_model = ""

eval_res = []
for m in models:
provider = self.client._get_model_provider(m)
if provider == "anthropic":
a_model = self.client._get_model_id(m)
elif provider == "openai":
o_model = self.client._get_model_id(m)
else:
raise ProviderError(
provider, "Provider not supported for evaluation"
)

if not a_model or not o_model:
raise ValueError("Both Anthropic and OpenAI models must be provided")

res_a = judge_a.generate(
model=a_model,
questions=questions,
responses=responses,
prompt=prompt if prompt else None,
)
parsed_res_a = res_a.content[0].input["evaluation"]
model_id = self.client._get_model_id(m)

res_o = judge_o.generate(
model=o_model,
questions=questions,
responses=responses,
prompt=prompt if prompt else None,
)
parsed_res_o = json.loads(res_o.choices[0].message.content)["evaluation"]
self._validate_provider(provider)

idx = self._get_model_consensus(
parsed_res_a,
parsed_res_o,
)
llm = self.client._get_provider_instance(provider=provider)
res = llm.generate(
model=model_id,
questions=questions,
responses=responses,
prompt=prompt if prompt else None,
)
eval_res.append(res)

return eval_res

return {
"evaluation": [parsed_res_o[i] for i in idx],
"q_index": idx,
}
except (AuthenticationError, ValueError) as e:
raise e
except Exception as e:
raise ProviderError(
provider="eval-multi",
message=f"Error during multi-model evaluation: {str(e)}",
provider="", message=f"Error during evaluation: {str(e)}"
)

def get_consensus(self, eval_A: List, eval_B: List):
"""Reach consensus between two evaluation results
Args:
eval_A (List): List of evaluation results to compare
eval_B (List): List of evaluation results to compare
Returns:
List: List of evaluation results where both providers agreed on the rank
Raises:
Exception: If there's an error processing the evaluation results
"""
try:
parsed_a = self._parse_response(
response=eval_A,
)
parsed_b = self._parse_response(
response=eval_B,
)

res = []
check = {}

for e in parsed_a:
q_index = e["q_index"]
check[q_index] = e["rank"]

for e in parsed_b:
q_index = e["q_index"]
if q_index in check and check[q_index] == e["rank"]:
res.append(e)
return res
except Exception as e:
raise Exception(f"Error processing responses for consensus: {str(e)}")
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "openpo"
version = "0.6.0"
version = "0.6.1"
description = "Build high quality synthetic datasets with AI feedback from 200+ LLMs"
authors = ["Daniel Lee <[email protected]>"]
license = "Apache-2.0"
