From fcd7178e1e0e53871f6a20ae5c4be63ffbca5eaa Mon Sep 17 00:00:00 2001
From: Soumik Rakshit <19soumik.rakshit96@gmail.com>
Date: Wed, 30 Oct 2024 16:34:41 +0000
Subject: [PATCH] update: migrate multimodal LLM eval examples to weave.Evaluation

---
 .../evaluate_mllm_metric_action.py            | 23 +++++------------
 .../evaluate_mllm_metric_complex.py           | 25 ++++++-------------
 2 files changed, 13 insertions(+), 35 deletions(-)

diff --git a/examples/multimodal_llm_eval/evaluate_mllm_metric_action.py b/examples/multimodal_llm_eval/evaluate_mllm_metric_action.py
index 738387b..cbdae4c 100644
--- a/examples/multimodal_llm_eval/evaluate_mllm_metric_action.py
+++ b/examples/multimodal_llm_eval/evaluate_mllm_metric_action.py
@@ -1,13 +1,12 @@
+import asyncio
 from typing import Optional
 
 import fire
 import weave
 
-import wandb
-from hemm.eval_pipelines import EvaluationPipeline
 from hemm.metrics.vqa import MultiModalLLMEvaluationMetric
 from hemm.metrics.vqa.judges.mmllm_judges import OpenAIJudge, PromptCategory
-from hemm.models import BaseDiffusionModel
+from hemm.models import DiffusersModel
 
 
 def main(
@@ -21,38 +20,28 @@ def main(
     image_height: int = 1024,
     image_width: int = 1024,
     num_inference_steps: int = 50,
-    mock_inference_dataset_address: Optional[str] = None,
-    save_inference_dataset_name: Optional[str] = None,
 ):
-    wandb.init(project=project, entity=entity, job_type="evaluation")
     weave.init(project_name=f"{entity}/{project}")
 
     dataset = weave.ref(dataset_ref).get()
     dataset = dataset.rows[:dataset_limit] if dataset_limit else dataset
 
-    diffusion_model = BaseDiffusionModel(
+    model = DiffusersModel(
         diffusion_model_name_or_path=diffusion_model_address,
         enable_cpu_offfload=diffusion_model_enable_cpu_offfload,
         image_height=image_height,
         image_width=image_width,
         num_inference_steps=num_inference_steps,
     )
-    diffusion_model._pipeline.set_progress_bar_config(disable=True)
-    evaluation_pipeline = EvaluationPipeline(
-        model=diffusion_model,
-        mock_inference_dataset_address=mock_inference_dataset_address,
-        save_inference_dataset_name=save_inference_dataset_name,
-    )
+    model._pipeline.set_progress_bar_config(disable=True)
 
     judge = OpenAIJudge(
         prompt_property=PromptCategory.action, openai_model=openai_judge_model
     )
     metric = MultiModalLLMEvaluationMetric(judge=judge)
-    evaluation_pipeline.add_metric(metric)
 
-    evaluation_pipeline(dataset=dataset)
-    wandb.finish()
-    evaluation_pipeline.cleanup()
+    evaluation = weave.Evaluation(dataset=dataset, scorers=[metric])
+    asyncio.run(evaluation.evaluate(model))
 
 
 if __name__ == "__main__":
diff --git a/examples/multimodal_llm_eval/evaluate_mllm_metric_complex.py b/examples/multimodal_llm_eval/evaluate_mllm_metric_complex.py
index 738387b..5ad871e 100644
--- a/examples/multimodal_llm_eval/evaluate_mllm_metric_complex.py
+++ b/examples/multimodal_llm_eval/evaluate_mllm_metric_complex.py
@@ -1,13 +1,12 @@
+import asyncio
 from typing import Optional
 
 import fire
 import weave
 
-import wandb
-from hemm.eval_pipelines import EvaluationPipeline
 from hemm.metrics.vqa import MultiModalLLMEvaluationMetric
 from hemm.metrics.vqa.judges.mmllm_judges import OpenAIJudge, PromptCategory
-from hemm.models import BaseDiffusionModel
+from hemm.models import DiffusersModel
 
 
 def main(
@@ -21,38 +20,28 @@ def main(
     image_height: int = 1024,
     image_width: int = 1024,
     num_inference_steps: int = 50,
-    mock_inference_dataset_address: Optional[str] = None,
-    save_inference_dataset_name: Optional[str] = None,
 ):
-    wandb.init(project=project, entity=entity, job_type="evaluation")
     weave.init(project_name=f"{entity}/{project}")
 
     dataset = weave.ref(dataset_ref).get()
     dataset = dataset.rows[:dataset_limit] if dataset_limit else dataset
 
-    diffusion_model = BaseDiffusionModel(
+    model = DiffusersModel(
         diffusion_model_name_or_path=diffusion_model_address,
         enable_cpu_offfload=diffusion_model_enable_cpu_offfload,
         image_height=image_height,
         image_width=image_width,
         num_inference_steps=num_inference_steps,
     )
-    diffusion_model._pipeline.set_progress_bar_config(disable=True)
-    evaluation_pipeline = EvaluationPipeline(
-        model=diffusion_model,
-        mock_inference_dataset_address=mock_inference_dataset_address,
-        save_inference_dataset_name=save_inference_dataset_name,
-    )
+    model._pipeline.set_progress_bar_config(disable=True)
 
     judge = OpenAIJudge(
-        prompt_property=PromptCategory.action, openai_model=openai_judge_model
+        prompt_property=PromptCategory.complex, openai_model=openai_judge_model
     )
     metric = MultiModalLLMEvaluationMetric(judge=judge)
-    evaluation_pipeline.add_metric(metric)
 
-    evaluation_pipeline(dataset=dataset)
-    wandb.finish()
-    evaluation_pipeline.cleanup()
+    evaluation = weave.Evaluation(dataset=dataset, scorers=[metric])
+    asyncio.run(evaluation.evaluate(model))
 
 
 if __name__ == "__main__":