diff --git a/examples/multimodal_llm_eval/evaluate_mllm_metric_action.py b/examples/multimodal_llm_eval/evaluate_mllm_metric_action.py
index 738387b..cbdae4c 100644
--- a/examples/multimodal_llm_eval/evaluate_mllm_metric_action.py
+++ b/examples/multimodal_llm_eval/evaluate_mllm_metric_action.py
@@ -1,13 +1,12 @@
+import asyncio
 from typing import Optional
 
 import fire
 import weave
-import wandb
 
-from hemm.eval_pipelines import EvaluationPipeline
 from hemm.metrics.vqa import MultiModalLLMEvaluationMetric
 from hemm.metrics.vqa.judges.mmllm_judges import OpenAIJudge, PromptCategory
-from hemm.models import BaseDiffusionModel
+from hemm.models import DiffusersModel
 
 
 def main(
@@ -21,38 +20,28 @@ def main(
     image_height: int = 1024,
     image_width: int = 1024,
     num_inference_steps: int = 50,
-    mock_inference_dataset_address: Optional[str] = None,
-    save_inference_dataset_name: Optional[str] = None,
 ):
-    wandb.init(project=project, entity=entity, job_type="evaluation")
     weave.init(project_name=f"{entity}/{project}")
 
     dataset = weave.ref(dataset_ref).get()
     dataset = dataset.rows[:dataset_limit] if dataset_limit else dataset
 
-    diffusion_model = BaseDiffusionModel(
+    model = DiffusersModel(
         diffusion_model_name_or_path=diffusion_model_address,
         enable_cpu_offfload=diffusion_model_enable_cpu_offfload,
         image_height=image_height,
         image_width=image_width,
         num_inference_steps=num_inference_steps,
     )
-    diffusion_model._pipeline.set_progress_bar_config(disable=True)
-    evaluation_pipeline = EvaluationPipeline(
-        model=diffusion_model,
-        mock_inference_dataset_address=mock_inference_dataset_address,
-        save_inference_dataset_name=save_inference_dataset_name,
-    )
+    model._pipeline.set_progress_bar_config(disable=True)
 
     judge = OpenAIJudge(
         prompt_property=PromptCategory.action, openai_model=openai_judge_model
     )
     metric = MultiModalLLMEvaluationMetric(judge=judge)
 
-    evaluation_pipeline.add_metric(metric)
-    evaluation_pipeline(dataset=dataset)
-    wandb.finish()
-    evaluation_pipeline.cleanup()
+    evaluation = weave.Evaluation(dataset=dataset, scorers=[metric])
+    asyncio.run(evaluation.evaluate(model))
 
 
 if __name__ == "__main__":
diff --git a/examples/multimodal_llm_eval/evaluate_mllm_metric_complex.py b/examples/multimodal_llm_eval/evaluate_mllm_metric_complex.py
index 738387b..5ad871e 100644
--- a/examples/multimodal_llm_eval/evaluate_mllm_metric_complex.py
+++ b/examples/multimodal_llm_eval/evaluate_mllm_metric_complex.py
@@ -1,13 +1,12 @@
+import asyncio
 from typing import Optional
 
 import fire
 import weave
-import wandb
 
-from hemm.eval_pipelines import EvaluationPipeline
 from hemm.metrics.vqa import MultiModalLLMEvaluationMetric
 from hemm.metrics.vqa.judges.mmllm_judges import OpenAIJudge, PromptCategory
-from hemm.models import BaseDiffusionModel
+from hemm.models import DiffusersModel
 
 
 def main(
@@ -21,38 +20,28 @@ def main(
     image_height: int = 1024,
     image_width: int = 1024,
     num_inference_steps: int = 50,
-    mock_inference_dataset_address: Optional[str] = None,
-    save_inference_dataset_name: Optional[str] = None,
 ):
-    wandb.init(project=project, entity=entity, job_type="evaluation")
     weave.init(project_name=f"{entity}/{project}")
 
     dataset = weave.ref(dataset_ref).get()
     dataset = dataset.rows[:dataset_limit] if dataset_limit else dataset
 
-    diffusion_model = BaseDiffusionModel(
+    model = DiffusersModel(
         diffusion_model_name_or_path=diffusion_model_address,
         enable_cpu_offfload=diffusion_model_enable_cpu_offfload,
         image_height=image_height,
         image_width=image_width,
         num_inference_steps=num_inference_steps,
     )
-    diffusion_model._pipeline.set_progress_bar_config(disable=True)
-    evaluation_pipeline = EvaluationPipeline(
-        model=diffusion_model,
-        mock_inference_dataset_address=mock_inference_dataset_address,
-        save_inference_dataset_name=save_inference_dataset_name,
-    )
+    model._pipeline.set_progress_bar_config(disable=True)
 
     judge = OpenAIJudge(
-        prompt_property=PromptCategory.action, openai_model=openai_judge_model
+        prompt_property=PromptCategory.complex, openai_model=openai_judge_model
     )
     metric = MultiModalLLMEvaluationMetric(judge=judge)
 
-    evaluation_pipeline.add_metric(metric)
-    evaluation_pipeline(dataset=dataset)
-    wandb.finish()
-    evaluation_pipeline.cleanup()
+    evaluation = weave.Evaluation(dataset=dataset, scorers=[metric])
+    asyncio.run(evaluation.evaluate(model))
 
 
 if __name__ == "__main__":