diff --git a/tools/who_what_benchmark/tests/test_cli_image.py b/tools/who_what_benchmark/tests/test_cli_image.py
index 536d015612..7b966f049e 100644
--- a/tools/who_what_benchmark/tests/test_cli_image.py
+++ b/tools/who_what_benchmark/tests/test_cli_image.py
@@ -1,3 +1,4 @@
+import itertools
 import subprocess  # nosec B404
 import os
 import shutil
@@ -9,6 +10,9 @@
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+MODEL_CACHE = tempfile.mkdtemp()
+OV_IMAGE_MODELS = ["OpenVINO/stable-diffusion-v1-5-int8-ov"]
+
 
 def run_wwb(args):
     logger.info(" ".join(["TRANSFOREMRS_VERBOSITY=debug wwb"] + args))
@@ -17,6 +21,19 @@ def run_wwb(args):
     return result
 
 
+def setup_module():
+    for model_id in OV_IMAGE_MODELS:
+        MODEL_PATH = os.path.join(MODEL_CACHE, model_id.replace("/", "--"))
+        subprocess.run(["huggingface-cli", "download",
+                        model_id, "--local-dir",
+                        MODEL_PATH], capture_output=True, text=True)
+
+
+def teardown_module():
+    logger.info("Remove models")
+    shutil.rmtree(MODEL_CACHE)
+
+
 @pytest.mark.parametrize(
     ("model_id", "model_type", "backend"),
     [
@@ -25,6 +42,8 @@ def run_wwb(args):
         ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "hf"),
         ("hf-internal-testing/tiny-stable-diffusion-torch", "text-to-image", "openvino"),
         ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "text-to-image", "hf"),
+        ("hf-internal-testing/tiny-stable-diffusion-torch", "image-inpainting", "hf"),
+        ("hf-internal-testing/tiny-stable-diffusion-xl-pipe", "image-inpainting", "hf"),
     ],
 )
 def test_image_model_types(model_id, model_type, backend):
@@ -68,21 +87,13 @@ def test_image_model_types(model_id, model_type, backend):
 
 @pytest.mark.parametrize(
     ("model_id", "model_type"),
-    [
-        ("OpenVINO/LCM_Dreamshaper_v7-int8-ov", "image-to-image"),
-        ("OpenVINO/LCM_Dreamshaper_v7-int8-ov", "text-to-image"),
-    ],
+    list(itertools.product(OV_IMAGE_MODELS,
+                           ["image-to-image", "text-to-image", "image-inpainting"])),
 )
 def test_image_model_genai(model_id, model_type):
     with tempfile.TemporaryDirectory() as temp_dir:
         GT_FILE = os.path.join(temp_dir, "gt.csv")
-        MODEL_PATH = os.path.join(temp_dir, model_id.replace("/", "--"))
-
-        result = subprocess.run(["huggingface-cli", "download",
-                                 model_id, "--local-dir",
-                                 MODEL_PATH],
-                                capture_output=True, text=True)
-        assert result.returncode == 0
+        MODEL_PATH = os.path.join(MODEL_CACHE, model_id.replace("/", "--"))
 
         wwb_args = [
             "--base-model",
@@ -169,7 +180,6 @@ def test_image_model_genai(model_id, model_type):
 
         shutil.rmtree("reference", ignore_errors=True)
         shutil.rmtree("target", ignore_errors=True)
-        shutil.rmtree(MODEL_PATH, ignore_errors=True)
         shutil.rmtree(output_dir, ignore_errors=True)
diff --git a/tools/who_what_benchmark/whowhatbench/__init__.py b/tools/who_what_benchmark/whowhatbench/__init__.py
index f608601ec8..194426f208 100644
--- a/tools/who_what_benchmark/whowhatbench/__init__.py
+++ b/tools/who_what_benchmark/whowhatbench/__init__.py
@@ -3,7 +3,8 @@
 from .text_evaluator import TextEvaluator as Evaluator
 from .text2image_evaluator import Text2ImageEvaluator
 from .visualtext_evaluator import VisualTextEvaluator
-from .image2image import Image2ImageEvaluator
+from .im2im_evaluator import Image2ImageEvaluator
+from .inpaint_evaluator import InpaintingEvaluator
 
 
 __all__ = [
@@ -13,5 +14,6 @@
     "Text2ImageEvaluator",
     "VisualTextEvaluator",
     "Image2ImageEvaluator",
+    "InpaintingEvaluator",
     "EVALUATOR_REGISTRY",
 ]
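Note: with the module-level `MODEL_CACHE` and the `itertools.product` parametrization above, each OpenVINO model is downloaded once in `setup_module` and then shared by every task variant. A quick illustration of what the parametrization expands to (the loop below is illustrative only, not part of the PR):

```python
import itertools

OV_IMAGE_MODELS = ["OpenVINO/stable-diffusion-v1-5-int8-ov"]
TASKS = ["image-to-image", "text-to-image", "image-inpainting"]

# pytest collects one test_image_model_genai case per (model, task) pair.
for model_id, model_type in itertools.product(OV_IMAGE_MODELS, TASKS):
    print(model_id, model_type)
# OpenVINO/stable-diffusion-v1-5-int8-ov image-to-image
# OpenVINO/stable-diffusion-v1-5-int8-ov text-to-image
# OpenVINO/stable-diffusion-v1-5-int8-ov image-inpainting
```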
diff --git a/tools/who_what_benchmark/whowhatbench/image2image.py b/tools/who_what_benchmark/whowhatbench/im2im_evaluator.py
similarity index 100%
rename from tools/who_what_benchmark/whowhatbench/image2image.py
rename to tools/who_what_benchmark/whowhatbench/im2im_evaluator.py
diff --git a/tools/who_what_benchmark/whowhatbench/inpaint_evaluator.py b/tools/who_what_benchmark/whowhatbench/inpaint_evaluator.py
new file mode 100644
index 0000000000..c3fe0825f7
--- /dev/null
+++ b/tools/who_what_benchmark/whowhatbench/inpaint_evaluator.py
@@ -0,0 +1,133 @@
+import os
+from typing import Any, Union
+
+import datasets
+import pandas as pd
+from tqdm import tqdm
+from transformers import set_seed
+import torch
+import openvino_genai
+
+from .registry import register_evaluator
+from .text2image_evaluator import Text2ImageEvaluator
+
+from .whowhat_metrics import ImageSimilarity
+
+
+def preprocess_fn(example):
+    return {
+        "prompts": example["inpaint_caption"],
+        "images": example["coco_image"],
+        "masks": example["mask"],
+    }
+
+
+def prepare_default_data(num_samples=None):
+    DATASET_NAME = "phiyodr/InpaintCOCO"
+    NUM_SAMPLES = 10 if num_samples is None else num_samples
+    set_seed(42)
+    default_dataset = datasets.load_dataset(
+        DATASET_NAME, split="test", streaming=True
+    ).filter(lambda example: example["inpaint_caption"] != "").take(NUM_SAMPLES)
+    return default_dataset.map(
+        lambda x: preprocess_fn(x), remove_columns=default_dataset.column_names
+    )
+
+
+@register_evaluator("image-inpainting")
+class InpaintingEvaluator(Text2ImageEvaluator):
+    def __init__(
+        self,
+        base_model: Any = None,
+        gt_data: str = None,
+        test_data: Union[str, list] = None,
+        metrics="similarity",
+        similarity_model_id: str = "openai/clip-vit-large-patch14",
+        num_inference_steps=4,
+        crop_prompts=True,
+        num_samples=None,
+        gen_image_fn=None,
+        seed=42,
+        is_genai=False,
+    ) -> None:
+        assert (
+            base_model is not None or gt_data is not None
+        ), "Inpainting pipeline for evaluation or ground truth data must be defined"
+
+        self.test_data = test_data
+        self.metrics = metrics
+        self.crop_prompt = crop_prompts
+        self.num_samples = num_samples
+        self.num_inference_steps = num_inference_steps
+        self.seed = seed
+        self.similarity = None
+        self.similarity = ImageSimilarity(similarity_model_id)
+        self.last_cmp = None
+        self.gt_dir = os.path.dirname(gt_data)
+        self.generation_fn = gen_image_fn
+        self.is_genai = is_genai
+        self.resolution = None
+
+        if base_model:
+            self.gt_data = self._generate_data(
+                base_model, gen_image_fn, os.path.join(self.gt_dir, "reference")
+            )
+        else:
+            self.gt_data = pd.read_csv(gt_data, keep_default_na=False)
+
+    def _generate_data(self, model, gen_image_fn=None, image_dir="reference"):
+        def default_gen_image_fn(model, prompt, image, mask, num_inference_steps, generator=None):
+            with torch.no_grad():
+                output = model(
+                    prompt,
+                    image=image,
+                    mask_image=mask,
+                    num_inference_steps=num_inference_steps,
+                    output_type="pil",
+                    generator=generator,
+                )
+            return output.images[0]
+
+        generation_fn = gen_image_fn or default_gen_image_fn
+
+        if self.test_data:
+            if isinstance(self.test_data, str):
+                data = pd.read_csv(self.test_data)
+            else:
+                if isinstance(self.test_data, dict):
+                    assert "prompts" in self.test_data
+                    assert "images" in self.test_data
+                    assert "masks" in self.test_data
+                    data = dict(self.test_data)
+                data = pd.DataFrame.from_dict(data)
+        else:
+            data = pd.DataFrame.from_dict(prepare_default_data(self.num_samples))
+
+        prompts = data["prompts"]
+        images = data["images"]
+        masks = data["masks"]
+        output_images = []
+        rng = torch.Generator(device="cpu")
+
+        if not os.path.exists(image_dir):
+            os.makedirs(image_dir)
+
+        for i, (prompt, image, mask) in tqdm(enumerate(zip(prompts, images, masks)), desc="Evaluate pipeline"):
+            set_seed(self.seed)
+            rng = rng.manual_seed(self.seed)
+            output = generation_fn(
+                model,
+                prompt,
+                image=image,
+                mask=mask,
+                num_inference_steps=self.num_inference_steps,
+                generator=openvino_genai.TorchGenerator(self.seed) if self.is_genai else rng
+            )
+            image_path = os.path.join(image_dir, f"{i}.png")
+            output.save(image_path)
+            output_images.append(image_path)
+
+        res_data = {"prompts": list(prompts), "images": output_images}
+        df = pd.DataFrame(res_data)
+
+        return df
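Note: `default_gen_image_fn` in the new evaluator wraps the standard diffusers inpainting call (`image=`, `mask_image=`, `output_type="pil"`). A minimal standalone sketch of that call pattern, reusing one of the tiny checkpoints from the test matrix; the prompt, image, and mask here are synthetic placeholders:

```python
import torch
from diffusers import AutoPipelineForInpainting
from PIL import Image

# Tiny test checkpoint, as used in test_cli_image.py; safety checker disabled
# to mirror the loaders in model_loaders.py.
pipe = AutoPipelineForInpainting.from_pretrained(
    "hf-internal-testing/tiny-stable-diffusion-torch", safety_checker=None
)

image = Image.new("RGB", (64, 64), "gray")   # base image to edit
mask = Image.new("RGB", (64, 64), "white")   # white pixels are repainted

generator = torch.Generator(device="cpu").manual_seed(42)
result = pipe(
    "a red ball",
    image=image,
    mask_image=mask,
    num_inference_steps=4,
    output_type="pil",
    generator=generator,
).images[0]
result.save("0.png")
```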
diff --git a/tools/who_what_benchmark/whowhatbench/model_loaders.py b/tools/who_what_benchmark/whowhatbench/model_loaders.py
index f54d232bc2..8a00c70852 100644
--- a/tools/who_what_benchmark/whowhatbench/model_loaders.py
+++ b/tools/who_what_benchmark/whowhatbench/model_loaders.py
@@ -2,7 +2,7 @@
 import json
 
 from transformers import AutoConfig, AutoModelForCausalLM, AutoModel, AutoModelForVision2Seq
-from diffusers import DiffusionPipeline, AutoPipelineForImage2Image
+from diffusers import DiffusionPipeline, AutoPipelineForImage2Image, AutoPipelineForInpainting
 
 
 logging.basicConfig(level=logging.INFO)
@@ -107,7 +107,7 @@ def load_text2image_model(
 
     try:
         model = TEXT2IMAGEPipeline.from_pretrained(
-            model_id, trust_remote_code=True, device=device, ov_config=ov_config
+            model_id, trust_remote_code=True, device=device, ov_config=ov_config, safety_checker=None,
         )
     except ValueError:
         config = AutoConfig.from_pretrained(
@@ -119,6 +119,7 @@ def load_text2image_model(
             use_cache=True,
             device=device,
             ov_config=ov_config,
+            safety_checker=None,
         )
 
     return model
@@ -211,7 +212,7 @@ def load_imagetext2image_model(
         from optimum.intel.openvino import OVPipelineForImage2Image
         try:
             model = OVPipelineForImage2Image.from_pretrained(
-                model_id, trust_remote_code=True, device=device, ov_config=ov_config
+                model_id, trust_remote_code=True, device=device, ov_config=ov_config, safety_checker=None,
             )
         except ValueError:
             config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
@@ -222,6 +223,54 @@ def load_imagetext2image_model(
                 use_cache=True,
                 device=device,
                 ov_config=ov_config,
+                safety_checker=None,
             )
     return model
 
 
+def load_inpainting_genai_pipeline(model_dir, device="CPU", ov_config=None):
+    try:
+        import openvino_genai
+    except ImportError as e:
+        logger.error("Failed to import openvino_genai package. Please install it. Details:\n", e)
+        exit(-1)
+
+    return GenAIModelWrapper(
+        openvino_genai.InpaintingPipeline(model_dir, device, **ov_config),
+        model_dir,
+        "image-inpainting"
+    )
+
+
+def load_inpainting_model(
+    model_id, device="CPU", ov_config=None, use_hf=False, use_genai=False
+):
+    if use_hf:
+        logger.info("Using HF Transformers API")
+        model = AutoPipelineForInpainting.from_pretrained(
+            model_id, trust_remote_code=True
+        )
+    elif use_genai:
+        logger.info("Using OpenVINO GenAI API")
+        model = load_inpainting_genai_pipeline(model_id, device, ov_config)
+    else:
+        logger.info("Using Optimum API")
+        from optimum.intel.openvino import OVPipelineForInpainting
+        try:
+            model = OVPipelineForInpainting.from_pretrained(
+                model_id, trust_remote_code=True, device=device, ov_config=ov_config, safety_checker=None,
+            )
+        except ValueError as e:
+            logger.error("Failed to load inpainting pipeline. Details:\n", e)
+            config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
+            model = OVPipelineForInpainting.from_pretrained(
+                model_id,
+                config=config,
+                trust_remote_code=True,
+                use_cache=True,
+                device=device,
+                ov_config=ov_config,
+                safety_checker=None,
+            )
+    return model
@@ -248,5 +297,7 @@ def load_model(
         return load_visual_text_model(model_id, device, ov_options, use_hf, use_genai)
     elif model_type == "image-to-image":
         return load_imagetext2image_model(model_id, device, ov_options, use_hf, use_genai)
+    elif model_type == "image-inpainting":
+        return load_inpainting_model(model_id, device, ov_options, use_hf, use_genai)
     else:
         raise ValueError(f"Unsupported model type: {model_type}")
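Note: a hedged usage sketch of the three routes through the new `load_inpainting_model`; the local model directory and HF model id are placeholders, and only the signatures visible in this diff are assumed:

```python
from whowhatbench.model_loaders import load_inpainting_model

model_dir = "./stable-diffusion-v1-5-int8-ov"  # placeholder path to an exported model

# Optimum route (default): OVPipelineForInpainting with the safety checker disabled.
ov_model = load_inpainting_model(model_dir, device="CPU", ov_config={})

# GenAI route: wraps openvino_genai.InpaintingPipeline in GenAIModelWrapper.
# load_inpainting_genai_pipeline unpacks ov_config with **, so pass a dict
# (possibly empty) rather than None.
genai_model = load_inpainting_model(model_dir, device="CPU", ov_config={},
                                    use_genai=True)

# HF route: plain diffusers AutoPipelineForInpainting on the original model id.
hf_model = load_inpainting_model("some-org/some-inpainting-model",  # placeholder id
                                 use_hf=True)
```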
Details:\n", e) + config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) + model = OVPipelineForInpainting.from_pretrained( + model_id, + config=config, + trust_remote_code=True, + use_cache=True, + device=device, + ov_config=ov_config, + safety_checker=None, ) return model @@ -248,5 +297,7 @@ def load_model( return load_visual_text_model(model_id, device, ov_options, use_hf, use_genai) elif model_type == "image-to-image": return load_imagetext2image_model(model_id, device, ov_options, use_hf, use_genai) + elif model_type == "image-inpainting": + return load_inpainting_model(model_id, device, ov_options, use_hf, use_genai) else: raise ValueError(f"Unsupported model type: {model_type}") diff --git a/tools/who_what_benchmark/whowhatbench/wwb.py b/tools/who_what_benchmark/whowhatbench/wwb.py index 2ff8c45975..7acf3cf5aa 100644 --- a/tools/who_what_benchmark/whowhatbench/wwb.py +++ b/tools/who_what_benchmark/whowhatbench/wwb.py @@ -55,7 +55,7 @@ def parse_args(): parser.add_argument( "--model-type", type=str, - choices=["text", "text-to-image", "visual-text", "image-to-image"], + choices=["text", "text-to-image", "visual-text", "image-to-image", "image-inpainting"], default="text", help="Indicated the model type: 'text' - for causal text generation, 'text-to-image' - for image generation, " "visual-text - for Visual Language Models, image-to-image - for image generation based on image and prompt", @@ -282,6 +282,20 @@ def genai_gen_image2image(model, prompt, image, num_inference_steps, generator=N return image +def genai_gen_inpainting(model, prompt, image, mask, num_inference_steps, generator=None): + image_data = ov.Tensor(np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8)) + mask_data = ov.Tensor(np.array(mask.getdata()).reshape(1, mask.size[1], mask.size[0], 3).astype(np.uint8)) + image_tensor = model.generate( + prompt, + image=image_data, + mask_image=mask_data, + num_inference_steps=num_inference_steps, + generator=generator, + ) + image = Image.fromarray(image_tensor.data[0]) + return image + + def genai_gen_visual_text(model, prompt, image, processor, tokenizer, max_new_tokens, crop_question): image_data = ov.Tensor(np.array(image.getdata()).reshape(1, image.size[1], image.size[0], 3).astype(np.uint8)) config = model.get_generation_config() @@ -355,6 +369,17 @@ def create_evaluator(base_model, args): is_genai=args.genai, seed=args.seed, ) + elif task == "image-inpainting": + return EvaluatorCLS( + base_model=base_model, + gt_data=args.gt_data, + test_data=prompts, + num_samples=args.num_samples, + num_inference_steps=args.num_inference_steps, + gen_image_fn=genai_gen_inpainting if args.genai else None, + is_genai=args.genai, + seed=args.seed, + ) else: raise ValueError(f"Unsupported task: {task}")