diff --git a/src/diffusers/pipelines/pipeline_utils.py b/src/diffusers/pipelines/pipeline_utils.py index 6698d3ed2055..bad23a60293f 100644 --- a/src/diffusers/pipelines/pipeline_utils.py +++ b/src/diffusers/pipelines/pipeline_utils.py @@ -1360,7 +1360,7 @@ def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[t ) # _offload_gpu_id should be set to passed gpu_id (or id in passed `device`) or default to previously set id or default to 0 - self._offload_gpu_id = gpu_id or torch_device.index or self._offload_gpu_id or 0 + self._offload_gpu_id = gpu_id or torch_device.index or getattr(self, "_offload_gpu_id", 0) device_type = torch_device.type device = torch.device(f"{device_type}:{self._offload_gpu_id}") @@ -1445,7 +1445,7 @@ def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Un ) # _offload_gpu_id should be set to passed gpu_id (or id in passed `device`) or default to previously set id or default to 0 - self._offload_gpu_id = gpu_id or torch_device.index or self._offload_gpu_id or 0 + self._offload_gpu_id = gpu_id or torch_device.index or getattr(self, "_offload_gpu_id", 0) device_type = torch_device.type device = torch.device(f"{device_type}:{self._offload_gpu_id}") diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py index c4cfaee9cf31..abcb5e7ead7a 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_diffedit.py @@ -37,6 +37,7 @@ floats_tensor, load_image, nightly, + numpy_cosine_similarity_distance, require_torch_gpu, slow, torch_device, @@ -303,8 +304,7 @@ def setUpClass(cls): raw_image = load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/diffedit/fruit.png" ) - - raw_image = raw_image.convert("RGB").resize((768, 768)) + raw_image = raw_image.convert("RGB").resize((256, 256)) cls.raw_image = raw_image @@ -312,9 +312,11 @@ def test_stable_diffusion_diffedit_full(self): generator = torch.manual_seed(0) pipe = StableDiffusionDiffEditPipeline.from_pretrained( - "stabilityai/stable-diffusion-2-1", safety_checker=None, torch_dtype=torch.float16 + "stabilityai/stable-diffusion-2-1-base", safety_checker=None, torch_dtype=torch.float16 ) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) + pipe.scheduler.clip_sample = True + pipe.inverse_scheduler = DDIMInverseScheduler.from_config(pipe.scheduler.config) pipe.enable_model_cpu_offload() pipe.set_progress_bar_config(disable=None) @@ -330,7 +332,11 @@ def test_stable_diffusion_diffedit_full(self): ) inv_latents = pipe.invert( - prompt=source_prompt, image=self.raw_image, inpaint_strength=0.7, generator=generator + prompt=source_prompt, + image=self.raw_image, + inpaint_strength=0.7, + generator=generator, + num_inference_steps=5, ).latents image = pipe( @@ -340,7 +346,8 @@ def test_stable_diffusion_diffedit_full(self): generator=generator, negative_prompt=source_prompt, inpaint_strength=0.7, - output_type="numpy", + num_inference_steps=5, + output_type="np", ).images[0] expected_image = ( @@ -348,11 +355,12 @@ def test_stable_diffusion_diffedit_full(self): load_image( "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main" "/diffedit/pears.png" - ).resize((768, 768)) + ).resize((256, 256)) ) / 255 ) - assert np.abs((expected_image - image).max()) < 5e-1 + + assert numpy_cosine_similarity_distance(expected_image.flatten(), image.flatten()) < 2e-1 @nightly diff --git a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py index 52bf5370f211..4d6bd85d981a 100644 --- a/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py +++ b/tests/pipelines/stable_diffusion_2/test_stable_diffusion_v_pred.py @@ -405,13 +405,20 @@ def test_stable_diffusion_text2img_pipeline_unflawed(self): pipe.scheduler.config, timestep_spacing="trailing", rescale_betas_zero_snr=True ) pipe.to(torch_device) - pipe.enable_attention_slicing() + pipe.enable_model_cpu_offload() pipe.set_progress_bar_config(disable=None) prompt = "A lion in galaxies, spirals, nebulae, stars, smoke, iridescent, intricate detail, octane render, 8k" generator = torch.Generator("cpu").manual_seed(0) - output = pipe(prompt=prompt, guidance_scale=7.5, guidance_rescale=0.7, generator=generator, output_type="np") + output = pipe( + prompt=prompt, + guidance_scale=7.5, + num_inference_steps=10, + guidance_rescale=0.7, + generator=generator, + output_type="np", + ) image = output.images[0] assert image.shape == (768, 768, 3) @@ -443,7 +450,7 @@ def test_download_local(self): pipe = StableDiffusionPipeline.from_single_file(filename, torch_dtype=torch.float16) pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) - pipe.to("cuda") + pipe.enable_model_cpu_offload() image_out = pipe("test", num_inference_steps=1, output_type="np").images[0] @@ -460,7 +467,7 @@ def test_download_ckpt_diff_format_is_same(self): pipe_single.enable_model_cpu_offload() generator = torch.Generator(device="cpu").manual_seed(0) - image_ckpt = pipe_single("a turtle", num_inference_steps=5, generator=generator, output_type="np").images[0] + image_ckpt = pipe_single("a turtle", num_inference_steps=2, generator=generator, output_type="np").images[0] pipe = StableDiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1") pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config) @@ -468,7 +475,7 @@ def test_download_ckpt_diff_format_is_same(self): pipe.enable_model_cpu_offload() generator = torch.Generator(device="cpu").manual_seed(0) - image = pipe("a turtle", num_inference_steps=5, generator=generator, output_type="np").images[0] + image = pipe("a turtle", num_inference_steps=2, generator=generator, output_type="np").images[0] max_diff = numpy_cosine_similarity_distance(image.flatten(), image_ckpt.flatten()) assert max_diff < 1e-3 diff --git a/tests/pipelines/text_to_video/test_video_to_video.py b/tests/pipelines/text_to_video/test_video_to_video.py index 397d65cae52f..c50565280622 100644 --- a/tests/pipelines/text_to_video/test_video_to_video.py +++ b/tests/pipelines/text_to_video/test_video_to_video.py @@ -31,6 +31,7 @@ enable_full_determinism, floats_tensor, is_flaky, + numpy_cosine_similarity_distance, skip_mps, slow, torch_device, @@ -198,17 +199,18 @@ def test_progress_bar(self): @skip_mps class VideoToVideoSDPipelineSlowTests(unittest.TestCase): def test_two_step_model(self): - pipe = VideoToVideoSDPipeline.from_pretrained("cerspense/zeroscope_v2_XL", torch_dtype=torch.float16) + pipe = VideoToVideoSDPipeline.from_pretrained("cerspense/zeroscope_v2_576w", torch_dtype=torch.float16) pipe.enable_model_cpu_offload() # 10 frames generator = torch.Generator(device="cpu").manual_seed(0) - video = torch.randn((1, 10, 3, 1024, 576), generator=generator) - video = video.to("cuda") + video = torch.randn((1, 10, 3, 320, 576), generator=generator) prompt = "Spiderman is surfing" video_frames = pipe(prompt, video=video, generator=generator, num_inference_steps=3, output_type="pt").frames - expected_array = np.array([-1.0458984, -1.1279297, -0.9663086, -0.91503906, -0.75097656]) - assert np.abs(video_frames.cpu().numpy()[0, 0, 0, 0, -5:] - expected_array).sum() < 1e-2 + expected_array = np.array([-0.9770508, -0.8027344, -0.62646484, -0.8334961, -0.7573242]) + output_array = video_frames.cpu().numpy()[0, 0, 0, 0, -5:] + + assert numpy_cosine_similarity_distance(expected_array, output_array) < 1e-2