
Commit

diffusers_0.31.0_base_dev
Cui-yshoho committed Nov 27, 2024
1 parent 0ad4991 commit 04c7b17
Showing 90 changed files with 3,002 additions and 798 deletions.
90 changes: 58 additions & 32 deletions docs/diffusers/_toctree.yml
@@ -157,8 +157,53 @@
title: Loaders
- isExpanded: False
sections:
- local: api/models/overview
title: Overview
- local: api/models/overview
title: Overview
- sections:
- local: api/models/controlnet
title: ControlNetModel
- local: api/models/controlnet_flux
title: FluxControlNetModel
- local: api/models/controlnet_hunyuandit
title: HunyuanDiT2DControlNetModel
- local: api/models/controlnet_sd3
title: SD3ControlNetModel
- local: api/models/controlnet_sparsectrl
title: SparseControlNetModel
title: ControlNets
- sections:
- local: api/models/aura_flow_transformer2d
title: AuraFlowTransformer2DModel
- local: api/models/cogvideox_transformer3d
title: CogVideoXTransformer3DModel
- local: api/models/cogview3plus_transformer2d
title: CogView3PlusTransformer2DModel
- local: api/models/dit_transformer2d
title: DiTTransformer2DModel
- local: api/models/flux_transformer
title: FluxTransformer2DModel
- local: api/models/hunyuan_transformer2d
title: HunyuanDiT2DModel
- local: api/models/latte_transformer3d
title: LatteTransformer3DModel
- local: api/models/lumina_nextdit2d
title: LuminaNextDiT2DModel
- local: api/models/pixart_transformer2d
title: PixArtTransformer2DModel
- local: api/models/prior_transformer
title: PriorTransformer
- local: api/models/sd3_transformer2d
title: SD3Transformer2DModel
- local: api/models/stable_audio_transformer
title: StableAudioDiTModel
- local: api/models/transformer2d
title: Transformer2DModel
- local: api/models/transformer_temporal
title: TransformerTemporalModel
title: Transformers
- sections:
- local: api/models/stable_cascade_unet
title: StableCascadeUNet
- local: api/models/unet
title: UNet1DModel
- local: api/models/unet2d
@@ -171,42 +216,23 @@
title: UNetMotionModel
- local: api/models/uvit2d
title: UViT2DModel
- local: api/models/vq
title: UVQModel
title: UNets
- sections:
- local: api/models/autoencoderkl
title: AutoEncoderKL
title: AutoencoderKL
- local: api/models/autoencoderkl_cogvideox
title: AutoencoderKLCogVideoX
- local: api/models/asymmetricautoencoderkl
title: AsymmetricAutoEncoderKL
- local: api/models/stable_cascade_unet
title: StableCascadeUNet
title: AsymmetricAutoencoderKL
- local: api/models/consistency_decoder_vae
title: ConsistencyDecoderVAE
- local: api/models/autoencoder_oobleck
title: Oobleck AutoEncoder
- local: api/models/autoencoder_tiny
title: Tiny AutoEncoder
- local: api/models/consistency_decoder_vae
title: ConsistencyDecoderVae
- local: api/models/transformer2d
title: Transformer2DModel
- local: api/models/pixart_transformer2d
title: PixArtTransformer2DModel
- local: api/models/dit_transformer2d
title: DiTTransformer2DModel
- local: api/models/hunyuan_transformer2d
title: HunyuanDiT2DModel
- local: api/models/flux_transformer
title: FluxTransformer2DModel
- local: api/models/cogvideox_transformer3d
title: CogVideoXTransformer3DModel
- local: api/models/transformer_temporal
title: TransformerTemporalModel
- local: api/models/sd3_transformer2d
title: SD3Transformer2DModel
- local: api/models/prior_transformer
title: PriorTransformer
- local: api/models/controlnet
title: ControlNetModel
- local: api/models/controlnet_sd3
title: SD3ControlNetModel
- local: api/models/vq
title: VQModel
title: VAEs
title: Models
- isExpanded: False
sections:
2 changes: 1 addition & 1 deletion docs/diffusers/api/models/controlnet.md
@@ -29,7 +29,7 @@ from mindone.diffusers import StableDiffusionControlNetPipeline, ControlNetModel
url = "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth" # can also be a local path
controlnet = ControlNetModel.from_single_file(url)

url = "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned.safetensors" # can also be a local path
url = "https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/main/v1-5-pruned.safetensors" # can also be a local path
pipe = StableDiffusionControlNetPipeline.from_single_file(url, controlnet=controlnet)
```
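
A minimal continuation of this snippet might look like the following sketch; the prompt and the `canny_edges.png` path are placeholder assumptions, and the `[0][0]` indexing reflects that mindone.diffusers pipelines return tuples:

```python
from PIL import Image

# A precomputed Canny edge map of the input image (placeholder path)
canny_image = Image.open("canny_edges.png")

image = pipe(
    "a futuristic city street at night, highly detailed",
    image=canny_image,
    num_inference_steps=30,
)[0][0]
image.save("controlnet_canny.png")
```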

12 changes: 12 additions & 0 deletions docs/diffusers/api/pipelines/overview.md
@@ -29,9 +29,13 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
| Pipeline | Tasks |
|----------------------------------------------------------------|---|
| [AnimateDiff](animatediff.md) | text2video |
| [AuraFlow](auraflow) | text2image |
| [BLIP Diffusion](blip_diffusion.md) | text2image |
| [CogVideoX](cogvideox) | text2video |
| [Consistency Models](consistency_models.md) | unconditional image generation |
| [ControlNet](controlnet.md) | text2image, image2image, inpainting |
| [ControlNet with Flux.1](controlnet_flux) | text2image |
| [ControlNet with Hunyuan-DiT](controlnet_hunyuandit) | text2image |
| [ControlNet with Stable Diffusion 3](controlnet_sd3.md) | text2image |
| [ControlNet with Stable Diffusion XL](controlnet_sdxl.md) | text2image |
| [ControlNet-XS](controlnetxs.md) | text2image |
@@ -42,19 +46,27 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
| [DeepFloyd IF](deepfloyd_if.md) | text2image, image2image, inpainting, super-resolution |
| [DiffEdit](diffedit.md) | inpainting |
| [DiT](dit.md) | text2image |
| [Flux](flux) | text2image |
| [Hunyuan-DiT](hunyuandit.md) | text2image |
| [I2VGen-XL](i2vgenxl.md) | text2video |
| [InstructPix2Pix](pix2pix.md) | image editing |
| [Kandinsky 2.1](kandinsky.md) | text2image, image2image, inpainting, interpolation |
| [Kandinsky 2.2](kandinsky_v22.md) | text2image, image2image, inpainting |
| [Kandinsky 3](kandinsky3.md) | text2image, image2image |
| [Kolors](kolors) | text2image |
| [Latent Consistency Models](latent_consistency_models.md) | text2image |
| [Latent Diffusion](latent_diffusion.md) | text2image, super-resolution |
| [Latte](latte) | text2image |
| [Lumina-T2X](lumina) | text2image |
| [Marigold](marigold.md) | depth |
| [PAG](pag) | text2image |
| [PixArt-α](pixart.md) | text2image |
| [PixArt-Σ](pixart_sigma.md) | text2image |
| [Shap-E](shap_e.md) | text-to-3D, image-to-3D |
| [Stable Cascade](stable_cascade.md) | text2image |
| [Stable Diffusion](stable_diffusion/overview) | text2image, image2image, depth2image, inpainting, image variation, latent upscaler, super-resolution |
| [Stable Diffusion XL](stable_diffusion/stable_diffusion_xl) | text2image, image2image, inpainting |
| [T2I-Adapter](stable_diffusion/adapter) | text2image |
| [unCLIP](unclip.md) | text2image, image variation |
| [Wuerstchen](wuerstchen.md) | text2image |
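
Loading any entry in this table follows the same `from_pretrained` pattern; as a hedged sketch, here is the newly listed Flux pipeline (assuming the mindone port mirrors the upstream diffusers API and using the `black-forest-labs/FLUX.1-schnell` checkpoint):

```python
import mindspore as ms
from mindone.diffusers import DiffusionPipeline

# FLUX.1-schnell is a timestep-distilled model, so a few steps and no CFG suffice
pipe = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-schnell", mindspore_dtype=ms.bfloat16
)
image = pipe(
    "a tiny astronaut hatching from an egg on the moon",
    guidance_scale=0.0,
    num_inference_steps=4,
    max_sequence_length=256,
)[0][0]
image.save("flux_schnell.png")
```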

8 changes: 8 additions & 0 deletions docs/diffusers/api/schedulers/overview.md
@@ -44,6 +44,14 @@ Many schedulers are implemented from the [k-diffusion](https://github.com/crowso
| LMS Karras | [`LMSDiscreteScheduler`](https://mindspore-lab.github.io/mindone/latest/diffusers/api/schedulers/lms_discrete/#mindone.diffusers.LMSDiscreteScheduler) | init with `use_karras_sigmas=True` |
| N/A | [`DEISMultistepScheduler`](https://mindspore-lab.github.io/mindone/latest/diffusers/api/schedulers/deis/#mindone.diffusers.schedulers.scheduling_deis_multistep.DEISMultistepScheduler) | |
| N/A | [`UniPCMultistepScheduler`](https://mindspore-lab.github.io/mindone/latest/diffusers/api/schedulers/unipc/#mindone.diffusers.UniPCMultistepScheduler) | |
## Noise schedules and schedule types
| A1111/k-diffusion | 🤗 Diffusers |
|--------------------------|----------------------------------------------------------------------------|
| Karras | init with `use_karras_sigmas=True` |
| sgm_uniform | init with `timestep_spacing="trailing"` |
| simple | init with `timestep_spacing="trailing"` |
| exponential | init with `timestep_spacing="linspace"`, `use_exponential_sigmas=True` |
| beta | init with `timestep_spacing="linspace"`, `use_beta_sigmas=True` |
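
As an illustration, these flags are passed when (re)creating a scheduler from a pipeline's existing config; a rough sketch of switching to a "DPM++ 2M Karras"-style schedule (the checkpoint and scheduler choice are placeholders, assuming the mindone API mirrors upstream diffusers):

```python
from mindone.diffusers import DiffusionPipeline, DPMSolverMultistepScheduler

pipe = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")

# "Karras" row: reuse the existing scheduler config and enable Karras sigmas
pipe.scheduler = DPMSolverMultistepScheduler.from_config(
    pipe.scheduler.config, use_karras_sigmas=True
)
# The "sgm_uniform"/"simple" rows would instead pass timestep_spacing="trailing",
# and the "exponential"/"beta" rows add use_exponential_sigmas=True / use_beta_sigmas=True.
```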

All schedulers are built from the base [`SchedulerMixin`](https://mindspore-lab.github.io/mindone/latest/diffusers/api/schedulers/overview/#mindone.diffusers.SchedulerMixin) class, which implements low-level utilities shared by all schedulers.

2 changes: 2 additions & 0 deletions docs/diffusers/optimization/fp16.md
@@ -123,3 +123,5 @@ make_image_grid(images, rows=2, cols=2)
<figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion + Tiny AutoEncoder</figcaption>
</div>
</div>

More tiny autoencoder models for other Stable Diffusion models, like Stable Diffusion 3, are available from [madebyollin](https://huggingface.co/madebyollin).
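
For reference, a minimal sketch of swapping one of these tiny autoencoders into a pipeline (assuming `AutoencoderTiny` is exposed by the mindone port; `madebyollin/taesd` matches SD 1.x latents):

```python
import mindspore as ms
from mindone.diffusers import AutoencoderTiny, StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5", mindspore_dtype=ms.float16
)
# Replace the full VAE with the tiny autoencoder to speed up latent decoding
pipe.vae = AutoencoderTiny.from_pretrained("madebyollin/taesd", mindspore_dtype=ms.float16)

image = pipe("a cozy cabin in a snowy forest", num_inference_steps=25)[0][0]
```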
4 changes: 2 additions & 2 deletions docs/diffusers/stable_diffusion.md
@@ -30,7 +30,7 @@ pipeline = DiffusionPipeline.from_pretrained(model_id, use_safetensors=True)
The example prompt you'll use is a portrait of an old warrior chief, but feel free to use your own prompt:

```python
prompt = "portrait photo of a old warrior chief"
prompt = "portrait photo of an old warrior chief"
```

## Speed
@@ -222,7 +222,7 @@ Pretty impressive! Let's tweak the second image - corresponding to the `Generato
```python
prompts = [
"portrait photo of the oldest warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta",
"portrait photo of a old warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta",
"portrait photo of an old warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta",
"portrait photo of a warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta",
"portrait photo of a young warrior chief, tribal panther make up, blue on red, side profile, looking away, serious eyes 50mm portrait photography, hard rim lighting photography--beta --ar 2:3 --beta --upbeta",
]
5 changes: 2 additions & 3 deletions docs/diffusers/using-diffusers/callback.md
@@ -108,7 +108,7 @@ def latents_to_rgb(latents):
weights = (
(60, -60, 25, -70),
(60, -5, 15, -50),
(60, 10, -5, -35)
(60, 10, -5, -35),
)

def einsum(tensor1, tensor2):
@@ -120,8 +120,7 @@ def latents_to_rgb(latents):
weights_tensor = ops.t(ms.Tensor(weights, dtype=latents.dtype))
biases_tensor = ms.Tensor((150, 140, 130), dtype=latents.dtype)
rgb_tensor = einsum(latents, weights_tensor) + biases_tensor.unsqueeze(-1).unsqueeze(-1)
image_array = rgb_tensor.clamp(0, 255)[0].to(ms.uint8).asnumpy()
image_array = image_array.transpose(1, 2, 0)
image_array = rgb_tensor.clamp(0, 255)[0].to(ms.uint8).asnumpy().transpose(1, 2, 0)

return Image.fromarray(image_array)
```
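
To show where `latents_to_rgb` plugs in, here is a rough sketch of a step-end callback (assuming the mindone pipelines expose the same `callback_on_step_end` hook as upstream diffusers; `pipeline` is the pipeline created earlier in this guide):

```python
def decode_tensors(pipe, step, timestep, callback_kwargs):
    # Convert the current latents to a coarse RGB preview and save one image per step
    latents = callback_kwargs["latents"]
    latents_to_rgb(latents).save(f"step_{step}.png")
    return callback_kwargs

image = pipeline(
    prompt="A croissant shaped like a cute bear",
    callback_on_step_end=decode_tensors,
    callback_on_step_end_tensor_inputs=["latents"],
)[0][0]
```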
6 changes: 3 additions & 3 deletions examples/diffusers/controlnet/README.md
@@ -26,7 +26,7 @@ pip install -r requirements.txt

The original dataset is hosted in the [ControlNet repo](https://huggingface.co/lllyasviel/ControlNet/blob/main/training/fill50k.zip). We re-uploaded it to be compatible with `datasets` [here](https://huggingface.co/datasets/fusing/fill50k). Note that `datasets` handles dataloading within the training script.

Our training examples use [Stable Diffusion 1.5](https://huggingface.co/runwayml/stable-diffusion-v1-5) because the original set of ControlNet models was trained from it. However, ControlNet can be trained to augment any Stable Diffusion-compatible model, such as [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) or [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1).
Our training examples use [Stable Diffusion 1.5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) because the original set of ControlNet models was trained from it. However, ControlNet can be trained to augment any Stable Diffusion-compatible model, such as [CompVis/stable-diffusion-v1-4](https://huggingface.co/CompVis/stable-diffusion-v1-4) or [stabilityai/stable-diffusion-2-1](https://huggingface.co/stabilityai/stable-diffusion-2-1).

## Training

@@ -39,7 +39,7 @@ wget https://huggingface.co/datasets/huggingface/documentation-images/resolve/ma
```

```bash
export MODEL_DIR="runwayml/stable-diffusion-v1-5"
export MODEL_DIR="stable-diffusion-v1-5/stable-diffusion-v1-5"
export OUTPUT_DIR="path to save model"

python train_controlnet.py \
@@ -56,7 +56,7 @@ python train_controlnet.py \
Gradient accumulation with a smaller batch size can be used to reduce training requirements:

```bash
export MODEL_DIR="runwayml/stable-diffusion-v1-5"
export MODEL_DIR="stable-diffusion-v1-5/stable-diffusion-v1-5"
export OUTPUT_DIR="path to save model"

python train_controlnet.py \
4 changes: 3 additions & 1 deletion examples/diffusers/controlnet/train_controlnet.py
@@ -1013,7 +1013,9 @@ def forward(self, pixel_values, conditioning_pixel_values, input_ids):

# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps)
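# Presumably cast to float32 for the noise addition to avoid fp16/bf16 precision loss,
# then cast the result back to the training weight dtype.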
noisy_latents = self.noise_scheduler.add_noise(latents.float(), noise.float(), timesteps).to(
dtype=self.weight_dtype
)

# Get the text embedding for conditioning
encoder_hidden_states = self.text_encoder(input_ids, return_dict=False)[0]
4 changes: 3 additions & 1 deletion examples/diffusers/controlnet/train_controlnet_sdxl.py
@@ -1123,7 +1123,9 @@ def forward(self, pixel_values, conditioning_pixel_values, prompt_ids, add_text_

# Add noise to the latents according to the noise magnitude at each timestep
# (this is the forward diffusion process)
noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps)
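# As in train_controlnet.py: do the noise addition in float32, then cast back to the
# weight dtype (presumably to avoid precision loss under mixed-precision training).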
noisy_latents = self.noise_scheduler.add_noise(latents.float(), noise.float(), timesteps).to(
dtype=self.weight_dtype
)

prompt_ids = prompt_ids.to(self.weight_dtype)
unet_added_conditions = {
4 changes: 2 additions & 2 deletions examples/diffusers/dreambooth/README.md
@@ -175,7 +175,7 @@ Let's get started with a simple example. We will re-use the dog example of the [
First, you need to set up your dreambooth training example as explained in the [installation section](#Installing-the-dependencies).
Next, let's download the dog dataset. Download images from [here](https://drive.google.com/drive/folders/1BO_dyz-p65qhBRRMRA4TbZ8qW4rB99JZ) and save them in a directory. Make sure to set `INSTANCE_DIR` to the name of your directory further below. This will be our training data.

Now, you can launch the training. Here we will use [Stable Diffusion 1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5).
Now, you can launch the training. Here we will use [Stable Diffusion 1-5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5).

**___Note: Change the `resolution` to 768 if you are using the [stable-diffusion-2](https://huggingface.co/stabilityai/stable-diffusion-2) 768x768 model.___**

@@ -184,7 +184,7 @@ Now, you can launch the training. Here we will use [Stable Diffusion 1-5](https:
Now we can start training!

```bash
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5"
export INSTANCE_DIR="dog"
export OUTPUT_DIR="path-to-save-model"

2 changes: 1 addition & 1 deletion examples/diffusers/dreambooth/train_dreambooth_lora.py
@@ -84,7 +84,7 @@ def log_validation(
else:
for image in args.validation_images:
image = Image.open(image)
image = pipeline(**pipeline_args, image=image, generator=generator).images[0]
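# mindone.diffusers pipelines appear to return plain tuples rather than output
# dataclasses, so the first generated image is indexed as [0][0].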
image = pipeline(**pipeline_args, image=image, generator=generator)[0][0]
images.append(image)

phase_name = "test" if is_final_validation else "validation"
5 changes: 4 additions & 1 deletion examples/diffusers/textual_inversion/README.md
@@ -51,7 +51,7 @@ Now we can launch the training using:
**___Note: Please follow the [README_sdxl.md](./README_sdxl.md) if you are using the [stable-diffusion-xl](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0).___**

```bash
export MODEL_NAME="runwayml/stable-diffusion-v1-5"
export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5"
export DATA_DIR="./cat"

python textual_inversion.py \
@@ -95,6 +95,9 @@ from mindone.diffusers import StableDiffusionPipeline
model_id = "path-to-your-trained-model"
pipe = StableDiffusionPipeline.from_pretrained(model_id, mindspore_dtype=ms.float16)

repo_id_embeds = "path-to-your-learned-embeds"
pipe.load_textual_inversion(repo_id_embeds)

prompt = "A <cat-toy> backpack"

image = pipe(prompt, num_inference_steps=50, guidance_scale=7.5)[0][0]
2 changes: 1 addition & 1 deletion mindone/diffusers/__init__.py
@@ -1,4 +1,4 @@
__version__ = "0.30.3"
__version__ = "0.31.0"

from typing import TYPE_CHECKING
