FLUX LoRA Support #6847

Merged · 49 commits · Sep 18, 2024

Commits (49)
7a80d9e
Add state_dict keys for two FLUX LoRA formats to be used in unit tests.
RyanJDick Sep 3, 2024
c41bd59
WIP - Initial logic for kohya FLUX LoRA conversion.
RyanJDick Sep 3, 2024
ade75b4
Get convert_flux_kohya_state_dict_to_invoke_format(...) working, with…
RyanJDick Sep 4, 2024
fc380f0
Start moving SDXL-specific LoRA conversions out of the general-purpos…
RyanJDick Sep 4, 2024
d0d91ea
Fix type errors in sdxl_lora_conversion_utils.py
RyanJDick Sep 4, 2024
8518ae9
Remove unused LoRAModelRaw.name attribute.
RyanJDick Sep 4, 2024
04b37e6
Move the responsibilities of 1) state_dict loading from file, and 2) …
RyanJDick Sep 4, 2024
7b5befa
Update convert_flux_kohya_state_dict_to_invoke_format() to raise an e…
RyanJDick Sep 4, 2024
00e5686
Add utility function for detecting whether a state_dict is in the FLU…
RyanJDick Sep 4, 2024
db61ec4
Get probing of FLUX LoRA kohya models working.
RyanJDick Sep 4, 2024
01a15b4
WIP - add invocations to support FLUX LORAs.
RyanJDick Sep 4, 2024
50c9410
WIP
RyanJDick Sep 4, 2024
92b8477
Fixup FLUX LoRA unit tests.
RyanJDick Sep 5, 2024
cf9f30c
Rename flux_kohya_lora_conversion_utils.py
RyanJDick Sep 5, 2024
dc09171
WIP on supporting diffusers format FLUX LoRAs.
RyanJDick Sep 5, 2024
bb528d9
Add ConcatenateLoRALayer class.
RyanJDick Sep 9, 2024
bb917ae
(minor) Rename test file.
RyanJDick Sep 9, 2024
040cc28
First draft of lora_model_from_flux_diffusers_state_dict(...).
RyanJDick Sep 9, 2024
534e938
Add unit test for lora_model_from_flux_diffusers_state_dict(...).
RyanJDick Sep 9, 2024
31a8757
Add is_state_dict_likely_in_flux_diffusers_format(...) function with …
RyanJDick Sep 9, 2024
42d6dd3
Add utility test function for creating a dummy state_dict.
RyanJDick Sep 9, 2024
5800e60
Add model probe support for FLUX LoRA models in Diffusers format.
RyanJDick Sep 9, 2024
552a5b0
Add a check that all keys are handled in the FLUX Diffusers LoRA load…
RyanJDick Sep 9, 2024
aac97e1
General cleanup/documentation.
RyanJDick Sep 9, 2024
ddda60c
Rename peft/ -> lora/
RyanJDick Sep 10, 2024
ee5d8f6
lora_layer_from_state_dict(...) -> any_lora_layer_from_state_dict(...)
RyanJDick Sep 10, 2024
fef26a5
Consolidate all LoRA patching logic in the LoRAPatcher.
RyanJDick Sep 10, 2024
705173b
Remove unused layer_key property from LoRALayerBase.
RyanJDick Sep 10, 2024
2ff4dae
Add util functions calc_tensor_size(...) and calc_tensors_size(...).
RyanJDick Sep 10, 2024
049ce18
WIP - adding LoRA sidecar layers
RyanJDick Sep 10, 2024
3e12ac9
WIP - LoRA sidecar layers.
RyanJDick Sep 11, 2024
f5f8944
Bug fixes to get LoRA sidecar patching working for the first time.
RyanJDick Sep 11, 2024
45bc8fc
WIP - Implement sidecar LoRA layers using functional API.
RyanJDick Sep 11, 2024
10c3c61
Get diffusers FLUX LoRA working as sidecar patch on quantized model.
RyanJDick Sep 11, 2024
81fbaf2
Assume LoRA alpha=8 for FLUX diffusers PEFT LoRAs.
RyanJDick Sep 12, 2024
9438ea6
Update all lycoris layer types to use the new torch.nn.Module base cl…
RyanJDick Sep 12, 2024
5bb0c79
Add links to test models for loha, lokr, ia3.
RyanJDick Sep 12, 2024
7ce41bf
Fixup unit tests.
RyanJDick Sep 12, 2024
ae41651
Remove LoRA conv sidecar layers until they are needed and properly te…
RyanJDick Sep 12, 2024
61d3d56
Minor cleanup and documentation updates.
RyanJDick Sep 13, 2024
ba3ba3c
Add unit tests for LoRALinearSidecarLayer and ConcatenatedLoRALinearS…
RyanJDick Sep 13, 2024
02f27c7
Add unit tests for LoRAPatcher.apply_lora_sidecar_patches(...) and fi…
RyanJDick Sep 13, 2024
9466824
Delete duplicate file that was accidentally kept during rebase.
RyanJDick Sep 13, 2024
b1cf5e9
Replace 'torch.device("meta")' with 'accelerate.init_empty_weights()'…
RyanJDick Sep 13, 2024
78efed4
Revert change of make all LoRA layers torch.nn.Module's. While the co…
RyanJDick Sep 13, 2024
d51f2c5
Add bias to LoRA sidecar layer unit tests.
RyanJDick Sep 13, 2024
e88d3cf
Assume alpha=rank for FLUX diffusers PEFT LoRA models.
RyanJDick Sep 16, 2024
2934e31
Fix bug when applying multiple LoRA models via apply_lora_sidecar_pat…
RyanJDick Sep 16, 2024
3d6f60f
Merge branch 'main' into ryan/flux-lora-quantized
RyanJDick Sep 18, 2024
12 changes: 7 additions & 5 deletions invokeai/app/invocations/compel.py
@@ -20,6 +20,7 @@
 from invokeai.app.services.shared.invocation_context import InvocationContext
 from invokeai.app.util.ti_utils import generate_ti_list
 from invokeai.backend.lora.lora_model_raw import LoRAModelRaw
+from invokeai.backend.lora.lora_patcher import LoRAPatcher
 from invokeai.backend.model_patcher import ModelPatcher
 from invokeai.backend.stable_diffusion.diffusion.conditioning_data import (
     BasicConditioningInfo,
@@ -81,9 +82,10 @@ def _lora_loader() -> Iterator[Tuple[LoRAModelRaw, float]]:
             # apply all patches while the model is on the target device
             text_encoder_info.model_on_device() as (cached_weights, text_encoder),
             tokenizer_info as tokenizer,
-            ModelPatcher.apply_lora_text_encoder(
-                text_encoder,
-                loras=_lora_loader(),
+            LoRAPatcher.apply_lora_patches(
+                model=text_encoder,
+                patches=_lora_loader(),
+                prefix="lora_te_",
                 cached_weights=cached_weights,
             ),
             # Apply CLIP Skip after LoRA to prevent LoRA application from failing on skipped layers.
@@ -176,9 +178,9 @@ def _lora_loader() -> Iterator[Tuple[LoRAModelRaw, float]]:
             # apply all patches while the model is on the target device
             text_encoder_info.model_on_device() as (cached_weights, text_encoder),
             tokenizer_info as tokenizer,
-            ModelPatcher.apply_lora(
+            LoRAPatcher.apply_lora_patches(
                 text_encoder,
-                loras=_lora_loader(),
+                patches=_lora_loader(),
                 prefix=lora_prefix,
                 cached_weights=cached_weights,
             ),
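Conceptually, LoRAPatcher.apply_lora_patches merges each LoRA delta directly into a module's weights for the duration of inference and restores the cached originals on exit. A minimal sketch of that direct-patching idea, assuming the standard LoRA update delta = (alpha / rank) * weight * (up @ down); the helper name and signature below are illustrative, not InvokeAI's actual API:

```python
import contextlib

import torch


@contextlib.contextmanager
def patch_linear_with_lora(linear: torch.nn.Linear, down: torch.Tensor, up: torch.Tensor,
                           alpha: float, weight: float):
    """Merge a LoRA delta into a Linear layer's weight; restore the cached weight on exit."""
    rank = down.shape[0]
    cached_weight = linear.weight.data.clone()  # analogous to cached_weights above
    try:
        with torch.no_grad():
            linear.weight += (alpha / rank) * weight * (up @ down)
        yield linear
    finally:
        linear.weight.data = cached_weight  # unpatch


# Toy usage: a rank-4 LoRA on an 8x8 layer, applied at weight 0.75.
layer = torch.nn.Linear(8, 8)
down = torch.randn(4, 8)
up = torch.randn(8, 4)
with patch_linear_with_lora(layer, down, up, alpha=4.0, weight=0.75):
    _ = layer(torch.randn(1, 8))
```

Because the weights are edited in place, patched inference runs at full speed; the trade-off is that the weights must be directly editable, which is why the quantized case in flux_denoise.py below takes a different route.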
8 changes: 5 additions & 3 deletions invokeai/app/invocations/denoise_latents.py
@@ -37,6 +37,7 @@
 from invokeai.app.util.controlnet_utils import prepare_control_image
 from invokeai.backend.ip_adapter.ip_adapter import IPAdapter
 from invokeai.backend.lora.lora_model_raw import LoRAModelRaw
+from invokeai.backend.lora.lora_patcher import LoRAPatcher
 from invokeai.backend.model_manager import BaseModelType, ModelVariantType
 from invokeai.backend.model_patcher import ModelPatcher
 from invokeai.backend.stable_diffusion import PipelineIntermediateState
@@ -979,9 +980,10 @@ def _lora_loader() -> Iterator[Tuple[LoRAModelRaw, float]]:
             ModelPatcher.apply_freeu(unet, self.unet.freeu_config),
             SeamlessExt.static_patch_model(unet, self.unet.seamless_axes),  # FIXME
             # Apply the LoRA after unet has been moved to its target device for faster patching.
-            ModelPatcher.apply_lora_unet(
-                unet,
-                loras=_lora_loader(),
+            LoRAPatcher.apply_lora_patches(
+                model=unet,
+                patches=_lora_loader(),
+                prefix="lora_unet_",
                 cached_weights=cached_weights,
             ),
         ):
47 changes: 45 additions & 2 deletions invokeai/app/invocations/flux_denoise.py
@@ -1,4 +1,5 @@
-from typing import Callable, Optional
+from contextlib import ExitStack
+from typing import Callable, Iterator, Optional, Tuple

 import torch
 import torchvision.transforms as tv_transforms
@@ -29,6 +30,9 @@
     pack,
     unpack,
 )
+from invokeai.backend.lora.lora_model_raw import LoRAModelRaw
+from invokeai.backend.lora.lora_patcher import LoRAPatcher
+from invokeai.backend.model_manager.config import ModelFormat
 from invokeai.backend.stable_diffusion.diffusers_pipeline import PipelineIntermediateState
 from invokeai.backend.stable_diffusion.diffusion.conditioning_data import FLUXConditioningInfo
 from invokeai.backend.util.devices import TorchDevice
@@ -187,9 +191,41 @@ def _run_diffusion(
             noise=noise,
         )

-        with transformer_info as transformer:
+        with (
+            transformer_info.model_on_device() as (cached_weights, transformer),
+            ExitStack() as exit_stack,
+        ):
             assert isinstance(transformer, Flux)

+            config = transformer_info.config
+            assert config is not None
+
+            # Apply LoRA models to the transformer.
+            # Note: We apply the LoRA after the transformer has been moved to its target device for faster patching.
+            if config.format in [ModelFormat.Checkpoint]:
+                # The model is non-quantized, so we can apply the LoRA weights directly into the model.
+                exit_stack.enter_context(
+                    LoRAPatcher.apply_lora_patches(
+                        model=transformer,
+                        patches=self._lora_iterator(context),
+                        prefix="",
+                        cached_weights=cached_weights,
+                    )
+                )
+            elif config.format in [ModelFormat.BnbQuantizedLlmInt8b, ModelFormat.BnbQuantizednf4b]:
+                # The model is quantized, so apply the LoRA weights as sidecar layers. This results in slower
+                # inference than directly patching the weights, but is agnostic to the quantization format.
+                exit_stack.enter_context(
+                    LoRAPatcher.apply_lora_sidecar_patches(
+                        model=transformer,
+                        patches=self._lora_iterator(context),
+                        prefix="",
+                        dtype=inference_dtype,
+                    )
+                )
+            else:
+                raise ValueError(f"Unsupported model format: {config.format}")
+
             x = denoise(
                 model=transformer,
                 img=x,
@@ -247,6 +283,13 @@ def _prep_inpaint_mask(self, context: InvocationContext, latents: torch.Tensor)
         # `latents`.
         return mask.expand_as(latents)

+    def _lora_iterator(self, context: InvocationContext) -> Iterator[Tuple[LoRAModelRaw, float]]:
+        for lora in self.transformer.loras:
+            lora_info = context.models.load(lora.lora)
+            assert isinstance(lora_info.model, LoRAModelRaw)
+            yield (lora_info.model, lora.weight)
+            del lora_info
+
     def _build_step_callback(self, context: InvocationContext) -> Callable[[PipelineIntermediateState], None]:
         def step_callback(state: PipelineIntermediateState) -> None:
             state.latents = unpack(state.latents.float(), self.height, self.width).squeeze()
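apply_lora_sidecar_patches takes the other route described in the comments above: bitsandbytes-quantized weights cannot be edited in place, so the LoRA contribution is instead added in the output space of each wrapped layer, leaving the base weights untouched. A rough sketch of a sidecar layer with illustrative names (the PR's LoRALinearSidecarLayer differs in detail):

```python
import torch


class LoRASidecarLinear(torch.nn.Module):
    """Wrap a frozen (possibly quantized) linear layer and add a LoRA residual to its output."""

    def __init__(self, base: torch.nn.Module, down: torch.Tensor, up: torch.Tensor,
                 alpha: float, weight: float):
        super().__init__()
        self.base = base
        self.down = torch.nn.Parameter(down, requires_grad=False)
        self.up = torch.nn.Parameter(up, requires_grad=False)
        self.scale = (alpha / down.shape[0]) * weight

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # The base layer runs with its own (possibly quantized) kernel; the LoRA
        # path is an ordinary matmul, so the approach is quantization-format agnostic.
        return self.base(x) + self.scale * (x @ self.down.T @ self.up.T)


# Toy usage: wrap a layer instead of mutating its weights.
base = torch.nn.Linear(8, 8)
patched = LoRASidecarLinear(base, down=torch.randn(4, 8), up=torch.randn(8, 4),
                            alpha=4.0, weight=0.75)
_ = patched(torch.randn(1, 8))
```

The extra matmuls on every forward pass are why the comment above notes that sidecar patching is slower than directly patching the weights.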
53 changes: 53 additions & 0 deletions invokeai/app/invocations/flux_lora_loader.py
@@ -0,0 +1,53 @@
+from invokeai.app.invocations.baseinvocation import BaseInvocation, BaseInvocationOutput, invocation, invocation_output
+from invokeai.app.invocations.fields import FieldDescriptions, Input, InputField, OutputField, UIType
+from invokeai.app.invocations.model import LoRAField, ModelIdentifierField, TransformerField
+from invokeai.app.services.shared.invocation_context import InvocationContext
+
+
+@invocation_output("flux_lora_loader_output")
+class FluxLoRALoaderOutput(BaseInvocationOutput):
+    """FLUX LoRA Loader Output"""
+
+    transformer: TransformerField = OutputField(
+        default=None, description=FieldDescriptions.transformer, title="FLUX Transformer"
+    )
+
+
+@invocation(
+    "flux_lora_loader",
+    title="FLUX LoRA",
+    tags=["lora", "model", "flux"],
+    category="model",
+    version="1.0.0",
+)
+class FluxLoRALoaderInvocation(BaseInvocation):
+    """Apply a LoRA model to a FLUX transformer."""
+
+    lora: ModelIdentifierField = InputField(
+        description=FieldDescriptions.lora_model, title="LoRA", ui_type=UIType.LoRAModel
+    )
+    weight: float = InputField(default=0.75, description=FieldDescriptions.lora_weight)
+    transformer: TransformerField = InputField(
+        description=FieldDescriptions.transformer,
+        input=Input.Connection,
+        title="FLUX Transformer",
+    )
+
+    def invoke(self, context: InvocationContext) -> FluxLoRALoaderOutput:
+        lora_key = self.lora.key
+
+        if not context.models.exists(lora_key):
+            raise ValueError(f"Unknown lora: {lora_key}!")
+
+        if any(lora.lora.key == lora_key for lora in self.transformer.loras):
+            raise ValueError(f'LoRA "{lora_key}" already applied to transformer.')
+
+        transformer = self.transformer.model_copy(deep=True)
+        transformer.loras.append(
+            LoRAField(
+                lora=self.lora,
+                weight=self.weight,
+            )
+        )
+
+        return FluxLoRALoaderOutput(transformer=transformer)
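Note that invoke() deep-copies the incoming TransformerField before appending, so each loader node produces a fresh output rather than mutating upstream graph state, and chained loaders accumulate LoRAs. The pattern, reduced to a self-contained sketch with simplified stand-in field types:

```python
from pydantic import BaseModel


class LoRAField(BaseModel):
    lora_key: str
    weight: float


class TransformerField(BaseModel):
    transformer: str
    loras: list[LoRAField] = []


def add_lora(field: TransformerField, key: str, weight: float) -> TransformerField:
    if any(lora.lora_key == key for lora in field.loras):
        raise ValueError(f'LoRA "{key}" already applied to transformer.')
    out = field.model_copy(deep=True)  # never mutate the upstream node's output
    out.loras.append(LoRAField(lora_key=key, weight=weight))
    return out


# Chaining two loader nodes accumulates LoRAs without touching earlier outputs.
base = TransformerField(transformer="flux-dev")
styled = add_lora(base, "style_lora", 0.75)
detailed = add_lora(styled, "detail_lora", 0.5)
assert base.loras == [] and len(detailed.loras) == 2
```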
3 changes: 2 additions & 1 deletion invokeai/app/invocations/model.py
@@ -69,6 +69,7 @@ class CLIPField(BaseModel):

 class TransformerField(BaseModel):
     transformer: ModelIdentifierField = Field(description="Info to load Transformer submodel")
+    loras: List[LoRAField] = Field(description="LoRAs to apply on model loading")


 class T5EncoderField(BaseModel):
@@ -202,7 +203,7 @@ def invoke(self, context: InvocationContext) -> FluxModelLoaderOutput:
         assert isinstance(transformer_config, CheckpointConfigBase)

         return FluxModelLoaderOutput(
-            transformer=TransformerField(transformer=transformer),
+            transformer=TransformerField(transformer=transformer, loras=[]),
             clip=CLIPField(tokenizer=tokenizer, text_encoder=clip_encoder, loras=[], skipped_layers=0),
             t5_encoder=T5EncoderField(tokenizer=tokenizer2, text_encoder=t5_encoder),
             vae=VAEField(vae=vae),
(file name not shown in this capture)
@@ -23,7 +23,7 @@
 from invokeai.app.invocations.primitives import LatentsOutput
 from invokeai.app.services.shared.invocation_context import InvocationContext
 from invokeai.backend.lora.lora_model_raw import LoRAModelRaw
-from invokeai.backend.model_patcher import ModelPatcher
+from invokeai.backend.lora.lora_patcher import LoRAPatcher
 from invokeai.backend.stable_diffusion.diffusers_pipeline import ControlNetData, PipelineIntermediateState
 from invokeai.backend.stable_diffusion.multi_diffusion_pipeline import (
     MultiDiffusionPipeline,
@@ -204,7 +204,11 @@ def _lora_loader() -> Iterator[Tuple[LoRAModelRaw, float]]:
         # Load the UNet model.
         unet_info = context.models.load(self.unet.unet)

-        with ExitStack() as exit_stack, unet_info as unet, ModelPatcher.apply_lora_unet(unet, _lora_loader()):
+        with (
+            ExitStack() as exit_stack,
+            unet_info as unet,
+            LoRAPatcher.apply_lora_patches(model=unet, patches=_lora_loader(), prefix="lora_unet_"),
+        ):
             assert isinstance(unet, UNet2DConditionModel)
             latents = latents.to(device=unet.device, dtype=unet.dtype)
             if noise is not None: