From f4f5c46a7725656527b00987393f69c85525d59f Mon Sep 17 00:00:00 2001
From: Brandon Rising <brandon@invoke.ai>
Date: Fri, 16 Aug 2024 17:04:48 -0400
Subject: [PATCH] Add backend functions and classes for Flux implementation,
 Update the way flux encoders/tokenizers are loaded for prompt encoding,
 Update the way the flux vae is loaded

---
 invokeai/app/invocations/flux_text_encoder.py |  31 +-
 .../app/invocations/flux_text_to_image.py     |   4 +-
 invokeai/app/invocations/model.py             |  81 +++-
 .../model_records/model_records_base.py       |   1 +
 .../model_records/model_records_sql.py        |   2 +-
 .../app/services/shared/invocation_context.py |  15 +
 invokeai/backend/flux/math.py                 |  30 ++
 invokeai/backend/flux/model.py                | 111 ++++++
 invokeai/backend/flux/modules/autoencoder.py  | 312 ++++++++++++++++
 invokeai/backend/flux/modules/conditioner.py  |  30 ++
 invokeai/backend/flux/modules/layers.py       | 253 +++++++++++++
 invokeai/backend/model_manager/config.py      |  30 ++
 .../model_manager/load/model_loaders/flux.py  | 159 ++++++++
 .../load/model_loaders/stable_diffusion.py    |  10 +-
 .../backend/model_manager/load/model_util.py  |   4 +-
 invokeai/backend/model_manager/probe.py       |  44 ++-
 invokeai/configs/flux/flux1-dev.yaml          |  33 ++
 invokeai/configs/flux/flux1-schnell.yaml      |  34 ++
 .../frontend/web/src/services/api/schema.ts   | 353 +++++++++++-------
 19 files changed, 1340 insertions(+), 197 deletions(-)
 create mode 100644 invokeai/backend/flux/math.py
 create mode 100644 invokeai/backend/flux/model.py
 create mode 100644 invokeai/backend/flux/modules/autoencoder.py
 create mode 100644 invokeai/backend/flux/modules/conditioner.py
 create mode 100644 invokeai/backend/flux/modules/layers.py
 create mode 100644 invokeai/backend/model_manager/load/model_loaders/flux.py
 create mode 100644 invokeai/configs/flux/flux1-dev.yaml
 create mode 100644 invokeai/configs/flux/flux1-schnell.yaml

diff --git a/invokeai/app/invocations/flux_text_encoder.py b/invokeai/app/invocations/flux_text_encoder.py
index 8e33a3f0cd2..5c0d0ef2ac7 100644
--- a/invokeai/app/invocations/flux_text_encoder.py
+++ b/invokeai/app/invocations/flux_text_encoder.py
@@ -1,6 +1,9 @@
 import torch
+
+
+from einops import repeat
 from diffusers.pipelines.flux.pipeline_flux import FluxPipeline
-from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
+from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5Tokenizer
 
 from invokeai.app.invocations.baseinvocation import BaseInvocation, invocation
 from invokeai.app.invocations.fields import FieldDescriptions, Input, InputField
@@ -9,6 +12,7 @@
 from invokeai.app.services.shared.invocation_context import InvocationContext
 from invokeai.backend.stable_diffusion.diffusion.conditioning_data import ConditioningFieldData, FLUXConditioningInfo
 from invokeai.backend.util.devices import TorchDevice
+from invokeai.backend.flux.modules.conditioner import HFEncoder
 
 
 @invocation(
@@ -69,26 +73,15 @@ def _encode_prompt(self, context: InvocationContext) -> tuple[torch.Tensor, torc
             assert isinstance(clip_text_encoder, CLIPTextModel)
             assert isinstance(t5_text_encoder, T5EncoderModel)
             assert isinstance(clip_tokenizer, CLIPTokenizer)
-            assert isinstance(t5_tokenizer, T5TokenizerFast)
+            assert isinstance(t5_tokenizer, T5Tokenizer)
+
+            clip_encoder = HFEncoder(clip_text_encoder, clip_tokenizer, True, 77)
+            t5_encoder = HFEncoder(t5_text_encoder, t5_tokenizer, False, max_seq_len)
 
-            pipeline = FluxPipeline(
-                scheduler=None,
-                vae=None,
-                text_encoder=clip_text_encoder,
-                tokenizer=clip_tokenizer,
-                text_encoder_2=t5_text_encoder,
-                tokenizer_2=t5_tokenizer,
-                transformer=None,
-            )
+            prompt = [self.positive_prompt]
+            prompt_embeds = t5_encoder(prompt)
 
-            # prompt_embeds: T5 embeddings
-            # pooled_prompt_embeds: CLIP embeddings
-            prompt_embeds, pooled_prompt_embeds, _ = pipeline.encode_prompt(
-                prompt=self.positive_prompt,
-                prompt_2=self.positive_prompt,
-                device=TorchDevice.choose_torch_device(),
-                max_sequence_length=max_seq_len,
-            )
+            pooled_prompt_embeds = clip_encoder(prompt)
 
         assert isinstance(prompt_embeds, torch.Tensor)
         assert isinstance(pooled_prompt_embeds, torch.Tensor)
diff --git a/invokeai/app/invocations/flux_text_to_image.py b/invokeai/app/invocations/flux_text_to_image.py
index bfb1484ed12..d976535d59b 100644
--- a/invokeai/app/invocations/flux_text_to_image.py
+++ b/invokeai/app/invocations/flux_text_to_image.py
@@ -85,7 +85,6 @@ def _run_diffusion(
         clip_embeddings: torch.Tensor,
         t5_embeddings: torch.Tensor,
     ):
-        scheduler_info = context.models.load(self.transformer.scheduler)
         transformer_info = context.models.load(self.transformer.transformer)
 
         # HACK(ryand): Manually empty the cache. Currently we don't check the size of the model before loading it from
@@ -93,9 +92,8 @@ def _run_diffusion(
         # if the cache is not empty.
         # context.models._services.model_manager.load.ram_cache.make_room(24 * 2**30)
 
-        with transformer_info as transformer, scheduler_info as scheduler:
+        with transformer_info as transformer:
             assert isinstance(transformer, FluxTransformer2DModel)
-            assert isinstance(scheduler, FlowMatchEulerDiscreteScheduler)
 
             flux_pipeline_with_transformer = FluxPipeline(
                 scheduler=scheduler,
diff --git a/invokeai/app/invocations/model.py b/invokeai/app/invocations/model.py
index c3902c1cb14..3908bef4da9 100644
--- a/invokeai/app/invocations/model.py
+++ b/invokeai/app/invocations/model.py
@@ -1,5 +1,6 @@
 import copy
-from typing import List, Optional
+from time import sleep
+from typing import List, Optional, Literal, Dict
 
 from pydantic import BaseModel, Field
 
@@ -13,7 +14,8 @@
 from invokeai.app.invocations.fields import FieldDescriptions, Input, InputField, OutputField, UIType
 from invokeai.app.services.shared.invocation_context import InvocationContext
 from invokeai.app.shared.models import FreeUConfig
-from invokeai.backend.model_manager.config import AnyModelConfig, BaseModelType, ModelType, SubModelType
+from invokeai.app.services.model_records import ModelRecordChanges
+from invokeai.backend.model_manager.config import AnyModelConfig, BaseModelType, ModelType, SubModelType, ModelFormat
 
 
 class ModelIdentifierField(BaseModel):
@@ -62,7 +64,6 @@ class CLIPField(BaseModel):
 
 class TransformerField(BaseModel):
     transformer: ModelIdentifierField = Field(description="Info to load Transformer submodel")
-    scheduler: ModelIdentifierField = Field(description="Info to load scheduler submodel")
 
 
 class T5EncoderField(BaseModel):
@@ -131,6 +132,30 @@ def invoke(self, context: InvocationContext) -> ModelIdentifierOutput:
 
         return ModelIdentifierOutput(model=self.model)
 
+T5_ENCODER_OPTIONS = Literal["base", "16b_quantized", "8b_quantized"]
+T5_ENCODER_MAP: Dict[str, Dict[str, str]] = {  # indexed with the t5_encoder field, so every T5_ENCODER_OPTIONS value needs a key
+    "base": {
+        "text_encoder_repo": "black-forest-labs/FLUX.1-schnell::text_encoder_2",
+        "tokenizer_repo": "black-forest-labs/FLUX.1-schnell::tokenizer_2",
+        "text_encoder_name": "FLUX.1-schnell_text_encoder_2",
+        "tokenizer_name": "FLUX.1-schnell_tokenizer_2",
+        "format": ModelFormat.T5Encoder,
+    },
+    "8b_quantized": {  # NOTE(review): "hf_repo1" looks like a placeholder - confirm the real repo before release
+        "text_encoder_repo": "hf_repo1",
+        "tokenizer_repo": "hf_repo1",
+        "text_encoder_name": "hf_repo1",
+        "tokenizer_name": "hf_repo1",
+        "format": ModelFormat.T5Encoder8b,
+    },
+    "16b_quantized": {  # key must match the Literal above; NOTE(review): repo and format look like placeholders - confirm
+        "text_encoder_repo": "hf_repo2",
+        "tokenizer_repo": "hf_repo2",
+        "text_encoder_name": "hf_repo2",
+        "tokenizer_name": "hf_repo2",
+        "format": ModelFormat.T5Encoder8b,
+    },
+}
 
 @invocation_output("flux_model_loader_output")
 class FluxModelLoaderOutput(BaseInvocationOutput):
@@ -151,29 +176,55 @@ class FluxModelLoaderInvocation(BaseInvocation):
         ui_type=UIType.FluxMainModel,
         input=Input.Direct,
     )
+    
+    t5_encoder: T5_ENCODER_OPTIONS = InputField(description="The T5 Encoder model to use.")
 
     def invoke(self, context: InvocationContext) -> FluxModelLoaderOutput:
         model_key = self.model.key
 
-        # TODO: not found exceptions
         if not context.models.exists(model_key):
             raise Exception(f"Unknown model: {model_key}")
-
-        transformer = self.model.model_copy(update={"submodel_type": SubModelType.Transformer})
-        scheduler = self.model.model_copy(update={"submodel_type": SubModelType.Scheduler})
-        tokenizer = self.model.model_copy(update={"submodel_type": SubModelType.Tokenizer})
-        text_encoder = self.model.model_copy(update={"submodel_type": SubModelType.TextEncoder})
-        tokenizer2 = self.model.model_copy(update={"submodel_type": SubModelType.Tokenizer2})
-        text_encoder2 = self.model.model_copy(update={"submodel_type": SubModelType.TextEncoder2})
-        vae = self.model.model_copy(update={"submodel_type": SubModelType.VAE})
+        transformer = self._get_model(context, SubModelType.Transformer)
+        tokenizer = self._get_model(context, SubModelType.Tokenizer)
+        tokenizer2 = self._get_model(context, SubModelType.Tokenizer2)
+        clip_encoder = self._get_model(context, SubModelType.TextEncoder)
+        t5_encoder = self._get_model(context, SubModelType.TextEncoder2)
+        vae = self._install_model(context, SubModelType.VAE, "FLUX.1-schnell_ae", "black-forest-labs/FLUX.1-schnell::ae.safetensors", ModelFormat.Checkpoint, ModelType.VAE, BaseModelType.Flux)
 
         return FluxModelLoaderOutput(
-            transformer=TransformerField(transformer=transformer, scheduler=scheduler),
-            clip=CLIPField(tokenizer=tokenizer, text_encoder=text_encoder, loras=[], skipped_layers=0),
-            t5Encoder=T5EncoderField(tokenizer=tokenizer2, text_encoder=text_encoder2),
+            transformer=TransformerField(transformer=transformer),
+            clip=CLIPField(tokenizer=tokenizer, text_encoder=clip_encoder, loras=[], skipped_layers=0),
+            t5Encoder=T5EncoderField(tokenizer=tokenizer2, text_encoder=t5_encoder),
             vae=VAEField(vae=vae),
         )
 
+    def _get_model(self, context: InvocationContext, submodel: SubModelType) -> ModelIdentifierField:
+        match submodel:  # resolve one FLUX component to the model identifier that provides it
+            case SubModelType.Transformer:  # the transformer is a submodel of the selected main model
+                return self.model.model_copy(update={"submodel_type": SubModelType.Transformer})
+            case SubModelType.Tokenizer | SubModelType.TextEncoder:  # both CLIP parts come from the same model record
+                return self._install_model(context, submodel, "clip-vit-large-patch14", "openai/clip-vit-large-patch14", ModelFormat.Diffusers, ModelType.CLIPEmbed, BaseModelType.Any)
+            case SubModelType.TextEncoder2 | SubModelType.Tokenizer2:  # T5 parts depend on the chosen t5_encoder option
+                t5 = T5_ENCODER_MAP[self.t5_encoder]
+                kind = "text_encoder" if submodel == SubModelType.TextEncoder2 else "tokenizer"
+                return self._install_model(context, submodel, t5[f"{kind}_name"], t5[f"{kind}_repo"], ModelFormat(t5["format"]), ModelType.T5Encoder, BaseModelType.Any)
+            case _:
+                raise Exception(f"{submodel.value} is not a supported submodule for a flux model")
+
+    def _install_model(self, context: InvocationContext, submodel:SubModelType, name: str, repo_id: str, format: ModelFormat, type: ModelType, base: BaseModelType):  # find a model record by name/base/type or download+install it; returns its identifier tagged with submodel
+        if (models := context.models.search_by_attrs(name=name, base=base, type=type)):  # reuse an existing record when one matches
+            if len(models) != 1:
+                raise Exception(f"Multiple models detected for selected model with name {name}")
+            return ModelIdentifierField.from_config(models[0]).model_copy(update={"submodel_type": submodel})
+        else:  # nothing matched: fetch repo_id, then import the downloaded path
+            model_path = context.models.download_and_cache_model(repo_id)
+            config = ModelRecordChanges(name = name, base = base, type=type, format=format)
+            model_install_job = context.models.import_local_model(model_path=model_path, config=config)
+            while not model_install_job.in_terminal_state:  # block this invocation until the install job finishes
+                sleep(0.01)  # poll every 10 ms
+            if not model_install_job.config_out:  # a finished job without config_out is treated as a failed install
+                raise Exception(f"Failed to install {name}")
+            return ModelIdentifierField.from_config(model_install_job.config_out).model_copy(update={"submodel_type": submodel})
 
 @invocation(
     "main_model_loader",
diff --git a/invokeai/app/services/model_records/model_records_base.py b/invokeai/app/services/model_records/model_records_base.py
index 46d11d4ddf2..9cc1486a019 100644
--- a/invokeai/app/services/model_records/model_records_base.py
+++ b/invokeai/app/services/model_records/model_records_base.py
@@ -77,6 +77,7 @@ class ModelRecordChanges(BaseModelExcludeNull):
     type: Optional[ModelType] = Field(description="Type of model", default=None)
     key: Optional[str] = Field(description="Database ID for this model", default=None)
     hash: Optional[str] = Field(description="hash of model file", default=None)
+    format: Optional[str] = Field(description="format of model file", default=None)
     trigger_phrases: Optional[set[str]] = Field(description="Set of trigger phrases for this model", default=None)
     default_settings: Optional[MainModelDefaultSettings | ControlAdapterDefaultSettings] = Field(
         description="Default settings for this model", default=None
diff --git a/invokeai/app/services/model_records/model_records_sql.py b/invokeai/app/services/model_records/model_records_sql.py
index 1d0780efe1f..d1ec0152429 100644
--- a/invokeai/app/services/model_records/model_records_sql.py
+++ b/invokeai/app/services/model_records/model_records_sql.py
@@ -301,7 +301,7 @@ def search_by_attr(
         for row in result:
             try:
                 model_config = ModelConfigFactory.make_config(json.loads(row[0]), timestamp=row[1])
-            except pydantic.ValidationError:
+            except pydantic.ValidationError as e:
                 # We catch this error so that the app can still run if there are invalid model configs in the database.
                 # One reason that an invalid model config might be in the database is if someone had to rollback from a
                 # newer version of the app that added a new model type.
diff --git a/invokeai/app/services/shared/invocation_context.py b/invokeai/app/services/shared/invocation_context.py
index 01662335e46..9a5ac3fb5a9 100644
--- a/invokeai/app/services/shared/invocation_context.py
+++ b/invokeai/app/services/shared/invocation_context.py
@@ -13,6 +13,7 @@
 from invokeai.app.services.image_records.image_records_common import ImageCategory, ResourceOrigin
 from invokeai.app.services.images.images_common import ImageDTO
 from invokeai.app.services.invocation_services import InvocationServices
+from invokeai.app.services.model_records import ModelRecordChanges
 from invokeai.app.services.model_records.model_records_base import UnknownModelException
 from invokeai.app.util.step_callback import stable_diffusion_step_callback
 from invokeai.backend.model_manager.config import (
@@ -463,6 +464,20 @@ def download_and_cache_model(
         """
         return self._services.model_manager.install.download_and_cache_model(source=source)
 
+    def import_local_model(
+            self,
+            model_path: Path,
+            config: Optional[ModelRecordChanges] = None,
+            access_token: Optional[str] = None,
+            inplace: Optional[bool] = False,
+    ):
+        """Import a model that already exists on disk via the install service's heuristic_import and return its result.
+        Raises an Exception when model_path does not exist; config, access_token and inplace are passed through unchanged.
+        """
+        if not model_path.exists():
+            raise Exception("Models provided to import_local_model must already exist on disk")
+        return self._services.model_manager.install.heuristic_import(str(model_path), config=config, access_token=access_token, inplace=inplace)
+
     def load_local_model(
         self,
         model_path: Path,
diff --git a/invokeai/backend/flux/math.py b/invokeai/backend/flux/math.py
new file mode 100644
index 00000000000..71b91fa0f5a
--- /dev/null
+++ b/invokeai/backend/flux/math.py
@@ -0,0 +1,30 @@
+import torch
+from einops import rearrange
+from torch import Tensor
+
+
+def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
+    # Rotate queries and keys with the positional encoding before attending.
+    q_rot, k_rot = apply_rope(q, k, pe)
+    attended = torch.nn.functional.scaled_dot_product_attention(q_rot, k_rot, v)
+
+    # Fold the head axis into the feature axis: (B, H, L, D) -> (B, L, H*D).
+    return rearrange(attended, "B H L D -> B L (H D)")
+
+
+def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
+    assert dim % 2 == 0
+    exponents = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
+    freqs = 1.0 / (theta**exponents)  # one frequency per pair of channels
+    angles = torch.einsum("...n,d->...nd", pos, freqs)  # every position multiplied by every frequency
+    cos, sin = torch.cos(angles), torch.sin(angles)
+    rot = torch.stack([cos, -sin, sin, cos], dim=-1)  # 2x2 rotation matrix, flattened row-major
+    return rearrange(rot, "b n d (i j) -> b n d i j", i=2, j=2).float()
+
+
+def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
+    col0, col1 = freqs_cis[..., 0], freqs_cis[..., 1]  # the two columns of each 2x2 matrix in freqs_cis
+    q_pairs = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)  # split the last axis into consecutive pairs, computed in float32
+    k_pairs = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
+    q_rot, k_rot = [col0 * pairs[..., 0] + col1 * pairs[..., 1] for pairs in (q_pairs, k_pairs)]
+    return q_rot.reshape(*xq.shape).type_as(xq), k_rot.reshape(*xk.shape).type_as(xk)
\ No newline at end of file
diff --git a/invokeai/backend/flux/model.py b/invokeai/backend/flux/model.py
new file mode 100644
index 00000000000..2cb0aa102e7
--- /dev/null
+++ b/invokeai/backend/flux/model.py
@@ -0,0 +1,111 @@
+from dataclasses import dataclass
+
+import torch
+from torch import Tensor, nn
+
+from invokeai.backend.flux.modules.layers import (DoubleStreamBlock, EmbedND, LastLayer,
+                                 MLPEmbedder, SingleStreamBlock,
+                                 timestep_embedding)
+
+@dataclass
+class FluxParams:  # hyper-parameters consumed by Flux.__init__
+    in_channels: int  # input width of img_in; Flux also reuses it as out_channels
+    vec_in_dim: int  # input width of vector_in (applied to the `y` argument of forward)
+    context_in_dim: int  # input width of txt_in
+    hidden_size: int  # model width; must be divisible by num_heads
+    mlp_ratio: float  # forwarded to every DoubleStreamBlock and SingleStreamBlock
+    num_heads: int  # forwarded to every DoubleStreamBlock and SingleStreamBlock
+    depth: int  # number of DoubleStreamBlocks
+    depth_single_blocks: int  # number of SingleStreamBlocks
+    axes_dim: list[int]  # must sum to hidden_size // num_heads; forwarded to EmbedND
+    theta: int  # forwarded to EmbedND
+    qkv_bias: bool  # forwarded to every DoubleStreamBlock
+    guidance_embed: bool  # when True, forward() requires a guidance tensor
+
+
+class Flux(nn.Module):
+    """
+    Transformer model for flow matching on sequences.
+    """
+
+    def __init__(self, params: FluxParams):
+        super().__init__()
+
+        self.params = params
+        self.in_channels = params.in_channels
+        self.out_channels = self.in_channels
+        if params.hidden_size % params.num_heads != 0:
+            raise ValueError(
+                f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
+            )
+        pe_dim = params.hidden_size // params.num_heads
+        if sum(params.axes_dim) != pe_dim:
+            raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}")
+        self.hidden_size = params.hidden_size
+        self.num_heads = params.num_heads
+        self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
+        self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
+        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
+        self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
+        self.guidance_in = (
+            MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if params.guidance_embed else nn.Identity()
+        )
+        self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size)
+
+        self.double_blocks = nn.ModuleList(
+            [
+                DoubleStreamBlock(
+                    self.hidden_size,
+                    self.num_heads,
+                    mlp_ratio=params.mlp_ratio,
+                    qkv_bias=params.qkv_bias,
+                )
+                for _ in range(params.depth)
+            ]
+        )
+
+        self.single_blocks = nn.ModuleList(
+            [
+                SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio)
+                for _ in range(params.depth_single_blocks)
+            ]
+        )
+
+        self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
+
+    def forward(
+        self,
+        img: Tensor,
+        img_ids: Tensor,
+        txt: Tensor,
+        txt_ids: Tensor,
+        timesteps: Tensor,
+        y: Tensor,
+        guidance: Tensor | None = None,
+    ) -> Tensor:
+        if img.ndim != 3 or txt.ndim != 3:
+            raise ValueError("Input img and txt tensors must have 3 dimensions.")
+
+        # running on sequences img
+        img = self.img_in(img)
+        vec = self.time_in(timestep_embedding(timesteps, 256))
+        if self.params.guidance_embed:
+            if guidance is None:
+                raise ValueError("Didn't get guidance strength for guidance distilled model.")
+            vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
+        vec = vec + self.vector_in(y)
+        txt = self.txt_in(txt)
+
+        ids = torch.cat((txt_ids, img_ids), dim=1)
+        pe = self.pe_embedder(ids)
+
+        for block in self.double_blocks:
+            img, txt = block(img=img, txt=txt, vec=vec, pe=pe)
+
+        img = torch.cat((txt, img), 1)
+        for block in self.single_blocks:
+            img = block(img, vec=vec, pe=pe)
+        img = img[:, txt.shape[1] :, ...]
+
+        img = self.final_layer(img, vec)  # (N, T, patch_size ** 2 * out_channels)
+        return img
\ No newline at end of file
diff --git a/invokeai/backend/flux/modules/autoencoder.py b/invokeai/backend/flux/modules/autoencoder.py
new file mode 100644
index 00000000000..f6e072ecaaa
--- /dev/null
+++ b/invokeai/backend/flux/modules/autoencoder.py
@@ -0,0 +1,312 @@
+from dataclasses import dataclass
+
+import torch
+from einops import rearrange
+from torch import Tensor, nn
+
+
+@dataclass
+class AutoEncoderParams:
+    resolution: int
+    in_channels: int
+    ch: int
+    out_ch: int
+    ch_mult: list[int]
+    num_res_blocks: int
+    z_channels: int
+    scale_factor: float
+    shift_factor: float
+
+
+def swish(x: Tensor) -> Tensor:
+    return x * torch.sigmoid(x)
+
+
+class AttnBlock(nn.Module):
+    def __init__(self, in_channels: int):
+        super().__init__()
+        self.in_channels = in_channels
+
+        self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+
+        self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1)
+        self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1)
+        self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1)
+        self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1)
+
+    def attention(self, h_: Tensor) -> Tensor:
+        h_ = self.norm(h_)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+
+        b, c, h, w = q.shape
+        q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()
+        k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
+        v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
+        h_ = nn.functional.scaled_dot_product_attention(q, k, v)
+
+        return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
+
+    def forward(self, x: Tensor) -> Tensor:
+        return x + self.proj_out(self.attention(x))
+
+
+class ResnetBlock(nn.Module):
+    def __init__(self, in_channels: int, out_channels: int):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+
+        self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True)
+        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        if self.in_channels != self.out_channels:
+            self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+
+    def forward(self, x):
+        h = x
+        h = self.norm1(h)
+        h = swish(h)
+        h = self.conv1(h)
+
+        h = self.norm2(h)
+        h = swish(h)
+        h = self.conv2(h)
+
+        if self.in_channels != self.out_channels:
+            x = self.nin_shortcut(x)
+
+        return x + h
+
+
+class Downsample(nn.Module):
+    def __init__(self, in_channels: int):
+        super().__init__()
+        # no asymmetric padding in torch conv, must do it ourselves
+        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
+
+    def forward(self, x: Tensor):
+        pad = (0, 1, 0, 1)
+        x = nn.functional.pad(x, pad, mode="constant", value=0)
+        x = self.conv(x)
+        return x
+
+
+class Upsample(nn.Module):
+    def __init__(self, in_channels: int):
+        super().__init__()
+        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+
+    def forward(self, x: Tensor):
+        x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
+        x = self.conv(x)
+        return x
+
+
+class Encoder(nn.Module):
+    def __init__(
+        self,
+        resolution: int,
+        in_channels: int,
+        ch: int,
+        ch_mult: list[int],
+        num_res_blocks: int,
+        z_channels: int,
+    ):
+        super().__init__()
+        self.ch = ch
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        # downsampling
+        self.conv_in = nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)
+
+        curr_res = resolution
+        in_ch_mult = (1,) + tuple(ch_mult)
+        self.in_ch_mult = in_ch_mult
+        self.down = nn.ModuleList()
+        block_in = self.ch
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = ch * in_ch_mult[i_level]
+            block_out = ch * ch_mult[i_level]
+            for _ in range(self.num_res_blocks):
+                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
+                block_in = block_out
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            if i_level != self.num_resolutions - 1:
+                down.downsample = Downsample(block_in)
+                curr_res = curr_res // 2
+            self.down.append(down)
+
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+
+        # end
+        self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
+        self.conv_out = nn.Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1)
+
+    def forward(self, x: Tensor) -> Tensor:
+        # downsampling
+        hs = [self.conv_in(x)]
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](hs[-1])
+                if len(self.down[i_level].attn) > 0:
+                    h = self.down[i_level].attn[i_block](h)
+                hs.append(h)
+            if i_level != self.num_resolutions - 1:
+                hs.append(self.down[i_level].downsample(hs[-1]))
+
+        # middle
+        h = hs[-1]
+        h = self.mid.block_1(h)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h)
+        # end
+        h = self.norm_out(h)
+        h = swish(h)
+        h = self.conv_out(h)
+        return h
+
+
+class Decoder(nn.Module):
+    def __init__(
+        self,
+        ch: int,
+        out_ch: int,
+        ch_mult: list[int],
+        num_res_blocks: int,
+        in_channels: int,
+        resolution: int,
+        z_channels: int,
+    ):
+        super().__init__()
+        self.ch = ch
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.ffactor = 2 ** (self.num_resolutions - 1)
+
+        # compute in_ch_mult, block_in and curr_res at lowest res
+        block_in = ch * ch_mult[self.num_resolutions - 1]
+        curr_res = resolution // 2 ** (self.num_resolutions - 1)
+        self.z_shape = (1, z_channels, curr_res, curr_res)
+
+        # z to block_in
+        self.conv_in = nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
+
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
+
+        # upsampling
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = ch * ch_mult[i_level]
+            for _ in range(self.num_res_blocks + 1):
+                block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
+                block_in = block_out
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            if i_level != 0:
+                up.upsample = Upsample(block_in)
+                curr_res = curr_res * 2
+            self.up.insert(0, up)  # prepend to get consistent order
+
+        # end
+        self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
+        self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
+
+    def forward(self, z: Tensor) -> Tensor:
+        # z to block_in
+        h = self.conv_in(z)
+
+        # middle
+        h = self.mid.block_1(h)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h)
+
+        # upsampling
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks + 1):
+                h = self.up[i_level].block[i_block](h)
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h)
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+
+        # end
+        h = self.norm_out(h)
+        h = swish(h)
+        h = self.conv_out(h)
+        return h
+
+
+class DiagonalGaussian(nn.Module):
+    def __init__(self, sample: bool = True, chunk_dim: int = 1):
+        super().__init__()
+        self.chunk_dim = chunk_dim  # axis along which the input is split into (mean, logvar) halves
+        self.sample = sample  # False -> forward() returns the mean without adding noise
+
+    def forward(self, z: Tensor) -> Tensor:
+        mean, logvar = z.chunk(2, dim=self.chunk_dim)
+        if not self.sample:
+            return mean
+        std = (0.5 * logvar).exp()
+        noise = torch.randn_like(mean)
+        return mean + std * noise
+
+
+class AutoEncoder(nn.Module):
+    def __init__(self, params: AutoEncoderParams):
+        super().__init__()
+        self.encoder = Encoder(
+            resolution=params.resolution,
+            in_channels=params.in_channels,
+            ch=params.ch,
+            ch_mult=params.ch_mult,
+            num_res_blocks=params.num_res_blocks,
+            z_channels=params.z_channels,
+        )
+        self.decoder = Decoder(
+            resolution=params.resolution,
+            in_channels=params.in_channels,
+            ch=params.ch,
+            out_ch=params.out_ch,
+            ch_mult=params.ch_mult,
+            num_res_blocks=params.num_res_blocks,
+            z_channels=params.z_channels,
+        )
+        self.reg = DiagonalGaussian()
+
+        self.scale_factor = params.scale_factor
+        self.shift_factor = params.shift_factor
+
+    def encode(self, x: Tensor) -> Tensor:
+        z = self.reg(self.encoder(x))
+        z = self.scale_factor * (z - self.shift_factor)
+        return z
+
+    def decode(self, z: Tensor) -> Tensor:
+        z = z / self.scale_factor + self.shift_factor
+        return self.decoder(z)
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self.decode(self.encode(x))
\ No newline at end of file
diff --git a/invokeai/backend/flux/modules/conditioner.py b/invokeai/backend/flux/modules/conditioner.py
new file mode 100644
index 00000000000..2a9e17c20e3
--- /dev/null
+++ b/invokeai/backend/flux/modules/conditioner.py
@@ -0,0 +1,30 @@
+from torch import Tensor, nn
+from transformers import (PreTrainedModel, PreTrainedTokenizer)
+
+class HFEncoder(nn.Module):  # wraps a Hugging Face tokenizer + text encoder behind one forward() call
+    def __init__(self, encoder: PreTrainedModel, tokenizer: PreTrainedTokenizer, is_clip: bool, max_length: int):
+        super().__init__()
+        self.max_length = max_length
+        self.is_clip = is_clip
+        self.output_key = "pooler_output" if self.is_clip else "last_hidden_state"  # which encoder output to return
+        self.tokenizer = tokenizer
+        self.hf_module = encoder
+        self.hf_module = self.hf_module.eval().requires_grad_(False)  # inference only: eval mode, no gradients
+
+    def forward(self, text: list[str]) -> Tensor:
+        batch_encoding = self.tokenizer(
+            text,
+            truncation=True,
+            max_length=self.max_length,
+            return_length=False,
+            return_overflowing_tokens=False,
+            padding="max_length",  # with truncation=True every prompt becomes exactly max_length tokens
+            return_tensors="pt",
+        )
+
+        outputs = self.hf_module(
+            input_ids=batch_encoding["input_ids"].to(self.hf_module.device),  # ids must live on the encoder's device
+            attention_mask=None,  # NOTE(review): padding tokens are not masked out — confirm this is intended
+            output_hidden_states=False,
+        )
+        return outputs[self.output_key]
\ No newline at end of file
diff --git a/invokeai/backend/flux/modules/layers.py b/invokeai/backend/flux/modules/layers.py
new file mode 100644
index 00000000000..cb4eee0c2d7
--- /dev/null
+++ b/invokeai/backend/flux/modules/layers.py
@@ -0,0 +1,253 @@
+import math
+from dataclasses import dataclass
+
+import torch
+from einops import rearrange
+from torch import Tensor, nn
+
+from ..math import attention, rope
+
+
+class EmbedND(nn.Module):  # builds positional embeddings by applying rope() to each axis of the position ids
+    def __init__(self, dim: int, theta: int, axes_dim: list[int]):
+        super().__init__()
+        self.dim = dim
+        self.theta = theta
+        self.axes_dim = axes_dim  # axes_dim[i] is the size passed to rope() for axis i
+
+    def forward(self, ids: Tensor) -> Tensor:
+        n_axes = ids.shape[-1]  # the last dim of ids holds one coordinate per axis
+        emb = torch.cat(
+            [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
+            dim=-3,  # per-axis results are joined along the third-from-last dim
+        )
+
+        return emb.unsqueeze(1)  # adds a singleton dim at index 1 — presumably the head axis; TODO confirm
+
+
+def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
+    """
+    Create sinusoidal timestep embeddings.
+    :param t: a 1-D Tensor of N indices, one per batch element.
+                      These may be fractional.
+    :param dim: the dimension of the output.
+    :param max_period: controls the minimum frequency of the embeddings.
+    :return: an (N, D) Tensor of positional embeddings.
+    """
+    t = time_factor * t  # time_factor rescales t before the frequencies are applied
+    half = dim // 2  # half of the channels carry cos terms, the other half sin terms
+    freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
+        t.device
+    )  # geometric frequencies starting at 1 and decaying towards 1/max_period
+
+    args = t[:, None].float() * freqs[None]  # (N, half): every timestep times every frequency
+    embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+    if dim % 2:  # odd dim: append one zero column so the output width equals dim
+        embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
+    if torch.is_floating_point(t):  # match the dtype/device of a floating-point t
+        embedding = embedding.to(t)
+    return embedding
+
+
+class MLPEmbedder(nn.Module):  # two-layer MLP: Linear -> SiLU -> Linear, output width hidden_dim
+    def __init__(self, in_dim: int, hidden_dim: int):
+        super().__init__()
+        self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
+        self.silu = nn.SiLU()
+        self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
+
+    def forward(self, x: Tensor) -> Tensor:
+        return self.out_layer(self.silu(self.in_layer(x)))  # in_layer -> SiLU -> out_layer
+
+
+class RMSNorm(torch.nn.Module):  # root-mean-square normalisation over the last dim with a learned scale
+    def __init__(self, dim: int):
+        super().__init__()
+        self.scale = nn.Parameter(torch.ones(dim))  # learned per-channel gain, initialised to 1
+
+    def forward(self, x: Tensor):
+        x_dtype = x.dtype  # remembered so the result can be cast back below
+        x = x.float()  # compute the statistics in float32
+        rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)  # 1 / sqrt(mean(x^2) + eps)
+        return (x * rrms).to(dtype=x_dtype) * self.scale
+
+
+class QKNorm(torch.nn.Module):  # applies independent RMSNorms to the query and key tensors
+    def __init__(self, dim: int):
+        super().__init__()
+        self.query_norm = RMSNorm(dim)
+        self.key_norm = RMSNorm(dim)
+
+    def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
+        q = self.query_norm(q)
+        k = self.key_norm(k)
+        return q.to(v), k.to(v)  # v is only the cast target: q and k take on its dtype/device
+
+
+class SelfAttention(nn.Module):  # fused-QKV self-attention with RMS-normalised q/k and an output projection
+    def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads  # NOTE(review): assumes dim is divisible by num_heads
+
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)  # one projection producing q, k and v together
+        self.norm = QKNorm(head_dim)
+        self.proj = nn.Linear(dim, dim)
+
+    def forward(self, x: Tensor, pe: Tensor) -> Tensor:
+        qkv = self.qkv(x)
+        q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)  # each is (B, H, L, D)
+        q, k = self.norm(q, k, v)
+        x = attention(q, k, v, pe=pe)  # attention() is imported from ..math; pe is forwarded to it
+        x = self.proj(x)
+        return x
+
+
+@dataclass
+class ModulationOut:  # one (shift, scale, gate) triple produced by Modulation
+    shift: Tensor  # added after the activations have been scaled
+    scale: Tensor  # applied by callers as a (1 + scale) multiplier
+    gate: Tensor  # multiplies a branch output before the residual add
+
+
+class Modulation(nn.Module):  # projects the conditioning vector into one or two ModulationOut triples
+    def __init__(self, dim: int, double: bool):
+        super().__init__()
+        self.is_double = double
+        self.multiplier = 6 if double else 3  # 3 tensors per triple; double=True yields two triples
+        self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
+
+    def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
+        out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)  # each chunk: (N, 1, dim)
+
+        return (
+            ModulationOut(*out[:3]),  # first triple: shift, scale, gate
+            ModulationOut(*out[3:]) if self.is_double else None,  # second triple only when double=True
+        )
+
+
+class DoubleStreamBlock(nn.Module):  # separate image/text weights, one joint attention over both streams
+    def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False):
+        super().__init__()
+
+        mlp_hidden_dim = int(hidden_size * mlp_ratio)
+        self.num_heads = num_heads
+        self.hidden_size = hidden_size
+        self.img_mod = Modulation(hidden_size, double=True)  # two triples: attention branch and MLP branch
+        self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
+
+        self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.img_mlp = nn.Sequential(
+            nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
+            nn.GELU(approximate="tanh"),
+            nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
+        )
+
+        self.txt_mod = Modulation(hidden_size, double=True)  # same layout as img_mod, separate weights
+        self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
+
+        self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.txt_mlp = nn.Sequential(
+            nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
+            nn.GELU(approximate="tanh"),
+            nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
+        )
+
+    def forward(self, img: Tensor, txt: Tensor, vec: Tensor, pe: Tensor) -> tuple[Tensor, Tensor]:
+        img_mod1, img_mod2 = self.img_mod(vec)  # mod1 drives the attention branch, mod2 the MLP branch
+        txt_mod1, txt_mod2 = self.txt_mod(vec)
+
+        # prepare image for attention
+        img_modulated = self.img_norm1(img)
+        img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
+        img_qkv = self.img_attn.qkv(img_modulated)  # only the sub-layers of img_attn are used, not its forward()
+        img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
+        img_q, img_k = self.img_attn.norm(img_q, img_k, img_v)
+
+        # prepare txt for attention
+        txt_modulated = self.txt_norm1(txt)
+        txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
+        txt_qkv = self.txt_attn.qkv(txt_modulated)
+        txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)
+        txt_q, txt_k = self.txt_attn.norm(txt_q, txt_k, txt_v)
+
+        # run actual attention
+        q = torch.cat((txt_q, img_q), dim=2)  # dim 2 is the sequence axis: text tokens first, then image tokens
+        k = torch.cat((txt_k, img_k), dim=2)
+        v = torch.cat((txt_v, img_v), dim=2)
+
+        attn = attention(q, k, v, pe=pe)
+        txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :]  # undo the concat: text part first
+
+        # calculate the img blocks
+        img = img + img_mod1.gate * self.img_attn.proj(img_attn)
+        img = img + img_mod2.gate * self.img_mlp((1 + img_mod2.scale) * self.img_norm2(img) + img_mod2.shift)
+
+        # calculate the txt blocks
+        txt = txt + txt_mod1.gate * self.txt_attn.proj(txt_attn)
+        txt = txt + txt_mod2.gate * self.txt_mlp((1 + txt_mod2.scale) * self.txt_norm2(txt) + txt_mod2.shift)
+        return img, txt
+
+
+class SingleStreamBlock(nn.Module):
+    """
+    A DiT block with parallel linear layers as described in
+    https://arxiv.org/abs/2302.05442 and adapted modulation interface.
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        qk_scale: float | None = None,
+    ):
+        super().__init__()
+        self.hidden_dim = hidden_size
+        self.num_heads = num_heads
+        head_dim = hidden_size // num_heads
+        self.scale = qk_scale or head_dim**-0.5  # NOTE(review): stored but not read by forward() in this block
+
+        self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
+        # qkv and mlp_in
+        self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
+        # proj and mlp_out
+        self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
+
+        self.norm = QKNorm(head_dim)
+
+        self.hidden_size = hidden_size  # NOTE(review): same value as self.hidden_dim set above
+        self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+
+        self.mlp_act = nn.GELU(approximate="tanh")
+        self.modulation = Modulation(hidden_size, double=False)  # single triple: shift, scale, gate
+
+    def forward(self, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
+        mod, _ = self.modulation(vec)  # second element is None because double=False
+        x_mod = (1 + mod.scale) * self.pre_norm(x) + mod.shift
+        qkv, mlp = torch.split(self.linear1(x_mod), [3 * self.hidden_size, self.mlp_hidden_dim], dim=-1)
+
+        q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=self.num_heads)  # each is (B, H, L, D)
+        q, k = self.norm(q, k, v)
+
+        # compute attention
+        attn = attention(q, k, v, pe=pe)
+        # compute activation in mlp stream, cat again and run second linear layer
+        output = self.linear2(torch.cat((attn, self.mlp_act(mlp)), 2))
+        return x + mod.gate * output  # gated residual connection
+
+
+class LastLayer(nn.Module):  # modulated LayerNorm followed by a linear projection to patch values
+    def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
+        super().__init__()
+        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
+        self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
+        self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
+
+    def forward(self, x: Tensor, vec: Tensor) -> Tensor:
+        shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)  # vec -> (shift, scale), split along dim 1
+        x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]  # broadcast over dim 1 of x
+        x = self.linear(x)
+        return x
\ No newline at end of file
diff --git a/invokeai/backend/model_manager/config.py b/invokeai/backend/model_manager/config.py
index 29ef9536668..dfa6cef29b3 100644
--- a/invokeai/backend/model_manager/config.py
+++ b/invokeai/backend/model_manager/config.py
@@ -67,7 +67,9 @@ class ModelType(str, Enum):
     TextualInversion = "embedding"
     IPAdapter = "ip_adapter"
     CLIPVision = "clip_vision"
+    CLIPEmbed = "clip_embed"
     T2IAdapter = "t2i_adapter"
+    T5Encoder = "t5_encoder"
     SpandrelImageToImage = "spandrel_image_to_image"
 
 
@@ -106,6 +108,9 @@ class ModelFormat(str, Enum):
     EmbeddingFile = "embedding_file"
     EmbeddingFolder = "embedding_folder"
     InvokeAI = "invokeai"
+    T5Encoder = "t5_encoder"
+    T5Encoder8b = "t5_encoder_8b"
+    T5Encoder4b = "t5_encoder_4b"
 
 
 class SchedulerPredictionType(str, Enum):
@@ -207,6 +212,18 @@ class LoRAConfigBase(ModelConfigBase):
     trigger_phrases: Optional[set[str]] = Field(description="Set of trigger phrases for this model", default=None)
 
 
+class T5EncoderConfigBase(ModelConfigBase):  # shared base for T5 encoder configs; pins the model type
+    type: Literal[ModelType.T5Encoder] = ModelType.T5Encoder
+
+
+class T5EncoderConfig(T5EncoderConfigBase):  # config for a T5 encoder stored in the "t5_encoder" format
+    format: Literal[ModelFormat.T5Encoder] = ModelFormat.T5Encoder
+
+    @staticmethod
+    def get_tag() -> Tag:  # discriminator tag "<type>.<format>", here "t5_encoder.t5_encoder"
+        return Tag(f"{ModelType.T5Encoder.value}.{ModelFormat.T5Encoder.value}")
+
+
 class LoRALyCORISConfig(LoRAConfigBase):
     """Model config for LoRA/Lycoris models."""
 
@@ -352,6 +369,17 @@ def get_tag() -> Tag:
         return Tag(f"{ModelType.IPAdapter.value}.{ModelFormat.Checkpoint.value}")
 
 
+class CLIPEmbedDiffusersConfig(DiffusersConfigBase):
+    """Model config for Clip Embeddings."""
+
+    type: Literal[ModelType.CLIPEmbed] = ModelType.CLIPEmbed  # "clip_embed"
+    format: Literal[ModelFormat.Diffusers] = ModelFormat.Diffusers
+
+    @staticmethod
+    def get_tag() -> Tag:  # discriminator tag "<type>.<format>" built from the two enum values
+        return Tag(f"{ModelType.CLIPEmbed.value}.{ModelFormat.Diffusers.value}")
+
+
 class CLIPVisionDiffusersConfig(DiffusersConfigBase):
     """Model config for CLIPVision."""
 
@@ -416,6 +444,7 @@ def get_model_discriminator_value(v: Any) -> str:
         Annotated[ControlNetCheckpointConfig, ControlNetCheckpointConfig.get_tag()],
         Annotated[LoRALyCORISConfig, LoRALyCORISConfig.get_tag()],
         Annotated[LoRADiffusersConfig, LoRADiffusersConfig.get_tag()],
+        Annotated[T5EncoderConfig, T5EncoderConfig.get_tag()],
         Annotated[TextualInversionFileConfig, TextualInversionFileConfig.get_tag()],
         Annotated[TextualInversionFolderConfig, TextualInversionFolderConfig.get_tag()],
         Annotated[IPAdapterInvokeAIConfig, IPAdapterInvokeAIConfig.get_tag()],
@@ -423,6 +452,7 @@ def get_model_discriminator_value(v: Any) -> str:
         Annotated[T2IAdapterConfig, T2IAdapterConfig.get_tag()],
         Annotated[SpandrelImageToImageConfig, SpandrelImageToImageConfig.get_tag()],
         Annotated[CLIPVisionDiffusersConfig, CLIPVisionDiffusersConfig.get_tag()],
+        Annotated[CLIPEmbedDiffusersConfig, CLIPEmbedDiffusersConfig.get_tag()],
     ],
     Discriminator(get_model_discriminator_value),
 ]
diff --git a/invokeai/backend/model_manager/load/model_loaders/flux.py b/invokeai/backend/model_manager/load/model_loaders/flux.py
new file mode 100644
index 00000000000..7a028a55e10
--- /dev/null
+++ b/invokeai/backend/model_manager/load/model_loaders/flux.py
@@ -0,0 +1,159 @@
+# Copyright (c) 2024, Brandon W. Rising and the InvokeAI Development Team
+"""Class for Flux model loading in InvokeAI."""
+
+from pathlib import Path
+import yaml
+
+from dataclasses import fields
+from safetensors.torch import load_file
+from typing import Optional, Any
+from transformers import T5EncoderModel, T5Tokenizer
+
+from invokeai.backend.model_manager import (
+    AnyModel,
+    AnyModelConfig,
+    BaseModelType,
+    ModelFormat,
+    ModelType,
+    SubModelType,
+)
+from invokeai.backend.model_manager.config import (
+    CheckpointConfigBase,
+    MainCheckpointConfig,
+    CLIPEmbedDiffusersConfig,
+    T5EncoderConfig,
+    VAECheckpointConfig,
+)
+from invokeai.app.services.config.config_default import get_config
+from invokeai.backend.model_manager.load.model_loader_registry import ModelLoaderRegistry
+from invokeai.backend.model_manager.load.model_loaders.generic_diffusers import GenericDiffusersLoader
+from invokeai.backend.util.silence_warnings import SilenceWarnings
+from invokeai.backend.util.devices import TorchDevice
+from invokeai.backend.flux.model import Flux, FluxParams
+from invokeai.backend.flux.modules.autoencoder import AutoEncoderParams, AutoEncoder
+from transformers import (CLIPTextModel, CLIPTokenizer, T5EncoderModel,
+                          T5Tokenizer)
+
+app_config = get_config()
+
+
+@ModelLoaderRegistry.register(base=BaseModelType.Flux, type=ModelType.VAE, format=ModelFormat.Checkpoint)
+class FluxVAELoader(GenericDiffusersLoader):
+    """Class to load Flux VAE (AutoEncoder) checkpoints."""
+
+    def _load_model(
+        self,
+        config: AnyModelConfig,
+        submodel_type: Optional[SubModelType] = None,
+    ) -> AnyModel:
+        """Build an AutoEncoder from the model's YAML config and load its safetensors weights.
+
+        Configs that are not VAECheckpointConfig are delegated to the parent loader.
+        """
+        if not isinstance(config, VAECheckpointConfig):
+            return super()._load_model(config, submodel_type)
+
+        model_path = Path(config.path)
+        # The autoencoder hyper-parameters live under params.ae_params in the legacy YAML config.
+        legacy_config_path = app_config.legacy_conf_path / config.config_path
+        with open(legacy_config_path.as_posix(), "r") as stream:
+            flux_conf = yaml.safe_load(stream)
+
+        # Keep only the keys that AutoEncoderParams actually declares.
+        dataclass_fields = {f.name for f in fields(AutoEncoderParams)}
+        filtered_data = {k: v for k, v in flux_conf["params"]["ae_params"].items() if k in dataclass_fields}
+        params = AutoEncoderParams(**filtered_data)
+
+        with SilenceWarnings():
+            model = AutoEncoder(params).to(self._torch_dtype)
+            # load_sft doesn't support torch.device
+            sd = load_file(model_path, device=str(TorchDevice.choose_torch_device()))
+            model.load_state_dict(sd, strict=False, assign=True)
+        return model
+
+
+@ModelLoaderRegistry.register(base=BaseModelType.Any, type=ModelType.CLIPEmbed, format=ModelFormat.Diffusers)
+class ClipCheckpointModel(GenericDiffusersLoader):
+    """Class to load the CLIP tokenizer and text encoder submodels from a diffusers-format model path."""
+
+    def _load_model(
+        self,
+        config: AnyModelConfig,
+        submodel_type: Optional[SubModelType] = None,
+    ) -> AnyModel:
+        if not isinstance(config, CLIPEmbedDiffusersConfig):
+            raise Exception("Only CLIPEmbedDiffusersConfig models are currently supported here.")
+
+        match submodel_type:
+            case SubModelType.Tokenizer:
+                return CLIPTokenizer.from_pretrained(config.path, max_length=77)
+            case SubModelType.TextEncoder:
+                return CLIPTextModel.from_pretrained(config.path)
+
+        raise Exception(f"Only Tokenizer and TextEncoder submodels are currently supported. Received: {submodel_type}")
+
+@ModelLoaderRegistry.register(base=BaseModelType.Any, type=ModelType.T5Encoder, format=ModelFormat.T5Encoder)
+class T5EncoderCheckpointModel(GenericDiffusersLoader):
+    """Class to load the T5 tokenizer (Tokenizer2) and text encoder (TextEncoder2) submodels."""
+
+    def _load_model(
+        self,
+        config: AnyModelConfig,
+        submodel_type: Optional[SubModelType] = None,
+    ) -> AnyModel:
+        if not isinstance(config, T5EncoderConfig):
+            raise Exception("Only T5EncoderConfig models are currently supported here.")
+
+        match submodel_type:
+            case SubModelType.Tokenizer2:
+                return T5Tokenizer.from_pretrained(Path(config.path), max_length=512)
+            case SubModelType.TextEncoder2:
+                return T5EncoderModel.from_pretrained(Path(config.path))
+
+        raise Exception(f"Only Tokenizer2 and TextEncoder2 submodels are currently supported. Received: {submodel_type}")
+
+
+@ModelLoaderRegistry.register(base=BaseModelType.Flux, type=ModelType.Main, format=ModelFormat.Checkpoint)
+class FluxCheckpointModel(GenericDiffusersLoader):
+    """Class to load the Flux transformer from a single-file checkpoint."""
+
+    def _load_model(
+        self,
+        config: AnyModelConfig,
+        submodel_type: Optional[SubModelType] = None,
+    ) -> AnyModel:
+        """Load the requested submodel; only SubModelType.Transformer is handled."""
+        if not isinstance(config, CheckpointConfigBase):
+            raise Exception("Only Checkpoint Flux models are currently supported.")
+        # Model hyper-parameters come from the legacy YAML config referenced by the model config.
+        legacy_config_path = app_config.legacy_conf_path / config.config_path
+        config_path = legacy_config_path.as_posix()
+        with open(config_path, "r") as stream:
+            flux_conf = yaml.safe_load(stream)
+
+        match submodel_type:
+            case SubModelType.Transformer:
+                return self._load_from_singlefile(config, flux_conf)
+
+        raise Exception(f"Only the Transformer submodel can be loaded from a Flux checkpoint. Received: {submodel_type}")
+
+    def _load_from_singlefile(
+        self,
+        config: AnyModelConfig,
+        flux_conf: Any,
+    ) -> AnyModel:
+        """Instantiate Flux from the ``params`` section of the YAML config and load its weights."""
+        assert isinstance(config, MainCheckpointConfig)
+        model_path = Path(config.path)
+        # Keep only the keys that FluxParams declares; anything else in the YAML section is ignored.
+        dataclass_fields = {f.name for f in fields(FluxParams)}
+        filtered_data = {k: v for k, v in flux_conf["params"].items() if k in dataclass_fields}
+        params = FluxParams(**filtered_data)
+
+        with SilenceWarnings():
+            model = Flux(params).to(self._torch_dtype)
+            # load_sft doesn't support torch.device
+            sd = load_file(model_path, device=str(TorchDevice.choose_torch_device()))
+            # NOTE(review): assign=True keeps the checkpoint tensors' dtype, not self._torch_dtype — confirm.
+            model.load_state_dict(sd, strict=False, assign=True)
+        return model
diff --git a/invokeai/backend/model_manager/load/model_loaders/stable_diffusion.py b/invokeai/backend/model_manager/load/model_loaders/stable_diffusion.py
index 33ce4abc4d4..e034e110115 100644
--- a/invokeai/backend/model_manager/load/model_loaders/stable_diffusion.py
+++ b/invokeai/backend/model_manager/load/model_loaders/stable_diffusion.py
@@ -36,8 +36,14 @@
 }
 
 
-@ModelLoaderRegistry.register(base=BaseModelType.Any, type=ModelType.Main, format=ModelFormat.Diffusers)
-@ModelLoaderRegistry.register(base=BaseModelType.Any, type=ModelType.Main, format=ModelFormat.Checkpoint)
+@ModelLoaderRegistry.register(base=BaseModelType.StableDiffusion1, type=ModelType.Main, format=ModelFormat.Diffusers)
+@ModelLoaderRegistry.register(base=BaseModelType.StableDiffusion2, type=ModelType.Main, format=ModelFormat.Diffusers)
+@ModelLoaderRegistry.register(base=BaseModelType.StableDiffusionXL, type=ModelType.Main, format=ModelFormat.Diffusers)
+@ModelLoaderRegistry.register(base=BaseModelType.StableDiffusionXLRefiner, type=ModelType.Main, format=ModelFormat.Diffusers)
+@ModelLoaderRegistry.register(base=BaseModelType.StableDiffusion1, type=ModelType.Main, format=ModelFormat.Checkpoint)
+@ModelLoaderRegistry.register(base=BaseModelType.StableDiffusion2, type=ModelType.Main, format=ModelFormat.Checkpoint)
+@ModelLoaderRegistry.register(base=BaseModelType.StableDiffusionXL, type=ModelType.Main, format=ModelFormat.Checkpoint)
+@ModelLoaderRegistry.register(base=BaseModelType.StableDiffusionXLRefiner, type=ModelType.Main, format=ModelFormat.Checkpoint)
 class StableDiffusionDiffusersModel(GenericDiffusersLoader):
     """Class to load main models."""
 
diff --git a/invokeai/backend/model_manager/load/model_util.py b/invokeai/backend/model_manager/load/model_util.py
index f93b12a9b02..455f97d5484 100644
--- a/invokeai/backend/model_manager/load/model_util.py
+++ b/invokeai/backend/model_manager/load/model_util.py
@@ -9,7 +9,7 @@
 import torch
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers.scheduling_utils import SchedulerMixin
-from transformers import CLIPTokenizer, T5TokenizerFast
+from transformers import CLIPTokenizer, T5TokenizerFast, T5Tokenizer
 
 from invokeai.backend.image_util.grounding_dino.grounding_dino_pipeline import GroundingDinoPipeline
 from invokeai.backend.image_util.segment_anything.segment_anything_pipeline import SegmentAnythingPipeline
@@ -50,7 +50,7 @@ def calc_model_size_by_data(logger: logging.Logger, model: AnyModel) -> int:
         return model.calc_size()
     elif isinstance(
         model,
-        (T5TokenizerFast,),
+        (T5TokenizerFast,T5Tokenizer,),
     ):
         return len(model)
     else:
diff --git a/invokeai/backend/model_manager/probe.py b/invokeai/backend/model_manager/probe.py
index 82053149ad1..a3a648806fc 100644
--- a/invokeai/backend/model_manager/probe.py
+++ b/invokeai/backend/model_manager/probe.py
@@ -56,7 +56,7 @@
     },
     BaseModelType.StableDiffusionXLRefiner: {
         ModelVariantType.Normal: "sd_xl_refiner.yaml",
-    },
+    }
 }
 
 
@@ -132,7 +132,7 @@ def probe(
             fields = {}
 
         model_path = model_path.resolve()
-
+        
         format_type = ModelFormat.Diffusers if model_path.is_dir() else ModelFormat.Checkpoint
         model_info = None
         model_type = ModelType(fields["type"]) if "type" in fields and fields["type"] else None
@@ -162,7 +162,7 @@ def probe(
         fields["description"] = (
             fields.get("description") or f"{fields['base'].value} {model_type.value} model {fields['name']}"
         )
-        fields["format"] = fields.get("format") or probe.get_format()
+        fields["format"] = ModelFormat(fields["format"]) if fields.get("format") else probe.get_format()  # ModelFormat(None) raises ValueError, so coerce only when a format was supplied
         fields["hash"] = fields.get("hash") or ModelHash(algorithm=hash_algo).hash(model_path)
 
         fields["default_settings"] = fields.get("default_settings")
@@ -223,7 +223,7 @@ def get_model_type_from_checkpoint(cls, model_path: Path, checkpoint: Optional[C
         ckpt = ckpt.get("state_dict", ckpt)
 
         for key in [str(k) for k in ckpt.keys()]:
-            if key.startswith(("cond_stage_model.", "first_stage_model.", "model.diffusion_model.")):
+            if key.startswith(("cond_stage_model.", "first_stage_model.", "model.diffusion_model.", "double_blocks.")):
                 return ModelType.Main
             elif key.startswith(("encoder.conv_in", "decoder.conv_in")):
                 return ModelType.VAE
@@ -322,10 +322,13 @@ def _get_checkpoint_config_path(
             return possible_conf.absolute()
 
         if model_type is ModelType.Main:
-            config_file = LEGACY_CONFIGS[base_type][variant_type]
-            if isinstance(config_file, dict):  # need another tier for sd-2.x models
-                config_file = config_file[prediction_type]
-            config_file = f"stable-diffusion/{config_file}"
+            if base_type == BaseModelType.Flux:
+                config_file="flux/flux1-schnell.yaml"
+            else:
+                config_file = LEGACY_CONFIGS[base_type][variant_type]
+                if isinstance(config_file, dict):  # need another tier for sd-2.x models
+                    config_file = config_file[prediction_type]
+                config_file = f"stable-diffusion/{config_file}"
         elif model_type is ModelType.ControlNet:
             config_file = (
                 "controlnet/cldm_v15.yaml"
@@ -334,7 +337,9 @@ def _get_checkpoint_config_path(
             )
         elif model_type is ModelType.VAE:
             config_file = (
-                "stable-diffusion/v1-inference.yaml"
+                "flux/flux1-schnell.yaml"
+                if base_type is BaseModelType.Flux
+                else "stable-diffusion/v1-inference.yaml"
                 if base_type is BaseModelType.StableDiffusion1
                 else "stable-diffusion/sd_xl_base.yaml"
                 if base_type is BaseModelType.StableDiffusionXL
@@ -421,7 +426,8 @@ def get_format(self) -> ModelFormat:
 
     def get_variant_type(self) -> ModelVariantType:
         model_type = ModelProbe.get_model_type_from_checkpoint(self.model_path, self.checkpoint)
-        if model_type != ModelType.Main:
+        base_type = self.get_base_type()
+        if model_type != ModelType.Main or base_type == BaseModelType.Flux:
             return ModelVariantType.Normal
         state_dict = self.checkpoint.get("state_dict") or self.checkpoint
         in_channels = state_dict["model.diffusion_model.input_blocks.0.0.weight"].shape[1]
@@ -441,6 +447,8 @@ class PipelineCheckpointProbe(CheckpointProbeBase):
     def get_base_type(self) -> BaseModelType:
         checkpoint = self.checkpoint
         state_dict = self.checkpoint.get("state_dict") or checkpoint
+        if "double_blocks.0.img_attn.norm.key_norm.scale" in state_dict:
+            return BaseModelType.Flux
         key_name = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight"
         if key_name in state_dict and state_dict[key_name].shape[-1] == 768:
             return BaseModelType.StableDiffusion1
@@ -483,6 +491,7 @@ def get_base_type(self) -> BaseModelType:
             (r"xl", BaseModelType.StableDiffusionXL),
             (r"sd2", BaseModelType.StableDiffusion2),
             (r"vae", BaseModelType.StableDiffusion1),
+            (r"FLUX.1-schnell_ae", BaseModelType.Flux),
         ]:
             if re.search(regexp, self.model_path.name, re.IGNORECASE):
                 return basetype
@@ -627,10 +636,6 @@ def get_repo_variant(self) -> ModelRepoVariant:
 
 class PipelineFolderProbe(FolderProbeBase):
     def get_base_type(self) -> BaseModelType:
-        with open(f"{self.model_path}/model_index.json", "r") as file:
-            conf = json.load(file)
-        if "_class_name" in conf and conf.get("_class_name") == "FluxPipeline":
-            return BaseModelType.Flux
         with open(self.model_path / "unet" / "config.json", "r") as file:
             unet_conf = json.load(file)
         if unet_conf["cross_attention_dim"] == 768:
@@ -718,6 +723,10 @@ def get_base_type(self) -> BaseModelType:
         return TextualInversionCheckpointProbe(path).get_base_type()
 
 
+class T5EncoderFolderProbe(FolderProbeBase):  # NOTE(review): no get_base_type override — confirm the base class supplies one
+    def get_format(self) -> ModelFormat:
+        return ModelFormat.T5Encoder  # T5 encoder folders always report the dedicated t5_encoder format
+
 class ONNXFolderProbe(PipelineFolderProbe):
     def get_base_type(self) -> BaseModelType:
         # Due to the way the installer is set up, the configuration file for safetensors
@@ -810,6 +819,11 @@ def get_base_type(self) -> BaseModelType:
         return BaseModelType.Any
 
 
+class CLIPEmbedFolderProbe(FolderProbeBase):  # folder probe for CLIP embed (text encoder) models
+    def get_base_type(self) -> BaseModelType:
+        return BaseModelType.Any  # reported as not tied to a single base architecture
+
+
 class SpandrelImageToImageFolderProbe(FolderProbeBase):
     def get_base_type(self) -> BaseModelType:
         raise NotImplementedError()
@@ -840,8 +854,10 @@ def get_base_type(self) -> BaseModelType:
 ModelProbe.register_probe("diffusers", ModelType.VAE, VaeFolderProbe)
 ModelProbe.register_probe("diffusers", ModelType.LoRA, LoRAFolderProbe)
 ModelProbe.register_probe("diffusers", ModelType.TextualInversion, TextualInversionFolderProbe)
+ModelProbe.register_probe("diffusers", ModelType.T5Encoder, T5EncoderFolderProbe)
 ModelProbe.register_probe("diffusers", ModelType.ControlNet, ControlNetFolderProbe)
 ModelProbe.register_probe("diffusers", ModelType.IPAdapter, IPAdapterFolderProbe)
+ModelProbe.register_probe("diffusers", ModelType.CLIPEmbed, CLIPEmbedFolderProbe)
 ModelProbe.register_probe("diffusers", ModelType.CLIPVision, CLIPVisionFolderProbe)
 ModelProbe.register_probe("diffusers", ModelType.T2IAdapter, T2IAdapterFolderProbe)
 ModelProbe.register_probe("diffusers", ModelType.SpandrelImageToImage, SpandrelImageToImageFolderProbe)
diff --git a/invokeai/configs/flux/flux1-dev.yaml b/invokeai/configs/flux/flux1-dev.yaml
new file mode 100644
index 00000000000..3f76f11cd4e
--- /dev/null
+++ b/invokeai/configs/flux/flux1-dev.yaml
@@ -0,0 +1,33 @@
+repo_id: "black-forest-labs/FLUX.1-dev"
+repo_ae: "ae.safetensors"
+max_length: 512
+params:
+  in_channels: 64
+  vec_in_dim: 768
+  context_in_dim: 4096
+  hidden_size: 3072
+  mlp_ratio: 4.0
+  num_heads: 24
+  depth: 19
+  depth_single_blocks: 38
+  axes_dim:
+  - 16
+  - 56
+  - 56
+  theta: 10_000
+  qkv_bias: True
+  guidance_embed: True
+  ae_params:
+    resolution: 256
+    in_channels: 3
+    ch: 128
+    out_ch: 3
+    ch_mult:
+    - 1
+    - 2
+    - 4
+    - 4
+    num_res_blocks: 2
+    z_channels: 16
+    scale_factor: 0.3611
+    shift_factor: 0.1159
diff --git a/invokeai/configs/flux/flux1-schnell.yaml b/invokeai/configs/flux/flux1-schnell.yaml
new file mode 100644
index 00000000000..bea1824e35a
--- /dev/null
+++ b/invokeai/configs/flux/flux1-schnell.yaml
@@ -0,0 +1,34 @@
+repo_id: "black-forest-labs/FLUX.1-schnell"  # presumably the Hugging Face repo the weights belong to -- TODO confirm against the loader
+repo_ae: "ae.safetensors"  # presumably the autoencoder weights file inside repo_id -- TODO confirm
+t5_encoder: "google/t5-v1_1-xxl"  # NOTE(review): flux1-dev.yaml has no t5_encoder key -- confirm whether this one is still read
+max_length: 512  # NOTE(review): same as flux1-dev.yaml; published FLUX.1-schnell examples use a 256-token limit -- confirm 512 is intended
+params:  # model hyperparameters; note that ae_params is nested inside this mapping
+  in_channels: 64
+  vec_in_dim: 768
+  context_in_dim: 4096
+  hidden_size: 3072
+  mlp_ratio: 4.0
+  num_heads: 24  # hidden_size / num_heads = 3072 / 24 = 128
+  depth: 19
+  depth_single_blocks: 38
+  axes_dim:  # entries sum to 128, the same as hidden_size / num_heads
+  - 16
+  - 56
+  - 56
+  theta: 10_000  # read as the integer 10000 by YAML 1.1 loaders (e.g. PyYAML); a strict YAML 1.2 parser would see a string
+  qkv_bias: True
+  guidance_embed: False  # flux1-dev.yaml sets this to True
+  ae_params:  # child of params, not a top-level key; presumably autoencoder settings -- TODO confirm
+    resolution: 256
+    in_channels: 3
+    ch: 128
+    out_ch: 3
+    ch_mult:  # one entry per level; presumably multipliers applied to ch -- TODO confirm
+    - 1
+    - 2
+    - 4
+    - 4
+    num_res_blocks: 2
+    z_channels: 16
+    scale_factor: 0.3611  # same value in flux1-dev.yaml
+    shift_factor: 0.1159  # same value in flux1-dev.yaml
diff --git a/invokeai/frontend/web/src/services/api/schema.ts b/invokeai/frontend/web/src/services/api/schema.ts
index 74e1fb3a831..cfdca4354ec 100644
--- a/invokeai/frontend/web/src/services/api/schema.ts
+++ b/invokeai/frontend/web/src/services/api/schema.ts
@@ -4800,6 +4800,13 @@ export type components = {
       use_cache?: boolean;
       /** @description Flux model (Transformer, VAE, CLIP) to load */
       model: components["schemas"]["ModelIdentifierField"];
+      /**
+       * T5 Encoder
+       * @description The T5 Encoder model to use.
+       * @default null
+       * @enum {string}
+       */
+      t5_encoder?: "base" | "16b_quantized" | "8b_quantized";
       /**
        * type
        * @default flux_model_loader
@@ -7675,154 +7682,154 @@ export type components = {
       project_id: string | null;
     };
     InvocationOutputMap: {
-      mask_combine: components["schemas"]["ImageOutput"];
+      lblend: components["schemas"]["LatentsOutput"];
+      boolean_collection: components["schemas"]["BooleanCollectionOutput"];
+      random_range: components["schemas"]["IntegerCollectionOutput"];
+      merge_metadata: components["schemas"]["MetadataOutput"];
+      string_split: components["schemas"]["String2Output"];
+      string: components["schemas"]["StringOutput"];
+      pair_tile_image: components["schemas"]["PairTileImageOutput"];
+      tile_to_properties: components["schemas"]["TileToPropertiesOutput"];
+      infill_lama: components["schemas"]["ImageOutput"];
+      pidi_image_processor: components["schemas"]["ImageOutput"];
+      lineart_image_processor: components["schemas"]["ImageOutput"];
+      img_watermark: components["schemas"]["ImageOutput"];
+      metadata_item: components["schemas"]["MetadataItemOutput"];
+      integer: components["schemas"]["IntegerOutput"];
+      img_conv: components["schemas"]["ImageOutput"];
+      string_join: components["schemas"]["StringOutput"];
+      lscale: components["schemas"]["LatentsOutput"];
+      create_gradient_mask: components["schemas"]["GradientMaskOutput"];
+      img_pad_crop: components["schemas"]["ImageOutput"];
+      img_channel_offset: components["schemas"]["ImageOutput"];
+      mask_edge: components["schemas"]["ImageOutput"];
+      img_chan: components["schemas"]["ImageOutput"];
+      string_join_three: components["schemas"]["StringOutput"];
+      div: components["schemas"]["IntegerOutput"];
+      merge_tiles_to_image: components["schemas"]["ImageOutput"];
+      add: components["schemas"]["IntegerOutput"];
+      leres_image_processor: components["schemas"]["ImageOutput"];
+      img_lerp: components["schemas"]["ImageOutput"];
+      t2i_adapter: components["schemas"]["T2IAdapterOutput"];
+      float_to_int: components["schemas"]["IntegerOutput"];
+      string_collection: components["schemas"]["StringCollectionOutput"];
+      main_model_loader: components["schemas"]["ModelLoaderOutput"];
+      heuristic_resize: components["schemas"]["ImageOutput"];
+      img_channel_multiply: components["schemas"]["ImageOutput"];
+      face_identifier: components["schemas"]["ImageOutput"];
       img_ilerp: components["schemas"]["ImageOutput"];
-      sdxl_model_loader: components["schemas"]["SDXLModelLoaderOutput"];
-      dw_openpose_image_processor: components["schemas"]["ImageOutput"];
-      infill_patchmatch: components["schemas"]["ImageOutput"];
-      infill_tile: components["schemas"]["ImageOutput"];
-      zoe_depth_image_processor: components["schemas"]["ImageOutput"];
-      canny_image_processor: components["schemas"]["ImageOutput"];
-      face_mask_detection: components["schemas"]["FaceMaskOutput"];
-      core_metadata: components["schemas"]["MetadataOutput"];
-      calculate_image_tiles_even_split: components["schemas"]["CalculateImageTilesOutput"];
-      scheduler: components["schemas"]["SchedulerOutput"];
-      clip_skip: components["schemas"]["CLIPSkipInvocationOutput"];
-      img_nsfw: components["schemas"]["ImageOutput"];
-      show_image: components["schemas"]["ImageOutput"];
-      string_split_neg: components["schemas"]["StringPosNegOutput"];
-      vae_loader: components["schemas"]["VAEOutput"];
-      depth_anything_image_processor: components["schemas"]["ImageOutput"];
-      canvas_paste_back: components["schemas"]["ImageOutput"];
-      image_collection: components["schemas"]["ImageCollectionOutput"];
       color: components["schemas"]["ColorOutput"];
-      img_resize: components["schemas"]["ImageOutput"];
+      rand_int: components["schemas"]["IntegerOutput"];
+      infill_tile: components["schemas"]["ImageOutput"];
+      sdxl_lora_loader: components["schemas"]["SDXLLoRALoaderOutput"];
       ideal_size: components["schemas"]["IdealSizeOutput"];
-      invert_tensor_mask: components["schemas"]["MaskOutput"];
-      string_collection: components["schemas"]["StringCollectionOutput"];
-      rectangle_mask: components["schemas"]["MaskOutput"];
-      esrgan: components["schemas"]["ImageOutput"];
-      image: components["schemas"]["ImageOutput"];
-      sdxl_compel_prompt: components["schemas"]["ConditioningOutput"];
-      metadata: components["schemas"]["MetadataOutput"];
-      mlsd_image_processor: components["schemas"]["ImageOutput"];
+      flux_model_loader: components["schemas"]["FluxModelLoaderOutput"];
+      tile_image_processor: components["schemas"]["ImageOutput"];
       color_correct: components["schemas"]["ImageOutput"];
-      img_pad_crop: components["schemas"]["ImageOutput"];
-      string_join_three: components["schemas"]["StringOutput"];
+      lineart_anime_image_processor: components["schemas"]["ImageOutput"];
+      latents_collection: components["schemas"]["LatentsCollectionOutput"];
+      metadata: components["schemas"]["MetadataOutput"];
       segment_anything: components["schemas"]["MaskOutput"];
+      float: components["schemas"]["FloatOutput"];
+      compel: components["schemas"]["ConditioningOutput"];
       crop_latents: components["schemas"]["LatentsOutput"];
-      integer: components["schemas"]["IntegerOutput"];
-      img_channel_multiply: components["schemas"]["ImageOutput"];
-      tiled_multi_diffusion_denoise_latents: components["schemas"]["LatentsOutput"];
-      midas_depth_image_processor: components["schemas"]["ImageOutput"];
-      img_scale: components["schemas"]["ImageOutput"];
-      lora_collection_loader: components["schemas"]["LoRALoaderOutput"];
-      cv_inpaint: components["schemas"]["ImageOutput"];
-      sub: components["schemas"]["IntegerOutput"];
-      flux_text_to_image: components["schemas"]["ImageOutput"];
-      flux_model_loader: components["schemas"]["FluxModelLoaderOutput"];
-      image_mask_to_tensor: components["schemas"]["MaskOutput"];
-      pidi_image_processor: components["schemas"]["ImageOutput"];
-      img_conv: components["schemas"]["ImageOutput"];
-      unsharp_mask: components["schemas"]["ImageOutput"];
-      t2i_adapter: components["schemas"]["T2IAdapterOutput"];
-      rand_float: components["schemas"]["FloatOutput"];
-      main_model_loader: components["schemas"]["ModelLoaderOutput"];
+      tensor_mask_to_image: components["schemas"]["ImageOutput"];
+      string_replace: components["schemas"]["StringOutput"];
+      zoe_depth_image_processor: components["schemas"]["ImageOutput"];
+      save_image: components["schemas"]["ImageOutput"];
+      flux_text_encoder: components["schemas"]["ConditioningOutput"];
+      sdxl_model_loader: components["schemas"]["SDXLModelLoaderOutput"];
+      infill_rgba: components["schemas"]["ImageOutput"];
       dynamic_prompt: components["schemas"]["StringCollectionOutput"];
-      lresize: components["schemas"]["LatentsOutput"];
-      float_collection: components["schemas"]["FloatCollectionOutput"];
-      mediapipe_face_processor: components["schemas"]["ImageOutput"];
-      lineart_image_processor: components["schemas"]["ImageOutput"];
-      img_watermark: components["schemas"]["ImageOutput"];
-      bounding_box: components["schemas"]["BoundingBoxOutput"];
-      hed_image_processor: components["schemas"]["ImageOutput"];
-      img_mul: components["schemas"]["ImageOutput"];
-      img_lerp: components["schemas"]["ImageOutput"];
-      compel: components["schemas"]["ConditioningOutput"];
-      spandrel_image_to_image_autoscale: components["schemas"]["ImageOutput"];
-      round_float: components["schemas"]["FloatOutput"];
-      infill_lama: components["schemas"]["ImageOutput"];
+      segment_anything_processor: components["schemas"]["ImageOutput"];
+      lora_selector: components["schemas"]["LoRASelectorOutput"];
+      content_shuffle_image_processor: components["schemas"]["ImageOutput"];
       grounding_dino: components["schemas"]["BoundingBoxCollectionOutput"];
-      model_identifier: components["schemas"]["ModelIdentifierOutput"];
-      random_range: components["schemas"]["IntegerCollectionOutput"];
-      float: components["schemas"]["FloatOutput"];
+      cv_inpaint: components["schemas"]["ImageOutput"];
+      mul: components["schemas"]["IntegerOutput"];
+      float_range: components["schemas"]["FloatCollectionOutput"];
       img_blur: components["schemas"]["ImageOutput"];
-      img_chan: components["schemas"]["ImageOutput"];
-      calculate_image_tiles_min_overlap: components["schemas"]["CalculateImageTilesOutput"];
-      boolean: components["schemas"]["BooleanOutput"];
-      metadata_item: components["schemas"]["MetadataItemOutput"];
-      img_hue_adjust: components["schemas"]["ImageOutput"];
-      string_split: components["schemas"]["String2Output"];
-      float_to_int: components["schemas"]["IntegerOutput"];
-      tensor_mask_to_image: components["schemas"]["ImageOutput"];
-      seamless: components["schemas"]["SeamlessModeOutput"];
-      create_denoise_mask: components["schemas"]["DenoiseMaskOutput"];
-      string: components["schemas"]["StringOutput"];
-      lblend: components["schemas"]["LatentsOutput"];
+      core_metadata: components["schemas"]["MetadataOutput"];
+      controlnet: components["schemas"]["ControlOutput"];
+      image: components["schemas"]["ImageOutput"];
+      infill_patchmatch: components["schemas"]["ImageOutput"];
+      img_mul: components["schemas"]["ImageOutput"];
+      prompt_from_file: components["schemas"]["StringCollectionOutput"];
+      invert_tensor_mask: components["schemas"]["MaskOutput"];
+      show_image: components["schemas"]["ImageOutput"];
+      range: components["schemas"]["IntegerCollectionOutput"];
       mask_from_id: components["schemas"]["ImageOutput"];
-      latents_collection: components["schemas"]["LatentsCollectionOutput"];
-      lineart_anime_image_processor: components["schemas"]["ImageOutput"];
-      face_off: components["schemas"]["FaceOffOutput"];
-      normalbae_image_processor: components["schemas"]["ImageOutput"];
-      img_paste: components["schemas"]["ImageOutput"];
+      range_of_size: components["schemas"]["IntegerCollectionOutput"];
+      sdxl_compel_prompt: components["schemas"]["ConditioningOutput"];
+      round_float: components["schemas"]["FloatOutput"];
+      scheduler: components["schemas"]["SchedulerOutput"];
+      image_collection: components["schemas"]["ImageCollectionOutput"];
+      vae_loader: components["schemas"]["VAEOutput"];
       infill_cv2: components["schemas"]["ImageOutput"];
-      prompt_from_file: components["schemas"]["StringCollectionOutput"];
-      segment_anything_processor: components["schemas"]["ImageOutput"];
-      boolean_collection: components["schemas"]["BooleanCollectionOutput"];
+      calculate_image_tiles_min_overlap: components["schemas"]["CalculateImageTilesOutput"];
+      tiled_multi_diffusion_denoise_latents: components["schemas"]["LatentsOutput"];
+      l2i: components["schemas"]["ImageOutput"];
+      model_identifier: components["schemas"]["ModelIdentifierOutput"];
+      mlsd_image_processor: components["schemas"]["ImageOutput"];
       conditioning: components["schemas"]["ConditioningOutput"];
-      flux_text_encoder: components["schemas"]["ConditioningOutput"];
-      img_channel_offset: components["schemas"]["ImageOutput"];
-      face_identifier: components["schemas"]["ImageOutput"];
+      mask_combine: components["schemas"]["ImageOutput"];
+      latents: components["schemas"]["LatentsOutput"];
+      clip_skip: components["schemas"]["CLIPSkipInvocationOutput"];
+      alpha_mask_to_tensor: components["schemas"]["MaskOutput"];
+      tomask: components["schemas"]["ImageOutput"];
+      create_denoise_mask: components["schemas"]["DenoiseMaskOutput"];
       calculate_image_tiles: components["schemas"]["CalculateImageTilesOutput"];
-      lora_loader: components["schemas"]["LoRALoaderOutput"];
-      mul: components["schemas"]["IntegerOutput"];
-      lscale: components["schemas"]["LatentsOutput"];
-      integer_collection: components["schemas"]["IntegerCollectionOutput"];
-      iterate: components["schemas"]["IterateInvocationOutput"];
-      save_image: components["schemas"]["ImageOutput"];
-      infill_rgba: components["schemas"]["ImageOutput"];
-      content_shuffle_image_processor: components["schemas"]["ImageOutput"];
-      tile_to_properties: components["schemas"]["TileToPropertiesOutput"];
+      dw_openpose_image_processor: components["schemas"]["ImageOutput"];
+      midas_depth_image_processor: components["schemas"]["ImageOutput"];
+      boolean: components["schemas"]["BooleanOutput"];
+      conditioning_collection: components["schemas"]["ConditioningCollectionOutput"];
+      unsharp_mask: components["schemas"]["ImageOutput"];
+      face_off: components["schemas"]["FaceOffOutput"];
+      i2l: components["schemas"]["LatentsOutput"];
+      normalbae_image_processor: components["schemas"]["ImageOutput"];
+      img_hue_adjust: components["schemas"]["ImageOutput"];
+      canny_image_processor: components["schemas"]["ImageOutput"];
+      img_nsfw: components["schemas"]["ImageOutput"];
       noise: components["schemas"]["NoiseOutput"];
-      img_crop: components["schemas"]["ImageOutput"];
-      freeu: components["schemas"]["UNetOutput"];
-      integer_math: components["schemas"]["IntegerOutput"];
-      div: components["schemas"]["IntegerOutput"];
-      l2i: components["schemas"]["ImageOutput"];
-      leres_image_processor: components["schemas"]["ImageOutput"];
-      float_range: components["schemas"]["FloatCollectionOutput"];
-      mask_edge: components["schemas"]["ImageOutput"];
-      latents: components["schemas"]["LatentsOutput"];
-      string_join: components["schemas"]["StringOutput"];
-      add: components["schemas"]["IntegerOutput"];
+      string_split_neg: components["schemas"]["StringPosNegOutput"];
       sdxl_refiner_model_loader: components["schemas"]["SDXLRefinerModelLoaderOutput"];
-      heuristic_resize: components["schemas"]["ImageOutput"];
-      sdxl_lora_loader: components["schemas"]["SDXLLoRALoaderOutput"];
-      color_map_image_processor: components["schemas"]["ImageOutput"];
-      pair_tile_image: components["schemas"]["PairTileImageOutput"];
-      i2l: components["schemas"]["LatentsOutput"];
-      rand_int: components["schemas"]["IntegerOutput"];
-      ip_adapter: components["schemas"]["IPAdapterOutput"];
-      range_of_size: components["schemas"]["IntegerCollectionOutput"];
-      merge_tiles_to_image: components["schemas"]["ImageOutput"];
+      image_mask_to_tensor: components["schemas"]["MaskOutput"];
+      lresize: components["schemas"]["LatentsOutput"];
+      lora_loader: components["schemas"]["LoRALoaderOutput"];
+      mediapipe_face_processor: components["schemas"]["ImageOutput"];
+      esrgan: components["schemas"]["ImageOutput"];
       sdxl_refiner_compel_prompt: components["schemas"]["ConditioningOutput"];
-      controlnet: components["schemas"]["ControlOutput"];
-      conditioning_collection: components["schemas"]["ConditioningCollectionOutput"];
-      blank_image: components["schemas"]["ImageOutput"];
-      sdxl_lora_collection_loader: components["schemas"]["SDXLLoRALoaderOutput"];
-      create_gradient_mask: components["schemas"]["GradientMaskOutput"];
-      collect: components["schemas"]["CollectInvocationOutput"];
-      range: components["schemas"]["IntegerCollectionOutput"];
-      merge_metadata: components["schemas"]["MetadataOutput"];
-      string_replace: components["schemas"]["StringOutput"];
-      tomask: components["schemas"]["ImageOutput"];
+      integer_math: components["schemas"]["IntegerOutput"];
+      seamless: components["schemas"]["SeamlessModeOutput"];
+      img_paste: components["schemas"]["ImageOutput"];
+      step_param_easing: components["schemas"]["FloatCollectionOutput"];
+      hed_image_processor: components["schemas"]["ImageOutput"];
+      img_scale: components["schemas"]["ImageOutput"];
+      face_mask_detection: components["schemas"]["FaceMaskOutput"];
+      img_resize: components["schemas"]["ImageOutput"];
       denoise_latents: components["schemas"]["LatentsOutput"];
       float_math: components["schemas"]["FloatOutput"];
+      freeu: components["schemas"]["UNetOutput"];
+      collect: components["schemas"]["CollectInvocationOutput"];
+      blank_image: components["schemas"]["ImageOutput"];
+      calculate_image_tiles_even_split: components["schemas"]["CalculateImageTilesOutput"];
+      depth_anything_image_processor: components["schemas"]["ImageOutput"];
+      bounding_box: components["schemas"]["BoundingBoxOutput"];
+      color_map_image_processor: components["schemas"]["ImageOutput"];
       spandrel_image_to_image: components["schemas"]["ImageOutput"];
-      lora_selector: components["schemas"]["LoRASelectorOutput"];
-      alpha_mask_to_tensor: components["schemas"]["MaskOutput"];
-      tile_image_processor: components["schemas"]["ImageOutput"];
-      step_param_easing: components["schemas"]["FloatCollectionOutput"];
+      iterate: components["schemas"]["IterateInvocationOutput"];
+      float_collection: components["schemas"]["FloatCollectionOutput"];
+      sdxl_lora_collection_loader: components["schemas"]["SDXLLoRALoaderOutput"];
+      rectangle_mask: components["schemas"]["MaskOutput"];
+      flux_text_to_image: components["schemas"]["ImageOutput"];
+      spandrel_image_to_image_autoscale: components["schemas"]["ImageOutput"];
+      rand_float: components["schemas"]["FloatOutput"];
+      integer_collection: components["schemas"]["IntegerCollectionOutput"];
+      ip_adapter: components["schemas"]["IPAdapterOutput"];
+      sub: components["schemas"]["IntegerOutput"];
+      img_crop: components["schemas"]["ImageOutput"];
+      lora_collection_loader: components["schemas"]["LoRALoaderOutput"];
+      canvas_paste_back: components["schemas"]["ImageOutput"];
     };
     /**
      * InvocationStartedEvent
@@ -9724,7 +9731,7 @@ export type components = {
      * @description Storage format of model.
      * @enum {string}
      */
-    ModelFormat: "diffusers" | "checkpoint" | "lycoris" | "onnx" | "olive" | "embedding_file" | "embedding_folder" | "invokeai";
+    ModelFormat: "diffusers" | "checkpoint" | "lycoris" | "onnx" | "olive" | "embedding_file" | "embedding_folder" | "invokeai" | "t5_encoder" | "t5_encoder_8b" | "t5_encoder_4b";
     /** ModelIdentifierField */
     ModelIdentifierField: {
       /**
@@ -10024,7 +10031,7 @@ export type components = {
        * Config Out
        * @description After successful installation, this will hold the configuration object.
        */
-      config_out?: (components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterInvokeAIConfig"] | components["schemas"]["IPAdapterCheckpointConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["SpandrelImageToImageConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"]) | null;
+      config_out?: (components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["T5EncoderConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterInvokeAIConfig"] | components["schemas"]["IPAdapterCheckpointConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["SpandrelImageToImageConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"]) | null;
       /**
        * Inplace
        * @description Leave model in its current location; otherwise install under models directory
@@ -10110,7 +10117,7 @@ export type components = {
        * Config
        * @description The model's config
        */
-      config: components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterInvokeAIConfig"] | components["schemas"]["IPAdapterCheckpointConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["SpandrelImageToImageConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"];
+      config: components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["T5EncoderConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterInvokeAIConfig"] | components["schemas"]["IPAdapterCheckpointConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["SpandrelImageToImageConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"];
       /**
        * @description The submodel type, if any
        * @default null
@@ -10131,7 +10138,7 @@ export type components = {
        * Config
        * @description The model's config
        */
-      config: components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterInvokeAIConfig"] | components["schemas"]["IPAdapterCheckpointConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["SpandrelImageToImageConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"];
+      config: components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["T5EncoderConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterInvokeAIConfig"] | components["schemas"]["IPAdapterCheckpointConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["SpandrelImageToImageConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"];
       /**
        * @description The submodel type, if any
        * @default null
@@ -10212,6 +10219,11 @@ export type components = {
        * @description hash of model file
        */
       hash?: string | null;
+      /**
+       * Format
+       * @description format of model file
+       */
+      format?: string | null;
       /**
        * Trigger Phrases
        * @description Set of trigger phrases for this model
@@ -10254,7 +10266,7 @@ export type components = {
      * @description Model type.
      * @enum {string}
      */
-    ModelType: "onnx" | "main" | "vae" | "lora" | "controlnet" | "embedding" | "ip_adapter" | "clip_vision" | "t2i_adapter" | "spandrel_image_to_image";
+    ModelType: "onnx" | "main" | "vae" | "lora" | "controlnet" | "embedding" | "ip_adapter" | "clip_vision" | "t2i_adapter" | "t5_encoder" | "spandrel_image_to_image";
     /**
      * ModelVariantType
      * @description Variant type.
@@ -10267,7 +10279,7 @@ export type components = {
      */
     ModelsList: {
       /** Models */
-      models: (components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterInvokeAIConfig"] | components["schemas"]["IPAdapterCheckpointConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["SpandrelImageToImageConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"])[];
+      models: (components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["T5EncoderConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterInvokeAIConfig"] | components["schemas"]["IPAdapterCheckpointConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["SpandrelImageToImageConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"])[];
     };
     /**
      * Multiply Integers
@@ -13338,6 +13350,67 @@ export type components = {
        */
       type: "t2i_adapter_output";
     };
+    /** T5EncoderConfig */
+    T5EncoderConfig: {
+      /**
+       * Key
+       * @description A unique key for this model.
+       */
+      key: string;
+      /**
+       * Hash
+       * @description The hash of the model file(s).
+       */
+      hash: string;
+      /**
+       * Path
+       * @description Path to the model on the filesystem. Relative paths are relative to the Invoke root directory.
+       */
+      path: string;
+      /**
+       * Name
+       * @description Name of the model.
+       */
+      name: string;
+      /** @description The base model. */
+      base: components["schemas"]["BaseModelType"];
+      /**
+       * Description
+       * @description Model description
+       */
+      description?: string | null;
+      /**
+       * Source
+       * @description The original source of the model (path, URL or repo_id).
+       */
+      source: string;
+      /** @description The type of source */
+      source_type: components["schemas"]["ModelSourceType"];
+      /**
+       * Source Api Response
+       * @description The original API response from the source, as stringified JSON.
+       */
+      source_api_response?: string | null;
+      /**
+       * Cover Image
+       * @description Url for image to preview model
+       */
+      cover_image?: string | null;
+      /**
+       * Type
+       * @default t5_encoder
+       * @constant
+       * @enum {string}
+       */
+      type: "t5_encoder";
+      /**
+       * Format
+       * @default t5_encoder
+       * @constant
+       * @enum {string}
+       */
+      format: "t5_encoder";
+    };
     /** T5EncoderField */
     T5EncoderField: {
       /** @description Info to load tokenizer submodel */
@@ -13780,8 +13853,6 @@ export type components = {
     TransformerField: {
       /** @description Info to load Transformer submodel */
       transformer: components["schemas"]["ModelIdentifierField"];
-      /** @description Info to load scheduler submodel */
-      scheduler: components["schemas"]["ModelIdentifierField"];
     };
     /**
      * UIComponent
@@ -14560,7 +14631,7 @@ export type operations = {
       /** @description Successful Response */
       200: {
         content: {
-          "application/json": components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterInvokeAIConfig"] | components["schemas"]["IPAdapterCheckpointConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["SpandrelImageToImageConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"];
+          "application/json": components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["T5EncoderConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterInvokeAIConfig"] | components["schemas"]["IPAdapterCheckpointConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["SpandrelImageToImageConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"];
         };
       };
       /** @description Validation Error */
@@ -14586,7 +14657,7 @@ export type operations = {
       /** @description The model configuration was retrieved successfully */
       200: {
         content: {
-          "application/json": components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterInvokeAIConfig"] | components["schemas"]["IPAdapterCheckpointConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["SpandrelImageToImageConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"];
+          "application/json": components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["T5EncoderConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterInvokeAIConfig"] | components["schemas"]["IPAdapterCheckpointConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["SpandrelImageToImageConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"];
         };
       };
       /** @description Bad request */
@@ -14668,7 +14739,7 @@ export type operations = {
       /** @description The model was updated successfully */
       200: {
         content: {
-          "application/json": components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterInvokeAIConfig"] | components["schemas"]["IPAdapterCheckpointConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["SpandrelImageToImageConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"];
+          "application/json": components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["T5EncoderConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterInvokeAIConfig"] | components["schemas"]["IPAdapterCheckpointConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["SpandrelImageToImageConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"];
         };
       };
       /** @description Bad request */
@@ -15078,7 +15149,7 @@ export type operations = {
       /** @description Model converted successfully */
       200: {
         content: {
-          "application/json": components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterInvokeAIConfig"] | components["schemas"]["IPAdapterCheckpointConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["SpandrelImageToImageConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"];
+          "application/json": components["schemas"]["MainDiffusersConfig"] | components["schemas"]["MainCheckpointConfig"] | components["schemas"]["VAEDiffusersConfig"] | components["schemas"]["VAECheckpointConfig"] | components["schemas"]["ControlNetDiffusersConfig"] | components["schemas"]["ControlNetCheckpointConfig"] | components["schemas"]["LoRALyCORISConfig"] | components["schemas"]["LoRADiffusersConfig"] | components["schemas"]["T5EncoderConfig"] | components["schemas"]["TextualInversionFileConfig"] | components["schemas"]["TextualInversionFolderConfig"] | components["schemas"]["IPAdapterInvokeAIConfig"] | components["schemas"]["IPAdapterCheckpointConfig"] | components["schemas"]["T2IAdapterConfig"] | components["schemas"]["SpandrelImageToImageConfig"] | components["schemas"]["CLIPVisionDiffusersConfig"];
         };
       };
       /** @description Bad request */