Redo custom attention processor to support other attention types #6550

Status: Open. Wants to merge 27 commits into base: main.

Commits (27)
- cd2dccf: Redo attention processor to support other attention types (StAlKeR7779, Jun 27, 2024)
- 9f40c2d: Remove xformers and normal attention (StAlKeR7779, Jul 27, 2024)
- 1ab8276: Fix file name (StAlKeR7779, Jul 27, 2024)
- d430e4c: Merge branch 'main' into stalker7779/new_attention_processor (StAlKeR7779, Jul 27, 2024)
- 89c37c3: Sync fixes (StAlKeR7779, Jul 27, 2024)
- e9cc750: Update app config (StAlKeR7779, Jul 28, 2024)
- 4b6d613: Remove remaining references to xformers (StAlKeR7779, Jul 29, 2024)
- d5fa938: Run api regen (StAlKeR7779, Jul 30, 2024)
- 5a9cc04: Small rearrangement (StAlKeR7779, Aug 1, 2024)
- be84746: Add assert check (StAlKeR7779, Aug 1, 2024)
- bf2f798: Fix bad generation on slice_size not factor of heads count (StAlKeR7779, Aug 2, 2024)
- 91cc89a: Use invoke slice_size values, to have less confusion (StAlKeR7779, Aug 2, 2024)
- 719daeb: Add torch-sdp scale parameter support (added in torch 2.1) (StAlKeR7779, Aug 2, 2024)
- a16fa31: Test implementation of sliced attention using torch-sdp (StAlKeR7779, Aug 2, 2024)
- 7ffceaa: Fix slice_size handling (StAlKeR7779, Aug 2, 2024)
- c7e7103: Revert "Fix bad generation on slice_size not factor of heads count" (StAlKeR7779, Aug 3, 2024)
- 302dc9f: Return normal attention, change slicing logic, remove old attention code (StAlKeR7779, Aug 3, 2024)
- 18fc36d: Suggested changes (StAlKeR7779, Aug 4, 2024)
- 6bad046: Merge branch 'main' into stalker7779/new_attention_processor (StAlKeR7779, Aug 4, 2024)
- f44e0cd: Update config docstring (StAlKeR7779, Aug 4, 2024)
- 9618b6e: Suggested changes (StAlKeR7779, Aug 6, 2024)
- 09aef43: Restore xformers (StAlKeR7779, Aug 7, 2024)
- 37dfab7: Small fixes (StAlKeR7779, Aug 7, 2024)
- 192fba4: Rewrite sliced attention, more optimizations (batched torch-sdp for ol… (StAlKeR7779, Aug 19, 2024)
- 0b1ff8f: Remove redundant alignment in batched torch-sdp execution, add comments (StAlKeR7779, Aug 19, 2024)
- 3d19cac: Suggested changes (StAlKeR7779, Aug 20, 2024)
- b947129: Edit comments (StAlKeR7779, Aug 20, 2024)
7 changes: 6 additions & 1 deletion docker/Dockerfile
@@ -43,7 +43,12 @@ RUN --mount=type=cache,target=/root/.cache/pip \
extra_index_url_arg="--extra-index-url https://download.pytorch.org/whl/cu121"; \
fi &&\

pip install $extra_index_url_arg -e ".";
# xformers + triton fails to install on arm64
if [ "$GPU_DRIVER" = "cuda" ] && [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
pip install $extra_index_url_arg -e ".[xformers]"; \
else \
pip install $extra_index_url_arg -e "."; \
fi

# #### Build the Web UI ------------------------------------

8 changes: 8 additions & 0 deletions docs/installation/020_INSTALL_MANUAL.md
@@ -87,6 +87,14 @@ Before you start, go through the [installation requirements](./INSTALL_REQUIREME
pip install InvokeAI --use-pep517 --extra-index-url https://download.pytorch.org/whl/cu121
```

- If you have a CUDA GPU and want to install with `xformers`, add the `xformers` extra to the package name. Note that `xformers` is not required: PyTorch ships its own implementation of the SDP attention algorithm with comparable performance.

!!! example "Install with `xformers`"

```bash
pip install "InvokeAI[xformers]" --use-pep517
```

1. Deactivate and reactivate your runtime directory so that the invokeai-specific commands become available in the environment:

=== "Linux/macOS"
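As a quick aside on the docs change above: if you want to confirm that PyTorch's built-in SDP attention is available before skipping the `xformers` extra, a minimal check along these lines should work (illustrative only, not part of this PR):

```python
import torch

# scaled_dot_product_attention ships with PyTorch 2.0+, so xformers is optional.
print("torch:", torch.__version__)
print("has SDP attention:", hasattr(torch.nn.functional, "scaled_dot_product_attention"))

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    # PyTorch's flash-attention SDP kernels require compute capability sm80 or newer.
    print(f"compute capability: sm{major}{minor}, flash-capable: {major >= 8}")
```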
2 changes: 1 addition & 1 deletion flake.nix
@@ -84,7 +84,7 @@
in
{
devShells.${system} = rec {
develop = mkShell { dir = "venv"; install = "-e '.' --extra-index-url https://download.pytorch.org/whl/cu118"; };
develop = mkShell { dir = "venv"; install = "-e '.[xformers]' --extra-index-url https://download.pytorch.org/whl/cu118"; };
default = develop;
};
};
Expand Down
4 changes: 2 additions & 2 deletions installer/lib/installer.py
@@ -418,11 +418,11 @@ def get_torch_source() -> Tuple[str | None, str | None]:
url = "https://download.pytorch.org/whl/cpu"
elif device.value == "cuda":
# CUDA uses the default PyPi index
optional_modules = "[onnx-cuda]"
optional_modules = "[xformers,onnx-cuda]"
elif OS == "Windows":
if device.value == "cuda":
url = "https://download.pytorch.org/whl/cu121"
optional_modules = "[onnx-cuda]"
optional_modules = "[xformers,onnx-cuda]"
elif device.value == "cpu":
# CPU uses the default PyPi index, no optional modules
pass
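For clarity, a hedged sketch of how the `optional_modules` extras and index URL chosen in `get_torch_source()` above end up shaping the install command (the helper below is hypothetical, not the installer's actual code):

```python
# Hypothetical helper, for illustration only: combine the package name, the
# optional extras (e.g. "[xformers,onnx-cuda]"), and an optional extra index URL
# into pip arguments, mirroring how the installer consumes get_torch_source().
def build_pip_args(package: str, optional_modules: str | None, index_url: str | None) -> list[str]:
    args = ["install", "--use-pep517", f"{package}{optional_modules or ''}"]
    if index_url is not None:
        args += ["--extra-index-url", index_url]
    return args


print(build_pip_args("InvokeAI", "[xformers,onnx-cuda]", None))
# ['install', '--use-pep517', 'InvokeAI[xformers,onnx-cuda]']
```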
8 changes: 7 additions & 1 deletion invokeai/app/api/routers/app_info.py
@@ -1,6 +1,6 @@
import typing
from enum import Enum
from importlib.metadata import version
from importlib.metadata import PackageNotFoundError, version
from pathlib import Path
from platform import python_version
from typing import Optional
@@ -56,6 +56,7 @@ class AppDependencyVersions(BaseModel):
torch: str = Field(description="PyTorch version")
torchvision: str = Field(description="PyTorch Vision version")
transformers: str = Field(description="transformers version")
xformers: Optional[str] = Field(description="xformers version")


class AppConfig(BaseModel):
@@ -74,6 +75,10 @@ async def get_version() -> AppVersion:

@app_router.get("/app_deps", operation_id="get_app_deps", status_code=200, response_model=AppDependencyVersions)
async def get_app_deps() -> AppDependencyVersions:
try:
xformers = version("xformers")
except PackageNotFoundError:
xformers = None
return AppDependencyVersions(
accelerate=version("accelerate"),
compel=version("compel"),
@@ -87,6 +92,7 @@ async def get_app_deps() -> AppDependencyVersions:
torch=torch.version.__version__,
torchvision=version("torchvision"),
transformers=version("transformers"),
xformers=xformers,
)


6 changes: 3 additions & 3 deletions invokeai/app/services/config/config_default.py
@@ -28,7 +28,7 @@
DEFAULT_VRAM_CACHE = 0.25
DEVICE = Literal["auto", "cpu", "cuda", "cuda:1", "mps"]
PRECISION = Literal["auto", "float16", "bfloat16", "float32"]
ATTENTION_TYPE = Literal["auto", "normal", "torch-sdp"]
ATTENTION_TYPE = Literal["auto", "normal", "xformers", "torch-sdp"]
ATTENTION_SLICE_SIZE = Literal["auto", "none", "balanced", "max", 1, 2, 3, 4, 5, 6, 7, 8]
LOG_FORMAT = Literal["plain", "color", "syslog", "legacy"]
LOG_LEVEL = Literal["debug", "info", "warning", "error", "critical"]
@@ -449,8 +449,8 @@ def migrate_v4_0_2_to_4_0_3_config_dict(config_dict: dict[str, Any]) -> dict[str
if attention_type != "sliced" and "attention_slice_size" in parsed_config_dict:
del parsed_config_dict["attention_slice_size"]

# xformers attention removed, sliced moved to attention_slice_size
if attention_type in ["sliced", "xformers"]:
# sliced moved to attention_slice_size
if attention_type == "sliced":
parsed_config_dict["attention_type"] = "auto"

parsed_config_dict["schema_version"] = "4.0.3"
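To make the 4.0.2 → 4.0.3 migration above easy to follow, here is a minimal standalone sketch of just the attention-related fields (a toy function, not the project's actual `migrate_v4_0_2_to_4_0_3_config_dict`, which handles more than this):

```python
from typing import Any


def migrate_attention_fields(config: dict[str, Any]) -> dict[str, Any]:
    """Toy restatement of the hunk above: 'sliced' is no longer its own attention type."""
    migrated = dict(config)
    attention_type = migrated.get("attention_type")

    # attention_slice_size is only meaningful for the legacy "sliced" type.
    if attention_type != "sliced" and "attention_slice_size" in migrated:
        del migrated["attention_slice_size"]

    # Slicing now lives in attention_slice_size, so "sliced" falls back to "auto".
    if attention_type == "sliced":
        migrated["attention_type"] = "auto"

    migrated["schema_version"] = "4.0.3"
    return migrated


print(migrate_attention_fields({"attention_type": "sliced", "attention_slice_size": 4}))
# {'attention_type': 'auto', 'attention_slice_size': 4, 'schema_version': '4.0.3'}
```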
@@ -6,13 +6,20 @@
import torch
import torch.nn.functional as F
from diffusers.models.attention_processor import Attention
from diffusers.utils.import_utils import is_xformers_available

from invokeai.app.services.config.config_default import get_config
from invokeai.backend.ip_adapter.ip_attention_weights import IPAttentionProcessorWeights
from invokeai.backend.stable_diffusion.diffusion.regional_ip_data import RegionalIPData
from invokeai.backend.stable_diffusion.diffusion.regional_prompt_data import RegionalPromptData
from invokeai.backend.util.devices import TorchDevice

if is_xformers_available():
import xformers
import xformers.ops
else:
xformers = None


@dataclass
class IPAdapterAttentionWeights:
@@ -23,7 +30,9 @@ class IPAdapterAttentionWeights:
class CustomAttnProcessor:
"""A custom implementation of attention processor that supports additional Invoke features.
This implementation is based on
AttnProcessor (https://github.com/huggingface/diffusers/blob/fcfa270fbd1dc294e2f3a505bae6bcb791d721c3/src/diffusers/models/attention_processor.py#L732)
SlicedAttnProcessor (https://github.com/huggingface/diffusers/blob/fcfa270fbd1dc294e2f3a505bae6bcb791d721c3/src/diffusers/models/attention_processor.py#L1616)
XFormersAttnProcessor (https://github.com/huggingface/diffusers/blob/fcfa270fbd1dc294e2f3a505bae6bcb791d721c3/src/diffusers/models/attention_processor.py#L1113)
AttnProcessor2_0 (https://github.com/huggingface/diffusers/blob/fcfa270fbd1dc294e2f3a505bae6bcb791d721c3/src/diffusers/models/attention_processor.py#L1204)
Supported custom features:
- IP-Adapter
@@ -53,6 +62,9 @@ def __init__(
if self.slice_size == "auto":
self.slice_size = self._select_slice_size()

if self.attention_type == "xformers" and xformers is None:
raise ImportError("xformers attention requires xformers module to be installed.")

def _select_attention_type(self) -> str:
device = TorchDevice.choose_torch_device()
# On some MPS systems normal attention is still faster than torch-sdp; on others it is on par
@@ -61,7 +73,14 @@ def _select_attention_type(self) -> str:
# Adreitz: 260.868s vs 226.638s
if device.type == "mps":
return "normal"
else: # cuda, cpu
elif device.type == "cuda":
# Flash Attention is supported from sm80 compute capability onwards in PyTorch
# https://pytorch.org/blog/accelerated-pytorch-2/
if torch.cuda.get_device_capability("cuda")[0] < 8 and xformers is not None:
return "xformers"
else:
return "torch-sdp"
else: # cpu
return "torch-sdp"

def _select_slice_size(self) -> str:
@@ -262,6 +281,8 @@ def run_attention(
attn_call = self.run_attention_sdp
elif self.attention_type == "normal":
attn_call = self.run_attention_normal
elif self.attention_type == "xformers":
attn_call = self.run_attention_xformers
else:
raise Exception(f"Unknown attention type: {self.attention_type}")

@@ -291,6 +312,35 @@ def run_attention_normal(

return hidden_states

def run_attention_xformers(
self,
attn: Attention,
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
attention_mask: Optional[torch.Tensor],
) -> torch.Tensor:
query = attn.head_to_batch_dim(query).contiguous()
key = attn.head_to_batch_dim(key).contiguous()
value = attn.head_to_batch_dim(value).contiguous()

if attention_mask is not None:
# expand our mask's singleton query_length dimension:
# [batch*heads, 1, key_length] ->
# [batch*heads, query_length, key_length]
# so that it can be added as a bias onto the attention scores that xformers computes:
# [batch*heads, query_length, key_length]
# we do this explicitly because xformers doesn't broadcast the singleton dimension for us.
attention_mask = attention_mask.expand(-1, query.shape[1], -1)

hidden_states = xformers.ops.memory_efficient_attention(
query, key, value, attn_bias=attention_mask, op=None, scale=attn.scale
)
hidden_states = hidden_states.to(query.dtype)
hidden_states = attn.batch_to_head_dim(hidden_states)

return hidden_states

def run_attention_sdp(
self,
attn: Attention,
@@ -355,6 +405,10 @@ def run_attention_sliced(
attn_slice = attn.get_attention_scores(query_slice, key_slice, attn_mask_slice)
torch.bmm(attn_slice, value_slice, out=hidden_states[start_idx:end_idx])
del attn_slice
elif self.attention_type == "xformers":
hidden_states[start_idx:end_idx] = xformers.ops.memory_efficient_attention(
query_slice, key_slice, value_slice, attn_bias=attn_mask_slice, op=None, scale=attn.scale
)
elif self.attention_type == "torch-sdp":
if attn_mask_slice is not None:
attn_mask_slice = attn_mask_slice.unsqueeze(0)
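For reviewers, a condensed standalone restatement of the `_select_attention_type` logic added above (a sketch only; the real method lives on `CustomAttnProcessor` and also interacts with the app config and slice-size selection):

```python
import torch


def pick_attention_backend(device: torch.device, xformers_installed: bool) -> str:
    """Sketch of the auto-selection rules shown in the diff above."""
    if device.type == "mps":
        # On some MPS systems normal attention is still faster than torch-sdp.
        return "normal"
    if device.type == "cuda":
        # PyTorch's flash-attention kernels need compute capability sm80+,
        # so older CUDA GPUs prefer xformers when it is installed.
        if torch.cuda.get_device_capability(device)[0] < 8 and xformers_installed:
            return "xformers"
        return "torch-sdp"
    # CPU (and any other device type) uses torch-sdp.
    return "torch-sdp"
```

Called on an RTX 20-series card (sm75) with xformers installed, this returns "xformers"; on an sm80+ card it returns "torch-sdp".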
46 changes: 46 additions & 0 deletions invokeai/backend/util/hotfixes.py
@@ -791,3 +791,49 @@ def new_LoRACompatibleConv_forward(self, hidden_states, scale: float = 1.0):


diffusers.models.lora.LoRACompatibleConv.forward = new_LoRACompatibleConv_forward

try:
import xformers

xformers_available = True
except Exception:
xformers_available = False


if xformers_available:
# TODO: remove when fixed in diffusers
_xformers_memory_efficient_attention = xformers.ops.memory_efficient_attention

def new_memory_efficient_attention(
query: torch.Tensor,
key: torch.Tensor,
value: torch.Tensor,
attn_bias=None,
p: float = 0.0,
scale: Optional[float] = None,
*,
op=None,
):
# diffusers does not align the attn_bias shape to a multiple of 8, which xformers requires
if attn_bias is not None and type(attn_bias) is torch.Tensor:
orig_size = attn_bias.shape[-1]
new_size = ((orig_size + 7) // 8) * 8
aligned_attn_bias = torch.zeros(
(attn_bias.shape[0], attn_bias.shape[1], new_size),
device=attn_bias.device,
dtype=attn_bias.dtype,
)
aligned_attn_bias[:, :, :orig_size] = attn_bias
attn_bias = aligned_attn_bias[:, :, :orig_size]

return _xformers_memory_efficient_attention(
query=query,
key=key,
value=value,
attn_bias=attn_bias,
p=p,
scale=scale,
op=op,
)

xformers.ops.memory_efficient_attention = new_memory_efficient_attention
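A short note on the hotfix above: slicing the zero-padded buffer back to `orig_size` keeps the logical shape that diffusers passes in, while the underlying storage stays padded, so the stride along the query dimension becomes a multiple of 8, which is the alignment xformers actually checks. A small illustration (assumed sizes, not project code):

```python
import torch

# Assumed sizes, purely to illustrate the effect of the hotfix above.
batch_heads, query_len, key_len = 16, 77, 77      # 77 is not a multiple of 8
aligned_len = ((key_len + 7) // 8) * 8            # -> 80

bias = torch.zeros(batch_heads, query_len, key_len)
aligned = torch.zeros(batch_heads, query_len, aligned_len)
aligned[:, :, :key_len] = bias
view = aligned[:, :, :key_len]

print(bias.shape, bias.stride())   # torch.Size([16, 77, 77]) (5929, 77, 1)
print(view.shape, view.stride())   # torch.Size([16, 77, 77]) (6160, 80, 1)
# Same logical shape, but the view's middle-dimension stride (80) is 8-aligned,
# which is what xformers' memory_efficient_attention expects of attn_bias.
```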
5 changes: 5 additions & 0 deletions invokeai/frontend/web/src/services/api/schema.ts
@@ -725,6 +725,11 @@ export type components = {
* @description transformers version
*/
transformers: string;
/**
* Xformers
* @description xformers version
*/
xformers: string | null;
};
/**
* AppVersion
12 changes: 12 additions & 0 deletions invokeai/version/__init__.py
@@ -6,3 +6,15 @@

__app_id__ = "invoke-ai/InvokeAI"
__app_name__ = "InvokeAI"


def _ignore_xformers_triton_message_on_windows():
import logging

logging.getLogger("xformers").addFilter(
lambda record: "A matching Triton is not available" not in record.getMessage()
)


# In order to be effective, this needs to happen before anything could possibly import xformers.
_ignore_xformers_triton_message_on_windows()
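For context on the filter added above, a minimal demonstration of the pattern (the log messages below are made up): a logger-level filter drops only records containing the Triton warning and lets everything else through.

```python
import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("xformers")
logger.addFilter(lambda record: "A matching Triton is not available" not in record.getMessage())

logger.warning("A matching Triton is not available, skipping some optimizations")  # suppressed
logger.warning("some other xformers warning")                                      # still printed
```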
6 changes: 6 additions & 0 deletions pyproject.toml
@@ -94,6 +94,12 @@ dependencies = [
]

[project.optional-dependencies]
"xformers" = [
# Core generation dependencies, pinned for reproducible builds.
"xformers==0.0.25post1; sys_platform!='darwin'",
# Auxiliary dependencies, pinned only if necessary.
"triton; sys_platform=='linux'",
]
"onnx" = ["onnxruntime"]
"onnx-cuda" = ["onnxruntime-gpu"]
"onnx-directml" = ["onnxruntime-directml"]
3 changes: 3 additions & 0 deletions scripts/invokeai-web.py
@@ -2,10 +2,13 @@

# Copyright (c) 2022 Kyle Schouviller (https://github.com/kyle0654)

import logging
import os

from invokeai.app.run_app import run_app

logging.getLogger("xformers").addFilter(lambda record: "A matching Triton is not available" not in record.getMessage())


def main():
# Change working directory to the repo root