From c7562dd6c0ac9ff0ecf165a0b2e4d35428738307 Mon Sep 17 00:00:00 2001
From: psychedelicious <4822129+psychedelicious@users.noreply.github.com>
Date: Thu, 27 Jun 2024 19:15:23 +1000
Subject: [PATCH 1/2] fix(backend): mps should not use `non_blocking`

We can get black outputs when moving tensors from CPU to MPS. It appears
MPS to CPU is fine. See:
- https://github.com/pytorch/pytorch/issues/107455
- https://discuss.pytorch.org/t/should-we-set-non-blocking-to-true/38234/28

Changes:
- Add properties for each device on `TorchDevice` as a convenience.
- Add `get_non_blocking` static method on `TorchDevice`. This utility takes
  a torch device and returns the flag to be used for non_blocking when
  moving a tensor to the device provided.
- Update model patching and caching APIs to use this new utility.

Fixes: #6545
---
 invokeai/backend/lora.py                      |  3 ++-
 .../load/model_cache/model_cache_default.py   |  4 ++--
 invokeai/backend/model_patcher.py             | 11 ++++++-----
 invokeai/backend/util/devices.py              | 16 ++++++++++++++++
 4 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/invokeai/backend/lora.py b/invokeai/backend/lora.py
index f7c3863a6ad..8d17de08372 100644
--- a/invokeai/backend/lora.py
+++ b/invokeai/backend/lora.py
@@ -10,6 +10,7 @@ from typing_extensions import Self
 
 from invokeai.backend.model_manager import BaseModelType
+from invokeai.backend.util.devices import TorchDevice
 
 from .raw_model import RawModel
 
@@ -521,7 +522,7 @@ def from_checkpoint(
             # lower memory consumption by removing already parsed layer values
             state_dict[layer_key].clear()
 
-            layer.to(device=device, dtype=dtype, non_blocking=True)
+            layer.to(device=device, dtype=dtype, non_blocking=TorchDevice.get_non_blocking(device))
             model.layers[layer_key] = layer
 
         return model
diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py
index d48e45426e3..7331654dc18 100644
--- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py
+++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py
@@ -285,9 +285,9 @@ def move_model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device
                 else:
                     new_dict: Dict[str, torch.Tensor] = {}
                     for k, v in cache_entry.state_dict.items():
-                        new_dict[k] = v.to(torch.device(target_device), copy=True, non_blocking=True)
+                        new_dict[k] = v.to(target_device, copy=True, non_blocking=TorchDevice.get_non_blocking(target_device))
                     cache_entry.model.load_state_dict(new_dict, assign=True)
-            cache_entry.model.to(target_device, non_blocking=True)
+            cache_entry.model.to(target_device, non_blocking=TorchDevice.get_non_blocking(target_device))
             cache_entry.device = target_device
         except Exception as e:  # blow away cache entry
             self._delete_cache_entry(cache_entry)
diff --git a/invokeai/backend/model_patcher.py b/invokeai/backend/model_patcher.py
index fdc79539ae7..993d96784a9 100644
--- a/invokeai/backend/model_patcher.py
+++ b/invokeai/backend/model_patcher.py
@@ -16,6 +16,7 @@ from invokeai.backend.model_manager import AnyModel
 from invokeai.backend.model_manager.load.optimizations import skip_torch_weight_init
 from invokeai.backend.onnx.onnx_runtime import IAIOnnxRuntimeModel
+from invokeai.backend.util.devices import TorchDevice
 
 from .lora import LoRAModelRaw
 from .textual_inversion import TextualInversionManager, TextualInversionModelRaw
@@ -139,12 +140,12 @@ def apply_lora(
                     # We intentionally move to the target device first, then cast. Experimentally, this was found to
                     # be significantly faster for 16-bit CPU tensors being moved to a CUDA device than doing the
                     # same thing in a single call to '.to(...)'.
-                    layer.to(device=device, non_blocking=True)
-                    layer.to(dtype=torch.float32, non_blocking=True)
+                    layer.to(device=device, non_blocking=TorchDevice.get_non_blocking(device))
+                    layer.to(dtype=torch.float32, non_blocking=TorchDevice.get_non_blocking(device))
                     # TODO(ryand): Using torch.autocast(...) over explicit casting may offer a speed benefit on CUDA
                     # devices here. Experimentally, it was found to be very slow on CPU. More investigation needed.
                     layer_weight = layer.get_weight(module.weight) * (lora_weight * layer_scale)
-                    layer.to(device=torch.device("cpu"), non_blocking=True)
+                    layer.to(device=TorchDevice.CPU_DEVICE, non_blocking=TorchDevice.get_non_blocking(TorchDevice.CPU_DEVICE))
 
                     assert isinstance(layer_weight, torch.Tensor)  # mypy thinks layer_weight is a float|Any ??!
                     if module.weight.shape != layer_weight.shape:
@@ -153,7 +154,7 @@ def apply_lora(
                         layer_weight = layer_weight.reshape(module.weight.shape)
 
                     assert isinstance(layer_weight, torch.Tensor)  # mypy thinks layer_weight is a float|Any ??!
-                    module.weight += layer_weight.to(dtype=dtype, non_blocking=True)
+                    module.weight += layer_weight.to(dtype=dtype, non_blocking=TorchDevice.get_non_blocking(device))
 
             yield  # wait for context manager exit
 
@@ -161,7 +162,7 @@ def apply_lora(
             assert hasattr(model, "get_submodule")  # mypy not picking up fact that torch.nn.Module has get_submodule()
             with torch.no_grad():
                 for module_key, weight in original_weights.items():
-                    model.get_submodule(module_key).weight.copy_(weight, non_blocking=True)
+                    model.get_submodule(module_key).weight.copy_(weight, non_blocking=TorchDevice.get_non_blocking(weight.device))
 
     @classmethod
     @contextmanager
diff --git a/invokeai/backend/util/devices.py b/invokeai/backend/util/devices.py
index e8380dc8bcd..1cba70c6626 100644
--- a/invokeai/backend/util/devices.py
+++ b/invokeai/backend/util/devices.py
@@ -42,6 +42,10 @@ def torch_dtype(device: torch.device) -> torch.dtype:
 class TorchDevice:
     """Abstraction layer for torch devices."""
 
+    CPU_DEVICE = torch.device("cpu")
+    CUDA_DEVICE = torch.device("cuda")
+    MPS_DEVICE = torch.device("mps")
+
     @classmethod
     def choose_torch_device(cls) -> torch.device:
         """Return the torch.device to use for accelerated inference."""
@@ -108,3 +112,15 @@ def empty_cache(cls) -> None:
     @classmethod
     def _to_dtype(cls, precision_name: TorchPrecisionNames) -> torch.dtype:
         return NAME_TO_PRECISION[precision_name]
+
+    @staticmethod
+    def get_non_blocking(to_device: torch.device) -> bool:
+        """Return the non_blocking flag to be used when moving a tensor to a given device.
+        MPS may have unexpected errors with non-blocking operations - we should not use non-blocking when moving _to_ MPS.
+        When moving _from_ MPS, we can use non-blocking operations.
+
+        See:
+        - https://github.com/pytorch/pytorch/issues/107455
+        - https://discuss.pytorch.org/t/should-we-set-non-blocking-to-true/38234/28
+        """
+        return False if to_device.type == "mps" else True

From 14775cc9c4937f557943022d480cfb01f6e21d15 Mon Sep 17 00:00:00 2001
From: Ryan Dick
Date: Thu, 27 Jun 2024 09:45:13 -0400
Subject: [PATCH 2/2] ruff format

---
 .../load/model_cache/model_cache_default.py | 4 +++-
 invokeai/backend/model_patcher.py           | 9 +++++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py
index 7331654dc18..697d3daf9b9 100644
--- a/invokeai/backend/model_manager/load/model_cache/model_cache_default.py
+++ b/invokeai/backend/model_manager/load/model_cache/model_cache_default.py
@@ -285,7 +285,9 @@ def move_model_to_device(self, cache_entry: CacheRecord[AnyModel], target_device
                 else:
                     new_dict: Dict[str, torch.Tensor] = {}
                     for k, v in cache_entry.state_dict.items():
-                        new_dict[k] = v.to(target_device, copy=True, non_blocking=TorchDevice.get_non_blocking(target_device))
+                        new_dict[k] = v.to(
+                            target_device, copy=True, non_blocking=TorchDevice.get_non_blocking(target_device)
+                        )
                     cache_entry.model.load_state_dict(new_dict, assign=True)
             cache_entry.model.to(target_device, non_blocking=TorchDevice.get_non_blocking(target_device))
             cache_entry.device = target_device
diff --git a/invokeai/backend/model_patcher.py b/invokeai/backend/model_patcher.py
index 993d96784a9..051d1142769 100644
--- a/invokeai/backend/model_patcher.py
+++ b/invokeai/backend/model_patcher.py
@@ -145,7 +145,10 @@ def apply_lora(
                     # TODO(ryand): Using torch.autocast(...) over explicit casting may offer a speed benefit on CUDA
                     # devices here. Experimentally, it was found to be very slow on CPU. More investigation needed.
                     layer_weight = layer.get_weight(module.weight) * (lora_weight * layer_scale)
-                    layer.to(device=TorchDevice.CPU_DEVICE, non_blocking=TorchDevice.get_non_blocking(TorchDevice.CPU_DEVICE))
+                    layer.to(
+                        device=TorchDevice.CPU_DEVICE,
+                        non_blocking=TorchDevice.get_non_blocking(TorchDevice.CPU_DEVICE),
+                    )
 
                     assert isinstance(layer_weight, torch.Tensor)  # mypy thinks layer_weight is a float|Any ??!
                     if module.weight.shape != layer_weight.shape:
@@ -162,7 +165,9 @@ def apply_lora(
             assert hasattr(model, "get_submodule")  # mypy not picking up fact that torch.nn.Module has get_submodule()
             with torch.no_grad():
                 for module_key, weight in original_weights.items():
-                    model.get_submodule(module_key).weight.copy_(weight, non_blocking=TorchDevice.get_non_blocking(weight.device))
+                    model.get_submodule(module_key).weight.copy_(
+                        weight, non_blocking=TorchDevice.get_non_blocking(weight.device)
+                    )
 
     @classmethod
     @contextmanager
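Usage sketch (not part of either patch above): a minimal example of how the new
`TorchDevice.get_non_blocking` helper is meant to be called when moving a tensor, per
the docstring added in PATCH 1/2. It assumes the patched `invokeai.backend.util.devices`
module is importable; the tensor shape and variable names are illustrative only.

    import torch

    from invokeai.backend.util.devices import TorchDevice

    # Pick the accelerator (CUDA, MPS, or CPU) the same way the backend does.
    device = TorchDevice.choose_torch_device()

    x = torch.zeros(4, 4)
    # get_non_blocking returns False when the target device is MPS and True otherwise,
    # so transfers to MPS stay synchronous and avoid the black-output issue.
    x = x.to(device, non_blocking=TorchDevice.get_non_blocking(device))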