Skip to content

Commit

Permalink
Merge pull request #16030 from AUTOMATIC1111/sd3
Browse files · Browse the repository at this point in the history
Stable Diffusion 3 support
  • Loading branch information
AUTOMATIC1111 committed Jul 6, 2024
2 parents a30b19d + 9e404c3 commit c02e3a5
Show file tree
Hide file tree
Showing 24 changed files with 2,058 additions and 72 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ For the purposes of getting Google and other search engines to crawl the wiki, h
## Credits
Licenses for borrowed code can be found in `Settings -> Licenses` screen, and also in `html/licenses.html` file.

- Stable Diffusion - https://github.com/Stability-AI/stablediffusion, https://github.com/CompVis/taming-transformers
- Stable Diffusion - https://github.com/Stability-AI/stablediffusion, https://github.com/CompVis/taming-transformers, https://github.com/mcmonkey4eva/sd3-ref
- k-diffusion - https://github.com/crowsonkb/k-diffusion.git
- Spandrel - https://github.com/chaiNNer-org/spandrel implementing
- GFPGAN - https://github.com/TencentARC/GFPGAN.git
Expand Down
5 changes: 5 additions & 0 deletions configs/sd3-inference.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
model:
target: modules.models.sd3.sd3_model.SD3Inferencer
params:
shift: 3
state_dict: null
4 changes: 3 additions & 1 deletion extensions-builtin/Lora/networks.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,9 @@ def assign_network_names_to_compvis_modules(sd_model):
network_layer_mapping[network_name] = module
module.network_layer_name = network_name
else:
for name, module in shared.sd_model.cond_stage_model.wrapped.named_modules():
cond_stage_model = getattr(shared.sd_model.cond_stage_model, 'wrapped', shared.sd_model.cond_stage_model)

for name, module in cond_stage_model.named_modules():
network_name = name.replace(".", "_")
network_layer_mapping[network_name] = module
module.network_layer_name = network_name
Expand Down
2 changes: 1 addition & 1 deletion modules/deepbooru.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def tag_multi(self, pil_image, force_disable_ranks=False):
a = np.expand_dims(np.array(pic, dtype=np.float32), 0) / 255

with torch.no_grad(), devices.autocast():
x = torch.from_numpy(a).to(devices.device)
x = torch.from_numpy(a).to(devices.device, devices.dtype)
y = self.model(x)[0].detach().cpu().numpy()

probability_dict = {}
Expand Down
28 changes: 23 additions & 5 deletions modules/lowvram.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
from collections import namedtuple

import torch
from modules import devices, shared

module_in_gpu = None
cpu = torch.device("cpu")

ModuleWithParent = namedtuple('ModuleWithParent', ['module', 'parent'], defaults=['None'])

def send_everything_to_cpu():
global module_in_gpu
Expand Down Expand Up @@ -75,13 +78,14 @@ def first_stage_model_decode_wrap(z):
(sd_model, 'depth_model'),
(sd_model, 'embedder'),
(sd_model, 'model'),
(sd_model, 'embedder'),
]

is_sdxl = hasattr(sd_model, 'conditioner')
is_sd2 = not is_sdxl and hasattr(sd_model.cond_stage_model, 'model')

if is_sdxl:
if hasattr(sd_model, 'medvram_fields'):
to_remain_in_cpu = sd_model.medvram_fields()
elif is_sdxl:
to_remain_in_cpu.append((sd_model, 'conditioner'))
elif is_sd2:
to_remain_in_cpu.append((sd_model.cond_stage_model, 'model'))
Expand All @@ -103,7 +107,21 @@ def first_stage_model_decode_wrap(z):
setattr(obj, field, module)

# register hooks for the first three models
if is_sdxl:
if hasattr(sd_model, "cond_stage_model") and hasattr(sd_model.cond_stage_model, "medvram_modules"):
for module in sd_model.cond_stage_model.medvram_modules():
if isinstance(module, ModuleWithParent):
parent = module.parent
module = module.module
else:
parent = None

if module:
module.register_forward_pre_hook(send_me_to_gpu)

if parent:
parents[module] = parent

elif is_sdxl:
sd_model.conditioner.register_forward_pre_hook(send_me_to_gpu)
elif is_sd2:
sd_model.cond_stage_model.model.register_forward_pre_hook(send_me_to_gpu)
Expand All @@ -117,9 +135,9 @@ def first_stage_model_decode_wrap(z):
sd_model.first_stage_model.register_forward_pre_hook(send_me_to_gpu)
sd_model.first_stage_model.encode = first_stage_model_encode_wrap
sd_model.first_stage_model.decode = first_stage_model_decode_wrap
if sd_model.depth_model:
if getattr(sd_model, 'depth_model', None) is not None:
sd_model.depth_model.register_forward_pre_hook(send_me_to_gpu)
if sd_model.embedder:
if getattr(sd_model, 'embedder', None) is not None:
sd_model.embedder.register_forward_pre_hook(send_me_to_gpu)

if use_medvram:
Expand Down
Loading

0 comments on commit c02e3a5

Please sign in to comment.