From 2d2c4f4efbc06a67d27286f09e5e1e302171efc5 Mon Sep 17 00:00:00 2001
From: akleine
Date: Mon, 28 Jul 2025 07:18:22 +0200
Subject: [PATCH 1/2] feat: add code for running SD1.x models with a TINY
 U-Net, just like bk-sdm-tiny

---
 model.cpp            |   8 +
 model.h              |   3 +-
 stable-diffusion.cpp |   1 +
 unet.hpp             | 391 ++++++++++++++++++++++++++++---------------
 4 files changed, 268 insertions(+), 135 deletions(-)

diff --git a/model.cpp b/model.cpp
index 9529cc58..3a8a7a19 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1644,8 +1644,13 @@ SDVersion ModelLoader::get_sd_version() {
     bool is_xl = false;
     bool is_flux = false;
 
+    bool maybe_unet_is_tiny = false;
+
 #define found_family (is_xl || is_flux)
     for (auto& tensor_storage : tensor_storages) {
+        if (tensor_storage.name == "model.diffusion_model.up_blocks.0.attentions.1.transformer_blocks.0.norm2.bias") {
+            maybe_unet_is_tiny = true;
+        }
         if (!found_family) {
             if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos) {
                 is_flux = true;
@@ -1722,6 +1727,9 @@ SDVersion ModelLoader::get_sd_version() {
         if (is_ip2p) {
             return VERSION_SD1_PIX2PIX;
         }
+        if (maybe_unet_is_tiny && tensor_storages.size() > 800 && tensor_storages.size() < 805) {
+            return VERSION_SD1_TINY_UNET;
+        }
         return VERSION_SD1;
     } else if (token_embedding_weight.ne[0] == 1024) {
         if (is_inpaint) {
diff --git a/model.h b/model.h
index ea716107..cd6e006b 100644
--- a/model.h
+++ b/model.h
@@ -22,6 +22,7 @@ enum SDVersion {
     VERSION_SD1,
     VERSION_SD1_INPAINT,
     VERSION_SD1_PIX2PIX,
+    VERSION_SD1_TINY_UNET,
     VERSION_SD2,
     VERSION_SD2_INPAINT,
     VERSION_SDXL,
@@ -49,7 +50,7 @@ static inline bool sd_version_is_sd3(SDVersion version) {
 }
 
 static inline bool sd_version_is_sd1(SDVersion version) {
-    if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX) {
+    if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET) {
         return true;
     }
     return false;
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 402585f1..54ca3c48 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -28,6 +28,7 @@ const char* model_version_to_str[] = {
     "SD 1.x",
     "SD 1.x Inpaint",
     "Instruct-Pix2Pix",
+    "SD 1.x tiny UNet",
     "SD 2.x",
     "SD 2.x Inpaint",
     "SDXL",
diff --git a/unet.hpp b/unet.hpp
index 9193dcd6..10e9513c 100644
--- a/unet.hpp
+++ b/unet.hpp
@@ -253,93 +253,159 @@ class UnetModelBlock : public GGMLBlock {
             }
         };
 
-        size_t len_mults = channel_mult.size();
-        for (int i = 0; i < len_mults; i++) {
-            int mult = channel_mult[i];
-            for (int j = 0; j < num_res_blocks; j++) {
-                input_block_idx += 1;
-                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
-                blocks[name] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, mult * model_channels));
-
-                ch = mult * model_channels;
-                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
-                    int n_head = num_heads;
-                    int d_head = ch / num_heads;
-                    if (num_head_channels != -1) {
-                        d_head = num_head_channels;
-                        n_head = ch / d_head;
-                    }
-                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
-                    blocks[name] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
-                                                                                  n_head,
-                                                                                  d_head,
-                                                                                  transformer_depth[i],
-                                                                                  context_dim));
-                }
-                input_block_chans.push_back(ch);
-            }
-            if (i != len_mults - 1) {
-                input_block_idx += 1;
-                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
-                blocks[name] = std::shared_ptr<GGMLBlock>(new DownSampleBlock(ch, ch));
-
-                input_block_chans.push_back(ch);
-                ds *= 2;
-            }
-        }
-
-        // middle blocks
-        int n_head = num_heads;
-        int d_head = ch / num_heads;
-        if (num_head_channels != -1) {
-            d_head = num_head_channels;
-            n_head = ch / d_head;
-        }
-        blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
-        blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
-                                                                                  n_head,
-                                                                                  d_head,
-                                                                                  transformer_depth[transformer_depth.size() - 1],
-                                                                                  context_dim));
-        blocks["middle_block.2"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
-
-        // output_blocks
-        int output_block_idx = 0;
-        for (int i = (int)len_mults - 1; i >= 0; i--) {
-            int mult = channel_mult[i];
-            for (int j = 0; j < num_res_blocks + 1; j++) {
-                int ich = input_block_chans.back();
-                input_block_chans.pop_back();
-
-                std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";
-                blocks[name] = std::shared_ptr<GGMLBlock>(get_resblock(ch + ich, time_embed_dim, mult * model_channels));
-
-                ch = mult * model_channels;
-                int up_sample_idx = 1;
-                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
-                    int n_head = num_heads;
-                    int d_head = ch / num_heads;
-                    if (num_head_channels != -1) {
-                        d_head = num_head_channels;
-                        n_head = ch / d_head;
-                    }
-                    std::string name = "output_blocks." + std::to_string(output_block_idx) + ".1";
-                    blocks[name] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, n_head, d_head, transformer_depth[i], context_dim));
-
-                    up_sample_idx++;
-                }
-
-                if (i > 0 && j == num_res_blocks) {
-                    std::string name = "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx);
-                    blocks[name] = std::shared_ptr<GGMLBlock>(new UpSampleBlock(ch, ch));
-
-                    ds /= 2;
-                }
-
-                output_block_idx += 1;
-            }
-        }
+        if (version == VERSION_SD1_TINY_UNET) {
+            int mult = 1;
+            blocks["input_blocks.1.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, mult * model_channels));
+            ch = mult * model_channels;
+            int d_head = ch / num_heads;
+            blocks["input_blocks.1.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, num_heads, d_head, 1, context_dim));
+            input_block_chans.push_back(ch);
+            blocks["input_blocks.3.0"] = std::shared_ptr<GGMLBlock>(new DownSampleBlock(ch, ch));
+            input_block_chans.push_back(ch);
+
+            mult = 2;
+            blocks["input_blocks.4.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, mult * model_channels));
+            ch = mult * model_channels;
+            d_head = ch / num_heads;
+            blocks["input_blocks.4.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, num_heads, d_head, 1, context_dim));
+            input_block_chans.push_back(ch);
+            blocks["input_blocks.6.0"] = std::shared_ptr<GGMLBlock>(new DownSampleBlock(ch, ch));
+            input_block_chans.push_back(ch);
+
+            mult = 4;
+            blocks["input_blocks.7.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, mult * model_channels));
+            ch = mult * model_channels;
+            d_head = ch / num_heads;
+            blocks["input_blocks.7.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, num_heads, d_head, 1, context_dim));
+            input_block_chans.push_back(ch);
+
+            // --- *no* mid blocks here ---
+
+            mult = 4;
+            int ich = input_block_chans.back();
+            input_block_chans.pop_back();
+            blocks["output_blocks.0.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch + ich, time_embed_dim, mult * model_channels));
+            ch = mult * model_channels;
+            d_head = ch / num_heads;
+            blocks["up_blocks.0.attentions.0"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, num_heads, d_head, 1, context_dim));
+            ich = input_block_chans.back();
+            input_block_chans.pop_back();
+            blocks["output_blocks.1.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch + ich, time_embed_dim, mult * model_channels));
+            blocks["up_blocks.0.attentions.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, num_heads, d_head, 1, context_dim));
+            blocks["output_blocks.2.1"] = std::shared_ptr<GGMLBlock>(new UpSampleBlock(ch, ch));
+
+            mult = 2;
+            ich = input_block_chans.back();
+            input_block_chans.pop_back();
+            blocks["output_blocks.3.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch + ich, time_embed_dim, mult * model_channels));
+            ch = mult * model_channels;
+            d_head = ch / num_heads;
+            blocks["output_blocks.3.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, num_heads, d_head, 1, context_dim));
+            ich = input_block_chans.back();
+            input_block_chans.pop_back();
+            blocks["output_blocks.4.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch + ich, time_embed_dim, mult * model_channels));
+            blocks["output_blocks.4.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, num_heads, d_head, 1, context_dim));
+            blocks["output_blocks.5.2"] = std::shared_ptr<GGMLBlock>(new UpSampleBlock(ch, ch));
+
+            mult = 1;
+            ich = input_block_chans.back();
+            input_block_chans.pop_back();
+            blocks["output_blocks.6.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch + ich, time_embed_dim, mult * model_channels));
+            ch = mult * model_channels;
+            d_head = ch / num_heads;
+            blocks["output_blocks.6.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, num_heads, d_head, 1, context_dim));
+            ich = input_block_chans.back();
+            input_block_chans.pop_back();
+            blocks["output_blocks.7.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch + ich, time_embed_dim, mult * model_channels));
+            blocks["output_blocks.7.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, num_heads, d_head, 1, context_dim));
+        } else {
+            size_t len_mults = channel_mult.size();
+            for (int i = 0; i < len_mults; i++) {
+                int mult = channel_mult[i];
+                for (int j = 0; j < num_res_blocks; j++) {
+                    input_block_idx += 1;
+                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
+                    blocks[name] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, mult * model_channels));
+
+                    ch = mult * model_channels;
+                    if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
+                        int n_head = num_heads;
+                        int d_head = ch / num_heads;
+                        if (num_head_channels != -1) {
+                            d_head = num_head_channels;
+                            n_head = ch / d_head;
+                        }
+                        std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
+                        blocks[name] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
+                                                                                      n_head,
+                                                                                      d_head,
+                                                                                      transformer_depth[i],
+                                                                                      context_dim));
+                    }
+                    input_block_chans.push_back(ch);
+                }
+                if (i != len_mults - 1) {
+                    input_block_idx += 1;
+                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
+                    blocks[name] = std::shared_ptr<GGMLBlock>(new DownSampleBlock(ch, ch));
+
+                    input_block_chans.push_back(ch);
+                    ds *= 2;
+                }
+            }
+
+            // middle blocks
+            int n_head = num_heads;
+            int d_head = ch / num_heads;
+            if (num_head_channels != -1) {
+                d_head = num_head_channels;
+                n_head = ch / d_head;
+            }
+            blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
+            blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
+                                                                                      n_head,
+                                                                                      d_head,
+                                                                                      transformer_depth[transformer_depth.size() - 1],
+                                                                                      context_dim));
+            blocks["middle_block.2"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
+
+            // output_blocks
+            int output_block_idx = 0;
+            for (int i = (int)len_mults - 1; i >= 0; i--) {
+                int mult = channel_mult[i];
+                for (int j = 0; j < num_res_blocks + 1; j++) {
+                    int ich = input_block_chans.back();
+                    input_block_chans.pop_back();
+
+                    std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";
+                    blocks[name] = std::shared_ptr<GGMLBlock>(get_resblock(ch + ich, time_embed_dim, mult * model_channels));
+
+                    ch = mult * model_channels;
+                    int up_sample_idx = 1;
+                    if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
+                        int n_head = num_heads;
+                        int d_head = ch / num_heads;
+                        if (num_head_channels != -1) {
+                            d_head = num_head_channels;
+                            n_head = ch / d_head;
+                        }
+                        std::string name = "output_blocks." + std::to_string(output_block_idx) + ".1";
+                        blocks[name] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, n_head, d_head, transformer_depth[i], context_dim));
+
+                        up_sample_idx++;
+                    }
+
+                    if (i > 0 && j == num_res_blocks) {
+                        std::string name = "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx);
+                        blocks[name] = std::shared_ptr<GGMLBlock>(new UpSampleBlock(ch, ch));
+
+                        ds /= 2;
+                    }
+
+                    output_block_idx += 1;
+                }
+            }
+        }
-
         // out
         blocks["out.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(ch));  // ch == model_channels
         // out_1 is nn.SiLU()
@@ -449,83 +515,140 @@ class UnetModelBlock : public GGMLBlock {
         size_t len_mults = channel_mult.size();
         int input_block_idx = 0;
         int ds = 1;
-        for (int i = 0; i < len_mults; i++) {
-            int mult = channel_mult[i];
-            for (int j = 0; j < num_res_blocks; j++) {
-                input_block_idx += 1;
-                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
-                h = resblock_forward(name, ctx, h, emb, num_video_frames);  // [N, mult*model_channels, h, w]
-                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
-                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
-                    h = attention_layer_forward(name, ctx, h, context, num_video_frames);  // [N, mult*model_channels, h, w]
-                }
-                hs.push_back(h);
-            }
-            if (i != len_mults - 1) {
-                ds *= 2;
-                input_block_idx += 1;
-
-                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
-                auto block = std::dynamic_pointer_cast<DownSampleBlock>(blocks[name]);
-
-                h = block->forward(ctx, h);  // [N, mult*model_channels, h/(2^(i+1)), w/(2^(i+1))]
-                hs.push_back(h);
-            }
-        }
-        // [N, 4*model_channels, h/8, w/8]
-
-        // middle_block
-        h = resblock_forward("middle_block.0", ctx, h, emb, num_video_frames);             // [N, 4*model_channels, h/8, w/8]
-        h = attention_layer_forward("middle_block.1", ctx, h, context, num_video_frames);  // [N, 4*model_channels, h/8, w/8]
-        h = resblock_forward("middle_block.2", ctx, h, emb, num_video_frames);             // [N, 4*model_channels, h/8, w/8]
-
-        if (controls.size() > 0) {
-            auto cs = ggml_scale_inplace(ctx, controls[controls.size() - 1], control_strength);
-            h = ggml_add(ctx, h, cs);  // middle control
-        }
-        int control_offset = controls.size() - 2;
-
-        // output_blocks
-        int output_block_idx = 0;
-        for (int i = (int)len_mults - 1; i >= 0; i--) {
-            for (int j = 0; j < num_res_blocks + 1; j++) {
-                auto h_skip = hs.back();
-                hs.pop_back();
-
-                if (controls.size() > 0) {
-                    auto cs = ggml_scale_inplace(ctx, controls[control_offset], control_strength);
-                    h_skip = ggml_add(ctx, h_skip, cs);  // control net condition
-                    control_offset--;
-                }
-
-                h = ggml_concat(ctx, h, h_skip, 2);
-
-                std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";
-
-                h = resblock_forward(name, ctx, h, emb, num_video_frames);
-
-                int up_sample_idx = 1;
-                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
-                    std::string name = "output_blocks." + std::to_string(output_block_idx) + ".1";
-
-                    h = attention_layer_forward(name, ctx, h, context, num_video_frames);
-
-                    up_sample_idx++;
-                }
-
-                if (i > 0 && j == num_res_blocks) {
-                    std::string name = "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx);
-                    auto block = std::dynamic_pointer_cast<UpSampleBlock>(blocks[name]);
-
-                    h = block->forward(ctx, h);
-
-                    ds /= 2;
-                }
-
-                output_block_idx += 1;
-            }
-        }
+
+        if (version == VERSION_SD1_TINY_UNET) {
+            h = resblock_forward("input_blocks.1.0", ctx, h, emb, num_video_frames);
+            h = attention_layer_forward("input_blocks.1.1", ctx, h, context, num_video_frames);
+            hs.push_back(h);
+            h = (std::dynamic_pointer_cast<DownSampleBlock>(blocks["input_blocks.3.0"]))->forward(ctx, h);
+            hs.push_back(h);
+
+            h = resblock_forward("input_blocks.4.0", ctx, h, emb, num_video_frames);
+            h = attention_layer_forward("input_blocks.4.1", ctx, h, context, num_video_frames);
+            hs.push_back(h);
+            h = (std::dynamic_pointer_cast<DownSampleBlock>(blocks["input_blocks.6.0"]))->forward(ctx, h);
+            hs.push_back(h);
+
+            h = resblock_forward("input_blocks.7.0", ctx, h, emb, num_video_frames);
+            h = attention_layer_forward("input_blocks.7.1", ctx, h, context, num_video_frames);
+            hs.push_back(h);
+
+            // --- *no* mid blocks here ---
+
+            int control_offset = controls.size() - 2;
+            auto h_skip = hs.back();
+            hs.pop_back();
+            if (controls.size() > 0) {
+                auto cs = ggml_scale_inplace(ctx, controls[control_offset], control_strength);
+                h_skip = ggml_add(ctx, h_skip, cs);
+                control_offset--;
+            }
+            h = ggml_concat(ctx, h, h_skip, 2);
+            h = resblock_forward("output_blocks.0.0", ctx, h, emb, num_video_frames);
+            h = attention_layer_forward("up_blocks.0.attentions.0", ctx, h, context, num_video_frames);
+            h_skip = hs.back();
+            hs.pop_back();
+            if (controls.size() > 0) {
+                auto cs = ggml_scale_inplace(ctx, controls[control_offset], control_strength);
+                h_skip = ggml_add(ctx, h_skip, cs);
+                control_offset--;
+            }
+            h = ggml_concat(ctx, h, h_skip, 2);
+            h = resblock_forward("output_blocks.1.0", ctx, h, emb, num_video_frames);
+            h = attention_layer_forward("up_blocks.0.attentions.1", ctx, h, context, num_video_frames);
+            h = (std::dynamic_pointer_cast<UpSampleBlock>(blocks["output_blocks.2.1"]))->forward(ctx, h);
+
+            h_skip = hs.back();
+            hs.pop_back();
+            if (controls.size() > 0) {
+                auto cs = ggml_scale_inplace(ctx, controls[control_offset], control_strength);
+                h_skip = ggml_add(ctx, h_skip, cs);
+                control_offset--;
+            }
+            h = ggml_concat(ctx, h, h_skip, 2);
+            h = resblock_forward("output_blocks.3.0", ctx, h, emb, num_video_frames);
+            h = attention_layer_forward("output_blocks.3.1", ctx, h, context, num_video_frames);
+            h_skip = hs.back();
+            hs.pop_back();
+            if (controls.size() > 0) {
+                auto cs = ggml_scale_inplace(ctx, controls[control_offset], control_strength);
+                h_skip = ggml_add(ctx, h_skip, cs);
+                control_offset--;
+            }
+            h = ggml_concat(ctx, h, h_skip, 2);
+            h = resblock_forward("output_blocks.4.0", ctx, h, emb, num_video_frames);
+            h = attention_layer_forward("output_blocks.4.1", ctx, h, context, num_video_frames);
+            h = (std::dynamic_pointer_cast<UpSampleBlock>(blocks["output_blocks.5.2"]))->forward(ctx, h);
+
+            h_skip = hs.back();
+            hs.pop_back();
+            if (controls.size() > 0) {
+                auto cs = ggml_scale_inplace(ctx, controls[control_offset], control_strength);
+                h_skip = ggml_add(ctx, h_skip, cs);
+                control_offset--;
+            }
+            h = ggml_concat(ctx, h, h_skip, 2);
+            h = resblock_forward("output_blocks.6.0", ctx, h, emb, num_video_frames);
+            h = attention_layer_forward("output_blocks.6.1", ctx, h, context, num_video_frames);
+            h_skip = hs.back();
+            hs.pop_back();
+            if (controls.size() > 0) {
+                auto cs = ggml_scale_inplace(ctx, controls[control_offset], control_strength);
+                h_skip = ggml_add(ctx, h_skip, cs);
+                control_offset--;
+            }
+            h = ggml_concat(ctx, h, h_skip, 2);
+            h = resblock_forward("output_blocks.7.0", ctx, h, emb, num_video_frames);
+            h = attention_layer_forward("output_blocks.7.1", ctx, h, context, num_video_frames);
+        } else {
+            for (int i = 0; i < len_mults; i++) {
+                int mult = channel_mult[i];
+                for (int j = 0; j < num_res_blocks; j++) {
+                    input_block_idx += 1;
+                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
+                    h = resblock_forward(name, ctx, h, emb, num_video_frames);  // [N, mult*model_channels, h, w]
+                    if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
+                        std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
+                        h = attention_layer_forward(name, ctx, h, context, num_video_frames);  // [N, mult*model_channels, h, w]
+                    }
+                    hs.push_back(h);
+                }
+                if (i != len_mults - 1) {
+                    ds *= 2;
+                    input_block_idx += 1;
+
+                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
+                    auto block = std::dynamic_pointer_cast<DownSampleBlock>(blocks[name]);
+
+                    h = block->forward(ctx, h);  // [N, mult*model_channels, h/(2^(i+1)), w/(2^(i+1))]
+                    hs.push_back(h);
+                }
+            }
+            // [N, 4*model_channels, h/8, w/8]
+
+            // middle_block
+            h = resblock_forward("middle_block.0", ctx, h, emb, num_video_frames);             // [N, 4*model_channels, h/8, w/8]
+            h = attention_layer_forward("middle_block.1", ctx, h, context, num_video_frames);  // [N, 4*model_channels, h/8, w/8]
+            h = resblock_forward("middle_block.2", ctx, h, emb, num_video_frames);             // [N, 4*model_channels, h/8, w/8]
+
+            if (controls.size() > 0) {
+                auto cs = ggml_scale_inplace(ctx, controls[controls.size() - 1], control_strength);
+                h = ggml_add(ctx, h, cs);  // middle control
+            }
+            int control_offset = controls.size() - 2;
+
+            // output_blocks
+            int output_block_idx = 0;
+            for (int i = (int)len_mults - 1; i >= 0; i--) {
+                for (int j = 0; j < num_res_blocks + 1; j++) {
+                    auto h_skip = hs.back();
+                    hs.pop_back();
+
+                    if (controls.size() > 0) {
+                        auto cs = ggml_scale_inplace(ctx, controls[control_offset], control_strength);
+                        h_skip = ggml_add(ctx, h_skip, cs);  // control net condition
+                        control_offset--;
+                    }
+
+                    h = ggml_concat(ctx, h, h_skip, 2);
+
+                    std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";
+
+                    h = resblock_forward(name, ctx, h, emb, num_video_frames);
+
+                    int up_sample_idx = 1;
+                    if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
+                        std::string name = "output_blocks." + std::to_string(output_block_idx) + ".1";
+
+                        h = attention_layer_forward(name, ctx, h, context, num_video_frames);
+
+                        up_sample_idx++;
+                    }
+
+                    if (i > 0 && j == num_res_blocks) {
+                        std::string name = "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx);
+                        auto block = std::dynamic_pointer_cast<UpSampleBlock>(blocks[name]);
+
+                        h = block->forward(ctx, h);
+
+                        ds /= 2;
+                    }
+
+                    output_block_idx += 1;
+                }
+            }
+        }
-
         // out
         h = out_0->forward(ctx, h);
         h = ggml_silu_inplace(ctx, h);
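The detection added to `model.cpp` above is a heuristic: it looks for a tensor name that only occurs in the tiny, diffusers-style U-Net layout, and additionally requires the checkpoint to contain between 801 and 804 tensors. If you want to check in advance whether a given file will be picked up as `VERSION_SD1_TINY_UNET`, here is a small sketch of the same heuristic in Python (the checkpoint file name is just an example):

```python
import torch

# Marker tensor and tensor-count window used by the detection in model.cpp.
MARKER = "model.diffusion_model.up_blocks.0.attentions.1.transformer_blocks.0.norm2.bias"

ckpt = torch.load("segmind_tiny-sd.ckpt", map_location="cpu")
sd = ckpt.get("state_dict", ckpt)  # .ckpt files usually wrap the weights in 'state_dict'

print("tensor count:", len(sd))
print("marker found:", MARKER in sd)
print("detected as tiny U-Net:", MARKER in sd and 800 < len(sd) < 805)
```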
From dd73373dfa84ec2679c25db03733b95b2e3b909a Mon Sep 17 00:00:00 2001
From: akleine
Date: Wed, 30 Jul 2025 07:15:36 +0200
Subject: [PATCH 2/2] docs: add support for SD1.x with a tiny U-Net

---
 docs/tiny_U-Nets_in_SD1x.md | 67 +++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 docs/tiny_U-Nets_in_SD1x.md

diff --git a/docs/tiny_U-Nets_in_SD1x.md b/docs/tiny_U-Nets_in_SD1x.md
new file mode 100644
index 00000000..fb3c7d37
--- /dev/null
+++ b/docs/tiny_U-Nets_in_SD1x.md
@@ -0,0 +1,67 @@
# Running SD1.x models with tiny U-Nets

### Preface

Tiny SD 1.x models have a very small U-Net. Unlike other SD 1.x models, their U-Net consists of only 6 blocks (3 down, 3 up) and has no middle block, resulting in relatively small checkpoint files of roughly 1 GB. Running these models cuts generation time by almost 50%. For more details, refer to the BK-SDM paper: https://arxiv.org/pdf/2305.15798.pdf

There are only a few Tiny SD 1.x models available online, such as:

 * https://huggingface.co/segmind/tiny-sd
 * https://huggingface.co/segmind/portrait-finetuned
 * https://huggingface.co/nota-ai/bk-sdm-tiny

To create a checkpoint file, follow these steps:

### Download model from Hugging Face

Download the model with Python on your computer, for example like this:

```python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("segmind/tiny-sd")
unet = pipe.unet
for param in unet.parameters():
    param.data = param.data.contiguous()  # <- important: the conversion below needs contiguous tensors
pipe.save_pretrained("segmindtiny-sd", safe_serialization=True)
```
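You can optionally sanity-check that what you downloaded really is a tiny U-Net. A quick sketch, assuming the usual diffusers `UNet2DConditionModel` attributes; the expected values follow from the block structure wired up in `unet.hpp`:

```python
# Full SD 1.x U-Nets have 4 down blocks and a middle block;
# bk-sdm-tiny style U-Nets have 3 down blocks and no middle block.
print(len(pipe.unet.down_blocks))  # expected: 3
print(len(pipe.unet.up_blocks))    # expected: 3
print(pipe.unet.mid_block)         # expected: None
```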
### Convert that to a ckpt file

To convert the downloaded model to a checkpoint file, you need another Python script. Download the conversion script from here:

 * https://raw.githubusercontent.com/huggingface/diffusers/refs/heads/main/scripts/convert_diffusers_to_original_stable_diffusion.py


### Run the conversion script

Now run the conversion script:

```bash
python convert_diffusers_to_original_stable_diffusion.py \
    --model_path ./segmindtiny-sd \
    --checkpoint_path ./segmind_tiny-sd.ckpt --half
```

The file **segmind_tiny-sd.ckpt** will be generated and is ready to use with sd.cpp (see the example invocation at the end of this page).

You can follow the same process for the other Hugging Face models mentioned above.


### Another ckpt file on the net

There is another model file available online:

 * https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt

If you want to use that one, you first have to fix its **non-contiguous tensors**:

```python
import torch

ckpt = torch.load("tinySDdistilled.ckpt", map_location=torch.device('cpu'))
for key, value in ckpt['state_dict'].items():
    if isinstance(value, torch.Tensor):
        ckpt['state_dict'][key] = value.contiguous()
torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
```
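### Run the model with sd.cpp

Finally, a minimal example of running a converted checkpoint with the sd.cpp CLI; the binary location depends on how you built the project, and the prompt and options are just placeholders:

```bash
./build/bin/sd -m ./segmind_tiny-sd.ckpt \
    -p "portrait photo of a young woman, soft light" \
    -o ./output.png --steps 20
```

Apart from the smaller file and the shorter run time, usage is identical to any other SD 1.x model.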