From 2d2c4f4efbc06a67d27286f09e5e1e302171efc5 Mon Sep 17 00:00:00 2001
From: akleine
Date: Mon, 28 Jul 2025 07:18:22 +0200
Subject: [PATCH 1/2] feat: add code for running SD1.x models with a TINY
 U-Net, just like bk-sdm-tiny

---
 model.cpp            |   8 +
 model.h              |   3 +-
 stable-diffusion.cpp |   1 +
 unet.hpp             | 391 ++++++++++++++++++++++++++++---------------
 4 files changed, 268 insertions(+), 135 deletions(-)

diff --git a/model.cpp b/model.cpp
index 9529cc58..3a8a7a19 100644
--- a/model.cpp
+++ b/model.cpp
@@ -1644,8 +1644,13 @@ SDVersion ModelLoader::get_sd_version() {
     bool is_xl = false;
     bool is_flux = false;
 
+    bool maybe_unet_is_tiny = false;
+
 #define found_family (is_xl || is_flux)
     for (auto& tensor_storage : tensor_storages) {
+        if (tensor_storage.name == "model.diffusion_model.up_blocks.0.attentions.1.transformer_blocks.0.norm2.bias") {
+            maybe_unet_is_tiny = true;
+        }
         if (!found_family) {
             if (tensor_storage.name.find("model.diffusion_model.double_blocks.") != std::string::npos) {
                 is_flux = true;
@@ -1722,6 +1727,9 @@ SDVersion ModelLoader::get_sd_version() {
         if (is_ip2p) {
             return VERSION_SD1_PIX2PIX;
         }
+        if (maybe_unet_is_tiny && tensor_storages.size() > 800 && tensor_storages.size() < 805) {
+            return VERSION_SD1_TINY_UNET;
+        }
         return VERSION_SD1;
     } else if (token_embedding_weight.ne[0] == 1024) {
         if (is_inpaint) {
diff --git a/model.h b/model.h
index ea716107..cd6e006b 100644
--- a/model.h
+++ b/model.h
@@ -22,6 +22,7 @@ enum SDVersion {
     VERSION_SD1,
     VERSION_SD1_INPAINT,
     VERSION_SD1_PIX2PIX,
+    VERSION_SD1_TINY_UNET,
     VERSION_SD2,
     VERSION_SD2_INPAINT,
     VERSION_SDXL,
@@ -49,7 +50,7 @@ static inline bool sd_version_is_sd3(SDVersion version) {
 }
 
 static inline bool sd_version_is_sd1(SDVersion version) {
-    if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX) {
+    if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET) {
         return true;
     }
     return false;
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index 402585f1..54ca3c48 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -28,6 +28,7 @@ const char* model_version_to_str[] = {
     "SD 1.x",
     "SD 1.x Inpaint",
     "Instruct-Pix2Pix",
+    "SD 1.x tiny UNet",
     "SD 2.x",
     "SD 2.x Inpaint",
     "SDXL",
diff --git a/unet.hpp b/unet.hpp
index 9193dcd6..10e9513c 100644
--- a/unet.hpp
+++ b/unet.hpp
@@ -253,93 +253,159 @@ class UnetModelBlock : public GGMLBlock {
             }
         };
 
-        size_t len_mults = channel_mult.size();
-        for (int i = 0; i < len_mults; i++) {
-            int mult = channel_mult[i];
-            for (int j = 0; j < num_res_blocks; j++) {
-                input_block_idx += 1;
-                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
-                blocks[name] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, mult * model_channels));
-
-                ch = mult * model_channels;
-                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
-                    int n_head = num_heads;
-                    int d_head = ch / num_heads;
-                    if (num_head_channels != -1) {
-                        d_head = num_head_channels;
-                        n_head = ch / d_head;
-                    }
-                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
-                    blocks[name] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
-                                                                                  n_head,
-                                                                                  d_head,
-                                                                                  transformer_depth[i],
-                                                                                  context_dim));
-                }
-                input_block_chans.push_back(ch);
-            }
-            if (i != len_mults - 1) {
-                input_block_idx += 1;
-                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
-                blocks[name] = std::shared_ptr<GGMLBlock>(new DownSampleBlock(ch, ch));
-
-                input_block_chans.push_back(ch);
-                ds *= 2;
-            }
-        }
-
-        // middle blocks
-        int n_head = num_heads;
-        int d_head = ch / num_heads;
-        if (num_head_channels != -1) {
-            d_head = num_head_channels;
-            n_head = ch / d_head;
-        }
-        blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
-        blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
-                                                                                  n_head,
-                                                                                  d_head,
-                                                                                  transformer_depth[transformer_depth.size() - 1],
-                                                                                  context_dim));
-        blocks["middle_block.2"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
-
-        // output_blocks
-        int output_block_idx = 0;
-        for (int i = (int)len_mults - 1; i >= 0; i--) {
-            int mult = channel_mult[i];
-            for (int j = 0; j < num_res_blocks + 1; j++) {
-                int ich = input_block_chans.back();
-                input_block_chans.pop_back();
-
-                std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";
-                blocks[name] = std::shared_ptr<GGMLBlock>(get_resblock(ch + ich, time_embed_dim, mult * model_channels));
-
-                ch = mult * model_channels;
-                int up_sample_idx = 1;
-                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
-                    int n_head = num_heads;
-                    int d_head = ch / num_heads;
-                    if (num_head_channels != -1) {
-                        d_head = num_head_channels;
-                        n_head = ch / d_head;
-                    }
-                    std::string name = "output_blocks." + std::to_string(output_block_idx) + ".1";
-                    blocks[name] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, n_head, d_head, transformer_depth[i], context_dim));
-
-                    up_sample_idx++;
-                }
-
-                if (i > 0 && j == num_res_blocks) {
-                    std::string name = "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx);
-                    blocks[name] = std::shared_ptr<GGMLBlock>(new UpSampleBlock(ch, ch));
-
-                    ds /= 2;
-                }
-
-                output_block_idx += 1;
-            }
-        }
+        if (version == VERSION_SD1_TINY_UNET) {
+            int mult = 1;
+            blocks["input_blocks.1.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, mult * model_channels));
+            ch = mult * model_channels;
+            int d_head = ch / num_heads;
+            blocks["input_blocks.1.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, num_heads, d_head, 1, context_dim));
+            input_block_chans.push_back(ch);
+            blocks["input_blocks.3.0"] = std::shared_ptr<GGMLBlock>(new DownSampleBlock(ch, ch));
+            input_block_chans.push_back(ch);
+
+            mult = 2;
+            blocks["input_blocks.4.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, mult * model_channels));
+            ch = mult * model_channels;
+            d_head = ch / num_heads;
+            blocks["input_blocks.4.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, num_heads, d_head, 1, context_dim));
+            input_block_chans.push_back(ch);
+            blocks["input_blocks.6.0"] = std::shared_ptr<GGMLBlock>(new DownSampleBlock(ch, ch));
+            input_block_chans.push_back(ch);
+
+            mult = 4;
+            blocks["input_blocks.7.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, mult * model_channels));
+            ch = mult * model_channels;
+            d_head = ch / num_heads;
+            blocks["input_blocks.7.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, num_heads, d_head, 1, context_dim));
+            input_block_chans.push_back(ch);
+
+            // --- *no* mid blocks here ---
+
+            mult = 4;
+            int ich = input_block_chans.back();
+            input_block_chans.pop_back();
+            blocks["output_blocks.0.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch + ich, time_embed_dim, mult * model_channels));
+            ch = mult * model_channels;
+            d_head = ch / num_heads;
+            blocks["up_blocks.0.attentions.0"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, num_heads, d_head, 1, context_dim));
+            ich = input_block_chans.back();
+            input_block_chans.pop_back();
+            blocks["output_blocks.1.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch + ich, time_embed_dim, mult * model_channels));
+            blocks["up_blocks.0.attentions.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, num_heads, d_head, 1, context_dim));
+            blocks["output_blocks.2.1"] = std::shared_ptr<GGMLBlock>(new UpSampleBlock(ch, ch));
+
+            mult = 2;
+            ich = input_block_chans.back();
+            input_block_chans.pop_back();
+            blocks["output_blocks.3.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch + ich, time_embed_dim, mult * model_channels));
+            ch = mult * model_channels;
+            d_head = ch / num_heads;
+            blocks["output_blocks.3.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, num_heads, d_head, 1, context_dim));
+            ich = input_block_chans.back();
+            input_block_chans.pop_back();
+            blocks["output_blocks.4.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch + ich, time_embed_dim, mult * model_channels));
+            blocks["output_blocks.4.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, num_heads, d_head, 1, context_dim));
+            blocks["output_blocks.5.2"] = std::shared_ptr<GGMLBlock>(new UpSampleBlock(ch, ch));
+
+            mult = 1;
+            ich = input_block_chans.back();
+            input_block_chans.pop_back();
+            blocks["output_blocks.6.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch + ich, time_embed_dim, mult * model_channels));
+            ch = mult * model_channels;
+            d_head = ch / num_heads;
+            blocks["output_blocks.6.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, num_heads, d_head, 1, context_dim));
+            ich = input_block_chans.back();
+            input_block_chans.pop_back();
+            blocks["output_blocks.7.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch + ich, time_embed_dim, mult * model_channels));
+            blocks["output_blocks.7.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, num_heads, d_head, 1, context_dim));
+        } else {
+            size_t len_mults = channel_mult.size();
+            for (int i = 0; i < len_mults; i++) {
+                int mult = channel_mult[i];
+                for (int j = 0; j < num_res_blocks; j++) {
+                    input_block_idx += 1;
+                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
+                    blocks[name] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, mult * model_channels));
+
+                    ch = mult * model_channels;
+                    if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
+                        int n_head = num_heads;
+                        int d_head = ch / num_heads;
+                        if (num_head_channels != -1) {
+                            d_head = num_head_channels;
+                            n_head = ch / d_head;
+                        }
+                        std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
+                        blocks[name] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
+                                                                                      n_head,
+                                                                                      d_head,
+                                                                                      transformer_depth[i],
+                                                                                      context_dim));
+                    }
+                    input_block_chans.push_back(ch);
+                }
+                if (i != len_mults - 1) {
+                    input_block_idx += 1;
+                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
+                    blocks[name] = std::shared_ptr<GGMLBlock>(new DownSampleBlock(ch, ch));
+
+                    input_block_chans.push_back(ch);
+                    ds *= 2;
+                }
+            }
+
+            // middle blocks
+            int n_head = num_heads;
+            int d_head = ch / num_heads;
+            if (num_head_channels != -1) {
+                d_head = num_head_channels;
+                n_head = ch / d_head;
+            }
+            blocks["middle_block.0"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
+            blocks["middle_block.1"] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch,
+                                                                                      n_head,
+                                                                                      d_head,
+                                                                                      transformer_depth[transformer_depth.size() - 1],
+                                                                                      context_dim));
+            blocks["middle_block.2"] = std::shared_ptr<GGMLBlock>(get_resblock(ch, time_embed_dim, ch));
+
+            // output_blocks
+            int output_block_idx = 0;
+            for (int i = (int)len_mults - 1; i >= 0; i--) {
+                int mult = channel_mult[i];
+                for (int j = 0; j < num_res_blocks + 1; j++) {
+                    int ich = input_block_chans.back();
+                    input_block_chans.pop_back();
+
+                    std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";
+                    blocks[name] = std::shared_ptr<GGMLBlock>(get_resblock(ch + ich, time_embed_dim, mult * model_channels));
+
+                    ch = mult * model_channels;
+                    int up_sample_idx = 1;
+                    if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
+                        int n_head = num_heads;
+                        int d_head = ch / num_heads;
+                        if (num_head_channels != -1) {
+                            d_head = num_head_channels;
+                            n_head = ch / d_head;
+                        }
+                        std::string name = "output_blocks." + std::to_string(output_block_idx) + ".1";
+                        blocks[name] = std::shared_ptr<GGMLBlock>(get_attention_layer(ch, n_head, d_head, transformer_depth[i], context_dim));
+
+                        up_sample_idx++;
+                    }
+
+                    if (i > 0 && j == num_res_blocks) {
+                        std::string name = "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx);
+                        blocks[name] = std::shared_ptr<GGMLBlock>(new UpSampleBlock(ch, ch));
+
+                        ds /= 2;
+                    }
+
+                    output_block_idx += 1;
+                }
+            }
+        }
-
         // out
         blocks["out.0"] = std::shared_ptr<GGMLBlock>(new GroupNorm32(ch));  // ch == model_channels
         // out_1 is nn.SiLU()
@@ -449,83 +515,140 @@ class UnetModelBlock : public GGMLBlock {
         size_t len_mults = channel_mult.size();
         int input_block_idx = 0;
         int ds = 1;
-        for (int i = 0; i < len_mults; i++) {
-            int mult = channel_mult[i];
-            for (int j = 0; j < num_res_blocks; j++) {
-                input_block_idx += 1;
-                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
-                h = resblock_forward(name, ctx, h, emb, num_video_frames);  // [N, mult*model_channels, h, w]
-                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
-                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
-                    h = attention_layer_forward(name, ctx, h, context, num_video_frames);  // [N, mult*model_channels, h, w]
-                }
-                hs.push_back(h);
-            }
-            if (i != len_mults - 1) {
-                ds *= 2;
-                input_block_idx += 1;
-
-                std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
-                auto block = std::dynamic_pointer_cast<DownSampleBlock>(blocks[name]);
-
-                h = block->forward(ctx, h);  // [N, mult*model_channels, h/(2^(i+1)), w/(2^(i+1))]
-                hs.push_back(h);
-            }
-        }
-        // [N, 4*model_channels, h/8, w/8]
-
-        // middle_block
-        h = resblock_forward("middle_block.0", ctx, h, emb, num_video_frames);             // [N, 4*model_channels, h/8, w/8]
-        h = attention_layer_forward("middle_block.1", ctx, h, context, num_video_frames);  // [N, 4*model_channels, h/8, w/8]
-        h = resblock_forward("middle_block.2", ctx, h, emb, num_video_frames);             // [N, 4*model_channels, h/8, w/8]
-
-        if (controls.size() > 0) {
-            auto cs = ggml_scale_inplace(ctx, controls[controls.size() - 1], control_strength);
-            h = ggml_add(ctx, h, cs);  // middle control
-        }
-        int control_offset = controls.size() - 2;
-
-        // output_blocks
-        int output_block_idx = 0;
-        for (int i = (int)len_mults - 1; i >= 0; i--) {
-            for (int j = 0; j < num_res_blocks + 1; j++) {
-                auto h_skip = hs.back();
-                hs.pop_back();
-
-                if (controls.size() > 0) {
-                    auto cs = ggml_scale_inplace(ctx, controls[control_offset], control_strength);
-                    h_skip = ggml_add(ctx, h_skip, cs);  // control net condition
-                    control_offset--;
-                }
-
-                h = ggml_concat(ctx, h, h_skip, 2);
-
-                std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";
-
-                h = resblock_forward(name, ctx, h, emb, num_video_frames);
-
-                int up_sample_idx = 1;
-                if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
-                    std::string name = "output_blocks." + std::to_string(output_block_idx) + ".1";
-
-                    h = attention_layer_forward(name, ctx, h, context, num_video_frames);
-
-                    up_sample_idx++;
-                }
-
-                if (i > 0 && j == num_res_blocks) {
-                    std::string name = "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx);
-                    auto block = std::dynamic_pointer_cast<UpSampleBlock>(blocks[name]);
-
-                    h = block->forward(ctx, h);
-
-                    ds /= 2;
-                }
-
-                output_block_idx += 1;
-            }
-        }
+
+        if (version == VERSION_SD1_TINY_UNET) {
+            h = resblock_forward("input_blocks.1.0", ctx, h, emb, num_video_frames);
+            h = attention_layer_forward("input_blocks.1.1", ctx, h, context, num_video_frames);
+            hs.push_back(h);
+            h = (std::dynamic_pointer_cast<DownSampleBlock>(blocks["input_blocks.3.0"]))->forward(ctx, h);
+            hs.push_back(h);
+
+            h = resblock_forward("input_blocks.4.0", ctx, h, emb, num_video_frames);
+            h = attention_layer_forward("input_blocks.4.1", ctx, h, context, num_video_frames);
+            hs.push_back(h);
+            h = (std::dynamic_pointer_cast<DownSampleBlock>(blocks["input_blocks.6.0"]))->forward(ctx, h);
+            hs.push_back(h);
+
+            h = resblock_forward("input_blocks.7.0", ctx, h, emb, num_video_frames);
+            h = attention_layer_forward("input_blocks.7.1", ctx, h, context, num_video_frames);
+            hs.push_back(h);
+
+            // --- *no* mid blocks here ---
+
+            int control_offset = controls.size() - 2;
+            auto h_skip = hs.back();
+            hs.pop_back();
+            if (controls.size() > 0) {
+                auto cs = ggml_scale_inplace(ctx, controls[control_offset], control_strength);
+                h_skip = ggml_add(ctx, h_skip, cs);
+                control_offset--;
+            }
+            h = ggml_concat(ctx, h, h_skip, 2);
+            h = resblock_forward("output_blocks.0.0", ctx, h, emb, num_video_frames);
+            h = attention_layer_forward("up_blocks.0.attentions.0", ctx, h, context, num_video_frames);
+            h_skip = hs.back();
+            hs.pop_back();
+            if (controls.size() > 0) {
+                auto cs = ggml_scale_inplace(ctx, controls[control_offset], control_strength);
+                h_skip = ggml_add(ctx, h_skip, cs);
+                control_offset--;
+            }
+            h = ggml_concat(ctx, h, h_skip, 2);
+            h = resblock_forward("output_blocks.1.0", ctx, h, emb, num_video_frames);
+            h = attention_layer_forward("up_blocks.0.attentions.1", ctx, h, context, num_video_frames);
+            h = (std::dynamic_pointer_cast<UpSampleBlock>(blocks["output_blocks.2.1"]))->forward(ctx, h);
+
+            h_skip = hs.back();
+            hs.pop_back();
+            if (controls.size() > 0) {
+                auto cs = ggml_scale_inplace(ctx, controls[control_offset], control_strength);
+                h_skip = ggml_add(ctx, h_skip, cs);
+                control_offset--;
+            }
+            h = ggml_concat(ctx, h, h_skip, 2);
+            h = resblock_forward("output_blocks.3.0", ctx, h, emb, num_video_frames);
+            h = attention_layer_forward("output_blocks.3.1", ctx, h, context, num_video_frames);
+            h_skip = hs.back();
+            hs.pop_back();
+            if (controls.size() > 0) {
+                auto cs = ggml_scale_inplace(ctx, controls[control_offset], control_strength);
+                h_skip = ggml_add(ctx, h_skip, cs);
+                control_offset--;
+            }
+            h = ggml_concat(ctx, h, h_skip, 2);
+            h = resblock_forward("output_blocks.4.0", ctx, h, emb, num_video_frames);
+            h = attention_layer_forward("output_blocks.4.1", ctx, h, context, num_video_frames);
+            h = (std::dynamic_pointer_cast<UpSampleBlock>(blocks["output_blocks.5.2"]))->forward(ctx, h);
+
+            h_skip = hs.back();
+            hs.pop_back();
+            if (controls.size() > 0) {
+                auto cs = ggml_scale_inplace(ctx, controls[control_offset], control_strength);
+                h_skip = ggml_add(ctx, h_skip, cs);
+                control_offset--;
+            }
+            h = ggml_concat(ctx, h, h_skip, 2);
+            h = resblock_forward("output_blocks.6.0", ctx, h, emb, num_video_frames);
+            h = attention_layer_forward("output_blocks.6.1", ctx, h, context, num_video_frames);
+            h_skip = hs.back();
+            hs.pop_back();
+            if (controls.size() > 0) {
+                auto cs = ggml_scale_inplace(ctx, controls[control_offset], control_strength);
+                h_skip = ggml_add(ctx, h_skip, cs);
+                control_offset--;
+            }
+            h = ggml_concat(ctx, h, h_skip, 2);
+            h = resblock_forward("output_blocks.7.0", ctx, h, emb, num_video_frames);
+            h = attention_layer_forward("output_blocks.7.1", ctx, h, context, num_video_frames);
+        } else {
+            for (int i = 0; i < len_mults; i++) {
+                int mult = channel_mult[i];
+                for (int j = 0; j < num_res_blocks; j++) {
+                    input_block_idx += 1;
+                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
+                    h = resblock_forward(name, ctx, h, emb, num_video_frames);  // [N, mult*model_channels, h, w]
+                    if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
+                        std::string name = "input_blocks." + std::to_string(input_block_idx) + ".1";
+                        h = attention_layer_forward(name, ctx, h, context, num_video_frames);  // [N, mult*model_channels, h, w]
+                    }
+                    hs.push_back(h);
+                }
+                if (i != len_mults - 1) {
+                    ds *= 2;
+                    input_block_idx += 1;
+
+                    std::string name = "input_blocks." + std::to_string(input_block_idx) + ".0";
+                    auto block = std::dynamic_pointer_cast<DownSampleBlock>(blocks[name]);
+
+                    h = block->forward(ctx, h);  // [N, mult*model_channels, h/(2^(i+1)), w/(2^(i+1))]
+                    hs.push_back(h);
+                }
+            }
+            // [N, 4*model_channels, h/8, w/8]
+
+            // middle_block
+            h = resblock_forward("middle_block.0", ctx, h, emb, num_video_frames);             // [N, 4*model_channels, h/8, w/8]
+            h = attention_layer_forward("middle_block.1", ctx, h, context, num_video_frames);  // [N, 4*model_channels, h/8, w/8]
+            h = resblock_forward("middle_block.2", ctx, h, emb, num_video_frames);             // [N, 4*model_channels, h/8, w/8]
+
+            if (controls.size() > 0) {
+                auto cs = ggml_scale_inplace(ctx, controls[controls.size() - 1], control_strength);
+                h = ggml_add(ctx, h, cs);  // middle control
+            }
+            int control_offset = controls.size() - 2;
+
+            // output_blocks
+            int output_block_idx = 0;
+            for (int i = (int)len_mults - 1; i >= 0; i--) {
+                for (int j = 0; j < num_res_blocks + 1; j++) {
+                    auto h_skip = hs.back();
+                    hs.pop_back();
+
+                    if (controls.size() > 0) {
+                        auto cs = ggml_scale_inplace(ctx, controls[control_offset], control_strength);
+                        h_skip = ggml_add(ctx, h_skip, cs);  // control net condition
+                        control_offset--;
+                    }
+
+                    h = ggml_concat(ctx, h, h_skip, 2);
+
+                    std::string name = "output_blocks." + std::to_string(output_block_idx) + ".0";
+
+                    h = resblock_forward(name, ctx, h, emb, num_video_frames);
+
+                    int up_sample_idx = 1;
+                    if (std::find(attention_resolutions.begin(), attention_resolutions.end(), ds) != attention_resolutions.end()) {
+                        std::string name = "output_blocks." + std::to_string(output_block_idx) + ".1";
+
+                        h = attention_layer_forward(name, ctx, h, context, num_video_frames);
+
+                        up_sample_idx++;
+                    }
+
+                    if (i > 0 && j == num_res_blocks) {
+                        std::string name = "output_blocks." + std::to_string(output_block_idx) + "." + std::to_string(up_sample_idx);
+                        auto block = std::dynamic_pointer_cast<UpSampleBlock>(blocks[name]);
+
+                        h = block->forward(ctx, h);
+
+                        ds /= 2;
+                    }
+
+                    output_block_idx += 1;
+                }
+            }
+        }
-
         // out
         h = out_0->forward(ctx, h);
         h = ggml_silu_inplace(ctx, h);
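The detection added to `model.cpp` above is a heuristic: it looks for a tensor name that only occurs in the tiny, diffusers-style U-Net layout, and additionally requires the checkpoint to contain between 801 and 804 tensors. If you want to check in advance whether a given file will be picked up as `VERSION_SD1_TINY_UNET`, here is a small sketch of the same heuristic in Python (the checkpoint file name is just an example):

```python
import torch

# Marker tensor and tensor-count window used by the detection in model.cpp.
MARKER = "model.diffusion_model.up_blocks.0.attentions.1.transformer_blocks.0.norm2.bias"

ckpt = torch.load("segmind_tiny-sd.ckpt", map_location="cpu")
sd = ckpt.get("state_dict", ckpt)  # .ckpt files usually wrap the weights in 'state_dict'

print("tensor count:", len(sd))
print("marker found:", MARKER in sd)
print("detected as tiny U-Net:", MARKER in sd and 800 < len(sd) < 805)
```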
From dd73373dfa84ec2679c25db03733b95b2e3b909a Mon Sep 17 00:00:00 2001
From: akleine
Date: Wed, 30 Jul 2025 07:15:36 +0200
Subject: [PATCH 2/2] docs: add support for SD1.x with a tiny U-Net

---
 docs/tiny_U-Nets_in_SD1x.md | 67 +++++++++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 docs/tiny_U-Nets_in_SD1x.md

diff --git a/docs/tiny_U-Nets_in_SD1x.md b/docs/tiny_U-Nets_in_SD1x.md
new file mode 100644
index 00000000..fb3c7d37
--- /dev/null
+++ b/docs/tiny_U-Nets_in_SD1x.md
@@ -0,0 +1,67 @@
# Running SD1.x models with tiny U-Nets

### Preface

Tiny SD 1.x models have a very small U-Net. Unlike other SD 1.x models, their U-Net consists of only 6 blocks (3 down, 3 up) and has no middle block, resulting in relatively small checkpoint files of roughly 1 GB. Running these models cuts generation time by almost 50%. For more details, refer to the BK-SDM paper: https://arxiv.org/pdf/2305.15798.pdf

There are only a few Tiny SD 1.x models available online, such as:

 * https://huggingface.co/segmind/tiny-sd
 * https://huggingface.co/segmind/portrait-finetuned
 * https://huggingface.co/nota-ai/bk-sdm-tiny

To create a checkpoint file, follow these steps:

### Download model from Hugging Face

Download the model with Python on your computer, for example like this:

```python
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("segmind/tiny-sd")
unet = pipe.unet
for param in unet.parameters():
    param.data = param.data.contiguous()  # <- important: the conversion below needs contiguous tensors
pipe.save_pretrained("segmindtiny-sd", safe_serialization=True)
```
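You can optionally sanity-check that what you downloaded really is a tiny U-Net. A quick sketch, assuming the usual diffusers `UNet2DConditionModel` attributes; the expected values follow from the block structure wired up in `unet.hpp`:

```python
# Full SD 1.x U-Nets have 4 down blocks and a middle block;
# bk-sdm-tiny style U-Nets have 3 down blocks and no middle block.
print(len(pipe.unet.down_blocks))  # expected: 3
print(len(pipe.unet.up_blocks))    # expected: 3
print(pipe.unet.mid_block)         # expected: None
```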
### Convert that to a ckpt file

To convert the downloaded model to a checkpoint file, you need another Python script. Download the conversion script from here:

 * https://raw.githubusercontent.com/huggingface/diffusers/refs/heads/main/scripts/convert_diffusers_to_original_stable_diffusion.py


### Run the conversion script

Now run the conversion script:

```bash
python convert_diffusers_to_original_stable_diffusion.py \
    --model_path ./segmindtiny-sd \
    --checkpoint_path ./segmind_tiny-sd.ckpt --half
```

The file **segmind_tiny-sd.ckpt** will be generated and is ready to use with sd.cpp (see the example invocation at the end of this page).

You can follow the same process for the other Hugging Face models mentioned above.


### Another ckpt file on the net

There is another model file available online:

 * https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt

If you want to use that one, you first have to fix its **non-contiguous tensors**:

```python
import torch

ckpt = torch.load("tinySDdistilled.ckpt", map_location=torch.device('cpu'))
for key, value in ckpt['state_dict'].items():
    if isinstance(value, torch.Tensor):
        ckpt['state_dict'][key] = value.contiguous()
torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
```
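### Run the model with sd.cpp

Finally, a minimal example of running a converted checkpoint with the sd.cpp CLI; the binary location depends on how you built the project, and the prompt and options are just placeholders:

```bash
./build/bin/sd -m ./segmind_tiny-sd.ckpt \
    -p "portrait photo of a young woman, soft light" \
    -o ./output.png --steps 20
```

Apart from the smaller file and the shorter run time, usage is identical to any other SD 1.x model.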