@@ -386,6 +386,10 @@ class StableDiffusionGGML {
386
386
diffusion_model->alloc_params_buffer ();
387
387
diffusion_model->get_param_tensors (tensors);
388
388
389
+ if (sd_version_is_unet_edit (version)) {
390
+ vae_decode_only = false ;
391
+ }
392
+
389
393
if (!use_tiny_autoencoder) {
390
394
if (sd_ctx_params->keep_vae_on_cpu && !ggml_backend_is_cpu (backend)) {
391
395
LOG_INFO (" VAE Autoencoder: Using CPU backend" );
@@ -2037,23 +2041,36 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
2037
2041
init_latent = generate_init_latent (sd_ctx, work_ctx, width, height);
2038
2042
}
2039
2043
2044
+ sd_guidance_params_t guidance = sd_img_gen_params->guidance ;
2045
+ std::vector<sd_image_t *> ref_images;
2046
+ for (int i = 0 ; i < sd_img_gen_params->ref_images_count ; i++) {
2047
+ ref_images.push_back (&sd_img_gen_params->ref_images [i]);
2048
+ }
2049
+
2050
+ std::vector<uint8_t > empty_image_data;
2051
+ sd_image_t empty_image = {(uint32_t )width, (uint32_t )height, 3 , nullptr };
2052
+ if (ref_images.empty () && sd_version_is_unet_edit (sd_ctx->sd ->version ))
2053
+ {
2054
+ LOG_WARN (" This model needs at least one reference image; using empty reference" );
2055
+ empty_image_data.resize(static_cast<size_t>(width) * height * 3);  // resize (not reserve): the buffer must actually hold width*height*3 zeroed bytes, since its data() pointer is read as the placeholder reference image
2056
+ ref_images.push_back (&empty_image);
2057
+ empty_image.data = empty_image_data.data ();
2058
+ guidance.img_cfg = 0 .f ;
2059
+ }
2060
+
2040
2061
if (sd_img_gen_params->ref_images_count > 0 ) {
2041
2062
LOG_INFO (" EDIT mode" );
2042
2063
}
2043
- else if (sd_ctx->sd ->version == VERSION_SD1_PIX2PIX || sd_ctx->sd ->version == VERSION_SDXL_PIX2PIX) {
2044
- LOG_ERROR (" This model needs at least one reference image" );
2045
- return NULL ;
2046
- }
2047
2064
2048
2065
std::vector<struct ggml_tensor *> ref_latents;
2049
- for (int i = 0 ; i < sd_img_gen_params-> ref_images_count ; i++) {
2066
+ for (int i = 0 ; i < ref_images. size () ; i++) {
2050
2067
ggml_tensor* img = ggml_new_tensor_4d (work_ctx,
2051
2068
GGML_TYPE_F32,
2052
- sd_img_gen_params-> ref_images [i]. width ,
2053
- sd_img_gen_params-> ref_images [i]. height ,
2069
+ ref_images[i]-> width ,
2070
+ ref_images[i]-> height ,
2054
2071
3 ,
2055
2072
1 );
2056
- sd_image_to_tensor (sd_img_gen_params-> ref_images [i]. data , img);
2073
+ sd_image_to_tensor (ref_images[i]-> data , img);
2057
2074
2058
2075
ggml_tensor* latent = NULL ;
2059
2076
if (sd_ctx->sd ->use_tiny_autoencoder ) {
@@ -2086,7 +2103,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
2086
2103
SAFE_STR (sd_img_gen_params->prompt ),
2087
2104
SAFE_STR (sd_img_gen_params->negative_prompt ),
2088
2105
sd_img_gen_params->clip_skip ,
2089
- sd_img_gen_params-> guidance ,
2106
+ guidance,
2090
2107
sd_img_gen_params->eta ,
2091
2108
width,
2092
2109
height,
0 commit comments