@@ -440,6 +440,10 @@ class StableDiffusionGGML {
         diffusion_model->alloc_params_buffer();
         diffusion_model->get_param_tensors(tensors);

+        if (sd_version_is_unet_edit(version)) {
+            vae_decode_only = false;
+        }
+
         if (high_noise_diffusion_model) {
             high_noise_diffusion_model->alloc_params_buffer();
             high_noise_diffusion_model->get_param_tensors(tensors);
@@ -2300,23 +2304,36 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
         init_latent = generate_init_latent(sd_ctx, work_ctx, width, height);
     }

+    sd_guidance_params_t guidance = sd_img_gen_params->sample_params.guidance;
+    std::vector<sd_image_t*> ref_images;
+    for (int i = 0; i < sd_img_gen_params->ref_images_count; i++) {
+        ref_images.push_back(&sd_img_gen_params->ref_images[i]);
+    }
+
+    std::vector<uint8_t> empty_image_data;
+    sd_image_t empty_image = {(uint32_t)width, (uint32_t)height, 3, nullptr};
+    if (ref_images.empty() && sd_version_is_unet_edit(sd_ctx->sd->version))
+    {
+        LOG_WARN("This model needs at least one reference image; using an empty reference");
+        empty_image_data.resize(width * height * 3);
+        ref_images.push_back(&empty_image);
+        empty_image.data = empty_image_data.data();
+        guidance.img_cfg = 0.f;
+    }
+
     if (sd_img_gen_params->ref_images_count > 0) {
         LOG_INFO("EDIT mode");
     }
-    else if (sd_ctx->sd->version == VERSION_SD1_PIX2PIX || sd_ctx->sd->version == VERSION_SDXL_PIX2PIX) {
-        LOG_ERROR("This model needs at least one reference image");
-        return NULL;
-    }

     std::vector<struct ggml_tensor*> ref_latents;
-    for (int i = 0; i < sd_img_gen_params->ref_images_count; i++) {
+    for (int i = 0; i < ref_images.size(); i++) {
         ggml_tensor* img = ggml_new_tensor_4d(work_ctx,
                                               GGML_TYPE_F32,
-                                              sd_img_gen_params->ref_images[i].width,
-                                              sd_img_gen_params->ref_images[i].height,
+                                              ref_images[i]->width,
+                                              ref_images[i]->height,
                                               3,
                                               1);
-        sd_image_to_tensor(sd_img_gen_params->ref_images[i].data, img);
+        sd_image_to_tensor(ref_images[i]->data, img);

         ggml_tensor* latent = NULL;
         if (sd_ctx->sd->use_tiny_autoencoder) {
@@ -2349,7 +2366,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
                                            SAFE_STR(sd_img_gen_params->prompt),
                                            SAFE_STR(sd_img_gen_params->negative_prompt),
                                            sd_img_gen_params->clip_skip,
-                                           sd_img_gen_params->sample_params.guidance,
+                                           guidance,
                                            sd_img_gen_params->sample_params.eta,
                                            width,
                                            height,
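
For context, here is a minimal standalone sketch of the zero-filled placeholder reference used above. The `image_stub` struct and the `main` driver are simplified stand-ins for illustration only (the real `sd_image_t` and sampler wiring live in the library); it shows why the backing `std::vector<uint8_t>` should be value-initialized and must outlive the image struct whose `data` pointer it backs.

```cpp
// Minimal sketch of the placeholder "empty reference image" pattern.
// image_stub is a simplified stand-in for sd_image_t
// (width/height/channel plus a raw data pointer).
#include <cstdint>
#include <cstdio>
#include <vector>

struct image_stub {
    uint32_t width;
    uint32_t height;
    uint32_t channel;
    uint8_t* data;
};

int main() {
    const uint32_t width = 8, height = 8;

    // The backing store must outlive the struct that points into it.
    // Value-initialization (all zeros) makes the placeholder a
    // deterministic black image rather than uninitialized bytes.
    std::vector<uint8_t> empty_image_data(width * height * 3, 0);
    image_stub empty_image = {width, height, 3, empty_image_data.data()};

    // In the diff, img_cfg is forced to 0 so the placeholder carries no
    // image-conditioning weight; here it is just a plain float for show.
    float img_cfg = 0.f;

    std::printf("placeholder %ux%ux%u, first byte = %u, img_cfg = %.1f\n",
                empty_image.width, empty_image.height, empty_image.channel,
                (unsigned)empty_image.data[0], img_cfg);
    return 0;
}
```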