@@ -448,6 +448,10 @@ class StableDiffusionGGML {
         diffusion_model->alloc_params_buffer();
         diffusion_model->get_param_tensors(tensors);
 
+        if (sd_version_is_unet_edit(version)) {
+            vae_decode_only = false;
+        }
+
         if (high_noise_diffusion_model) {
             high_noise_diffusion_model->alloc_params_buffer();
             high_noise_diffusion_model->get_param_tensors(tensors);
@@ -2319,23 +2323,36 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
         init_latent = generate_init_latent(sd_ctx, work_ctx, width, height);
     }
 
+    sd_guidance_params_t guidance = sd_img_gen_params->sample_params.guidance;
+    std::vector<sd_image_t*> ref_images;
+    for (int i = 0; i < sd_img_gen_params->ref_images_count; i++) {
+        ref_images.push_back(&sd_img_gen_params->ref_images[i]);
+    }
+
+    std::vector<uint8_t> empty_image_data;
+    sd_image_t empty_image = {(uint32_t)width, (uint32_t)height, 3, nullptr};
+    if (ref_images.empty() && sd_version_is_unet_edit(sd_ctx->sd->version))
+    {
+        LOG_WARN("This model needs at least one reference image; using an empty reference");
+        empty_image_data.reserve(width * height * 3);
+        ref_images.push_back(&empty_image);
+        empty_image.data = empty_image_data.data();
+        guidance.img_cfg = 0.f;
+    }
+
     if (sd_img_gen_params->ref_images_count > 0) {
         LOG_INFO("EDIT mode");
     }
-    else if (sd_ctx->sd->version == VERSION_SD1_PIX2PIX || sd_ctx->sd->version == VERSION_SDXL_PIX2PIX) {
-        LOG_ERROR("This model needs at least one reference image");
-        return NULL;
-    }
 
     std::vector<ggml_tensor*> ref_latents;
-    for (int i = 0; i < sd_img_gen_params->ref_images_count; i++) {
+    for (int i = 0; i < ref_images.size(); i++) {
         ggml_tensor* img = ggml_new_tensor_4d(work_ctx,
                                               GGML_TYPE_F32,
-                                              sd_img_gen_params->ref_images[i].width,
-                                              sd_img_gen_params->ref_images[i].height,
+                                              ref_images[i]->width,
+                                              ref_images[i]->height,
                                               3,
                                               1);
-        sd_image_to_tensor(sd_img_gen_params->ref_images[i].data, img);
+        sd_image_to_tensor(ref_images[i]->data, img);
 
         ggml_tensor* latent = NULL;
         if (sd_ctx->sd->use_tiny_autoencoder) {
@@ -2368,7 +2385,7 @@ sd_image_t* generate_image(sd_ctx_t* sd_ctx, const sd_img_gen_params_t* sd_img_g
                                      SAFE_STR(sd_img_gen_params->prompt),
                                      SAFE_STR(sd_img_gen_params->negative_prompt),
                                      sd_img_gen_params->clip_skip,
-                                     sd_img_gen_params->sample_params.guidance,
+                                     guidance,
                                      sd_img_gen_params->sample_params.eta,
                                      width,
                                      height,