@@ -126,6 +126,7 @@ static std::string format(const char * fmt, ...) {
126
126
#define TN_MVLM_PROJ_PEG " mm.model.peg.%d.%s"
127
127
#define TN_IMAGE_NEWLINE " model.image_newline"
128
128
129
+ #define TN_MINICPMV_POS_EMBD " resampler.pos_embed"
129
130
#define TN_MINICPMV_POS_EMBD_K " resampler.pos_embed_k"
130
131
#define TN_MINICPMV_QUERY " resampler.query"
131
132
#define TN_MINICPMV_PROJ " resampler.proj.weight"
@@ -502,6 +503,7 @@ struct clip_vision_model {
502
503
struct ggml_tensor * mm_model_peg_0_b;
503
504
504
505
// MINICPMV projection
506
+ struct ggml_tensor * mm_model_pos_embed;
505
507
struct ggml_tensor * mm_model_pos_embed_k;
506
508
struct ggml_tensor * mm_model_query;
507
509
struct ggml_tensor * mm_model_proj;
@@ -644,7 +646,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
644
646
if (ctx->has_minicpmv_projector ) {
645
647
int pos_w = image_size_width/patch_size;
646
648
int pos_h = image_size_height/patch_size;
647
- if (ctx->minicpmv_version == 2 ) {
649
+ if (ctx->minicpmv_version == 1 ) {
650
+ pos_embed = ggml_new_tensor_3d (ctx0, GGML_TYPE_F32, 2304 , pos_w * pos_h, 1 );
651
+ }
652
+ else if (ctx->minicpmv_version == 2 ) {
648
653
pos_embed = ggml_new_tensor_3d (ctx0, GGML_TYPE_F32, 4096 , pos_w * pos_h, 1 );
649
654
}
650
655
else if (ctx->minicpmv_version == 3 ) {
@@ -952,16 +957,23 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
952
957
v = ggml_add (ctx0, ggml_mul (ctx0, v, model.mm_model_ln_kv_w ), model.mm_model_ln_kv_b );
953
958
}
954
959
{ // position
955
- // q = ggml_add(ctx0, q, model.mm_model_pos_embed);
956
- k = ggml_add (ctx0, v, pos_embed);
960
+ if (ctx->minicpmv_version == 1 ) {
961
+ q = ggml_add (ctx0, q, model.mm_model_pos_embed );
962
+ }
963
+ k = ggml_add (ctx0, v, pos_embed);
957
964
}
958
965
959
966
{ // attention
960
967
int hidden_size = 4096 ;
961
968
const int d_head = 128 ;
962
969
int n_head = hidden_size/d_head;
963
970
int num_query = 96 ;
964
- if (ctx->minicpmv_version == 2 ) {
971
+ if (ctx->minicpmv_version == 1 ) {
972
+ hidden_size = 2304 ;
973
+ n_head = hidden_size/d_head;
974
+ num_query = 64 ;
975
+ }
976
+ else if (ctx->minicpmv_version == 2 ) {
965
977
hidden_size = 4096 ;
966
978
n_head = hidden_size/d_head;
967
979
num_query = 96 ;
@@ -1421,7 +1433,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
1421
1433
vision_model.mm_model_peg_0_b = get_tensor (new_clip->ctx_data , format (TN_MVLM_PROJ_PEG, 0 , " bias" ));
1422
1434
}
1423
1435
else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) {
1424
- // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
1436
+ if (new_clip->minicpmv_version == 1 ) {
1437
+ vision_model.mm_model_pos_embed = get_tensor (new_clip->ctx_data , TN_MINICPMV_POS_EMBD);
1438
+ }
1425
1439
vision_model.mm_model_pos_embed_k = get_tensor (new_clip->ctx_data , TN_MINICPMV_POS_EMBD_K);
1426
1440
vision_model.mm_model_query = get_tensor (new_clip->ctx_data , TN_MINICPMV_QUERY);
1427
1441
vision_model.mm_model_proj = get_tensor (new_clip->ctx_data , TN_MINICPMV_PROJ);
@@ -1913,7 +1927,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
1913
1927
}
1914
1928
1915
1929
int clip_uhd_num_image_embeds_col (struct clip_ctx * ctx_clip){
1916
- const int max_slice_nums=9 ;
1930
+ const int max_slice_nums=ctx_clip-> max_slice_nums ;
1917
1931
const int scale_resolution=448 ;
1918
1932
const int original_width = ctx_clip->load_image_size ->width ;
1919
1933
const int original_height = ctx_clip->load_image_size ->height ;
@@ -1929,25 +1943,51 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip){
1929
1943
bool clip_image_preprocess (struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
1930
1944
1931
1945
if (clip_is_minicpmv (ctx)){
1932
- std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image (img, ctx->max_slice_nums );
1933
- res_imgs->size = 0 ;
1934
- for (size_t i = 0 ; i < imgs.size (); ++i){
1935
- res_imgs->size += imgs[i].size ();
1936
- }
1937
- res_imgs->data = new clip_image_f32[res_imgs->size ];
1938
- int idx = 0 ;
1939
- for (size_t i = 0 ; i < imgs.size (); ++i){
1940
- for (size_t j = 0 ; j < imgs[i].size (); ++j) {
1941
- LOG_TEE (" %s: %d %d\n " , __func__,imgs[i][j]->nx ,imgs[i][j]->ny );
1942
- clip_image_f32 * res = clip_image_f32_init ();
1943
- normalize_image_u8_to_f32 (imgs[i][j], res, ctx->image_mean , ctx->image_std );
1944
- res_imgs->data [idx++] = *res;
1945
- clip_image_f32_free (res);
1946
+ if (ctx->minicpmv_version >1 )
1947
+ {
1948
+ std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image (img, ctx->max_slice_nums );
1949
+ res_imgs->size = 0 ;
1950
+ for (size_t i = 0 ; i < imgs.size (); ++i){
1951
+ res_imgs->size += imgs[i].size ();
1952
+ }
1953
+ res_imgs->data = new clip_image_f32[res_imgs->size ];
1954
+ int idx = 0 ;
1955
+ for (size_t i = 0 ; i < imgs.size (); ++i){
1956
+ for (size_t j = 0 ; j < imgs[i].size (); ++j) {
1957
+ LOG_TEE (" %s: %d %d\n " , __func__,imgs[i][j]->nx ,imgs[i][j]->ny );
1958
+ clip_image_f32 * res = clip_image_f32_init ();
1959
+ normalize_image_u8_to_f32 (imgs[i][j], res, ctx->image_mean , ctx->image_std );
1960
+ res_imgs->data [idx++] = *res;
1961
+ clip_image_f32_free (res);
1962
+ }
1946
1963
}
1964
+ return true ;
1965
+ }
1966
+ else {
1967
+ if (res_imgs->size == 0 ){
1968
+ std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image (img, ctx->max_slice_nums );
1969
+ res_imgs->size = 0 ;
1970
+ for (size_t i = 0 ; i < imgs.size (); ++i){
1971
+ res_imgs->size += imgs[i].size ();
1972
+ }
1973
+ res_imgs->data = new clip_image_f32[res_imgs->size ];
1974
+ int idx = 0 ;
1975
+
1976
+ for (size_t i = 0 ; i < imgs.size (); ++i){
1977
+ for (size_t j = 0 ; j < imgs[i].size (); ++j) {
1978
+ LOG_TEE (" %s: %d %d\n " , __func__,imgs[i][j]->nx ,imgs[i][j]->ny );
1979
+ clip_image_f32_batch img_res_v_batch;
1980
+ img_res_v_batch.size = 1 ;
1981
+ img_res_v_batch.data = nullptr ;
1982
+ clip_image_preprocess (ctx, imgs[i][j], &img_res_v_batch);
1983
+ res_imgs->data [idx++] = img_res_v_batch.data [0 ];
1984
+ }
1985
+ }
1986
+ return true ;
1987
+ }
1947
1988
}
1948
- return true ;
1949
1989
}
1950
-
1990
+
1951
1991
bool pad_to_square = true ;
1952
1992
if (!ctx->has_vision_encoder ) {
1953
1993
LOG_TEE (" This gguf file seems to have no vision encoder\n " );
@@ -2164,7 +2204,10 @@ int clip_n_patches(const struct clip_ctx * ctx) {
2164
2204
if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
2165
2205
n_patches /= 4 ;
2166
2206
} else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
2167
- if (ctx->minicpmv_version == 2 ) {
2207
+ if (ctx->minicpmv_version == 1 ) {
2208
+ n_patches = 64 ;
2209
+ }
2210
+ else if (ctx->minicpmv_version == 2 ) {
2168
2211
n_patches = 96 ;
2169
2212
}
2170
2213
else if (ctx->minicpmv_version == 3 ) {
@@ -2310,7 +2353,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
2310
2353
}
2311
2354
const int pos_w = ctx->load_image_size ->width /patch_size;
2312
2355
const int pos_h = ctx->load_image_size ->height /patch_size;
2313
-
2356
+
2314
2357
{
2315
2358
struct ggml_tensor * inp_raw = ggml_graph_get_tensor (gf, " inp_raw" );
2316
2359
float * data = (float *)malloc (ggml_nbytes (inp_raw));
@@ -2337,8 +2380,19 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
2337
2380
ggml_backend_tensor_set (inp_raw, data, 0 , ggml_nbytes (inp_raw));
2338
2381
free (data);
2339
2382
}
2340
- if (ctx->has_minicpmv_projector ) {
2383
+ if (ctx->minicpmv_version ) {
2384
+ if (ctx->minicpmv_version == 1 )
2341
2385
{
2386
+ struct ggml_tensor * positions = ggml_graph_get_tensor (gf, " positions" );
2387
+
2388
+ int * positions_data = (int *)malloc (ggml_nbytes (positions));
2389
+ for (int i = 0 ; i < num_positions; i++) {
2390
+ positions_data[i] = i;
2391
+ }
2392
+ ggml_backend_tensor_set (positions, positions_data, 0 , ggml_nbytes (positions));
2393
+ free (positions_data);
2394
+ }
2395
+ else {
2342
2396
// inspired from siglip:
2343
2397
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
2344
2398
// -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
@@ -2360,14 +2414,17 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
2360
2414
ggml_backend_tensor_set (positions, positions_data, 0 , ggml_nbytes (positions));
2361
2415
free (positions_data);
2362
2416
}
2363
-
2417
+
2364
2418
{
2365
2419
// inspired from resampler of Qwen-VL:
2366
2420
// -> https://huggingface.co/Qwen/Qwen-VL/tree/main
2367
2421
// -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
2368
2422
struct ggml_tensor * pos_embed = ggml_graph_get_tensor (gf, " pos_embed" );
2369
2423
int embed_dim = 4096 ;
2370
- if (ctx->minicpmv_version == 2 ) {
2424
+ if (ctx->minicpmv_version == 1 ) {
2425
+ embed_dim = 2304 ;
2426
+ }
2427
+ else if (ctx->minicpmv_version == 2 ) {
2371
2428
embed_dim = 4096 ;
2372
2429
}
2373
2430
else if (ctx->minicpmv_version == 3 ) {
@@ -2588,7 +2645,10 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
2588
2645
return ctx->vision_model .mm_3_b ->ne [0 ];
2589
2646
}
2590
2647
if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
2591
- if (ctx->minicpmv_version == 2 ) {
2648
+ if (ctx->minicpmv_version == 1 ) {
2649
+ return 2304 ;
2650
+ }
2651
+ else if (ctx->minicpmv_version == 2 ) {
2592
2652
return 4096 ;
2593
2653
}
2594
2654
else if (ctx->minicpmv_version == 3 ) {
0 commit comments