
Commit c63cc26

add v2.0

1 parent e83698b commit c63cc26

1 file changed: 88 additions (+), 28 deletions (-)

examples/llava/clip.cpp (+88, -28)
@@ -126,6 +126,7 @@ static std::string format(const char * fmt, ...) {
 #define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE "model.image_newline"
 
+#define TN_MINICPMV_POS_EMBD "resampler.pos_embed"
 #define TN_MINICPMV_POS_EMBD_K "resampler.pos_embed_k"
 #define TN_MINICPMV_QUERY "resampler.query"
 #define TN_MINICPMV_PROJ "resampler.proj.weight"
@@ -502,6 +503,7 @@ struct clip_vision_model {
     struct ggml_tensor * mm_model_peg_0_b;
 
     // MINICPMV projection
+    struct ggml_tensor * mm_model_pos_embed;
     struct ggml_tensor * mm_model_pos_embed_k;
     struct ggml_tensor * mm_model_query;
     struct ggml_tensor * mm_model_proj;
@@ -644,7 +646,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     if (ctx->has_minicpmv_projector) {
         int pos_w = image_size_width/patch_size;
         int pos_h = image_size_height/patch_size;
-        if (ctx->minicpmv_version == 2) {
+        if (ctx->minicpmv_version == 1) {
+            pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 2304, pos_w * pos_h, 1);
+        }
+        else if (ctx->minicpmv_version == 2) {
             pos_embed = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 4096, pos_w * pos_h, 1);
         }
         else if (ctx->minicpmv_version == 3) {
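
Note: this dispatch on minicpmv_version recurs throughout the patch (attention shapes, position-embedding fill, clip_n_mmproj_embd). A hypothetical helper, not part of the commit, sketching the width mapping it encodes: 2304 is the MiniCPM-V 2.0 (MiniCPM-2.4B) embedding width added here, 4096 the existing width for version 2.

    // Hypothetical consolidation of the repeated version checks (sketch only).
    static int minicpmv_embed_dim(int minicpmv_version) {
        switch (minicpmv_version) {
            case 1:  return 2304;   // MiniCPM-V 2.0, added by this commit
            case 2:  return 4096;
            default: return 4096;   // version 3 keeps its own branch in the file
        }
    }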
@@ -952,16 +957,23 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             v = ggml_add(ctx0, ggml_mul(ctx0, v, model.mm_model_ln_kv_w), model.mm_model_ln_kv_b);
         }
         { // position
-            // q = ggml_add(ctx0, q, model.mm_model_pos_embed);
-            k = ggml_add(ctx0, v, pos_embed);
+            if (ctx->minicpmv_version == 1) {
+                q = ggml_add(ctx0, q, model.mm_model_pos_embed);
+            }
+            k = ggml_add(ctx0, v, pos_embed);
         }
 
         { // attention
             int hidden_size = 4096;
             const int d_head = 128;
             int n_head = hidden_size/d_head;
             int num_query = 96;
-            if (ctx->minicpmv_version == 2) {
+            if (ctx->minicpmv_version == 1) {
+                hidden_size = 2304;
+                n_head = hidden_size/d_head;
+                num_query = 64;
+            }
+            else if (ctx->minicpmv_version == 2) {
                 hidden_size = 4096;
                 n_head = hidden_size/d_head;
                 num_query = 96;
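
Note: with d_head fixed at 128, the constants above imply 2304/128 = 18 attention heads over 64 resampler queries for version 1, versus 4096/128 = 32 heads over 96 queries for version 2. A compile-time restatement of that arithmetic (illustration only, not part of the commit):

    static_assert(2304 / 128 == 18, "minicpmv v1: 18 heads, 64 queries");
    static_assert(4096 / 128 == 32, "minicpmv v2: 32 heads, 96 queries");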
@@ -1421,7 +1433,9 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
     }
     else if (new_clip->proj_type == PROJECTOR_TYPE_RESAMPLER) {
-        // vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
+        if (new_clip->minicpmv_version == 1) {
+            vision_model.mm_model_pos_embed = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD);
+        }
        vision_model.mm_model_pos_embed_k = get_tensor(new_clip->ctx_data, TN_MINICPMV_POS_EMBD_K);
        vision_model.mm_model_query = get_tensor(new_clip->ctx_data, TN_MINICPMV_QUERY);
        vision_model.mm_model_proj = get_tensor(new_clip->ctx_data, TN_MINICPMV_PROJ);
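
Note: the version guard here appears load-bearing: only the version-1 (2.0) gguf ships a learned resampler.pos_embed tensor, and clip.cpp's get_tensor helper errors out on a missing name, so loading it unconditionally would presumably break version 2/3 models, which rely on the computed sincos tables (pos_embed_k / pos_embed) instead.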
@@ -1913,7 +1927,7 @@ static std::vector<std::vector<clip_image_u8 *>> uhd_slice_image(const clip_imag
 }
 
 int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip){
-    const int max_slice_nums=9;
+    const int max_slice_nums=ctx_clip->max_slice_nums;
     const int scale_resolution=448;
     const int original_width = ctx_clip->load_image_size->width;
     const int original_height = ctx_clip->load_image_size->height;
@@ -1929,25 +1943,51 @@ int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip){
 bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
 
     if(clip_is_minicpmv(ctx)){
-        std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, ctx->max_slice_nums);
-        res_imgs->size = 0;
-        for (size_t i = 0; i < imgs.size(); ++i){
-            res_imgs->size += imgs[i].size();
-        }
-        res_imgs->data = new clip_image_f32[res_imgs->size];
-        int idx = 0;
-        for (size_t i = 0; i < imgs.size(); ++i){
-            for (size_t j = 0; j < imgs[i].size(); ++j) {
-                LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
-                clip_image_f32 * res = clip_image_f32_init();
-                normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
-                res_imgs->data[idx++] = *res;
-                clip_image_f32_free(res);
+        if (ctx->minicpmv_version > 1)
+        {
+            std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, ctx->max_slice_nums);
+            res_imgs->size = 0;
+            for (size_t i = 0; i < imgs.size(); ++i){
+                res_imgs->size += imgs[i].size();
+            }
+            res_imgs->data = new clip_image_f32[res_imgs->size];
+            int idx = 0;
+            for (size_t i = 0; i < imgs.size(); ++i){
+                for (size_t j = 0; j < imgs[i].size(); ++j) {
+                    LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
+                    clip_image_f32 * res = clip_image_f32_init();
+                    normalize_image_u8_to_f32(imgs[i][j], res, ctx->image_mean, ctx->image_std);
+                    res_imgs->data[idx++] = *res;
+                    clip_image_f32_free(res);
+                }
             }
+            return true;
+        }
+        else {
+            if (res_imgs->size == 0){
+                std::vector<std::vector<clip_image_u8 *>> imgs = uhd_slice_image(img, ctx->max_slice_nums);
+                res_imgs->size = 0;
+                for (size_t i = 0; i < imgs.size(); ++i){
+                    res_imgs->size += imgs[i].size();
+                }
+                res_imgs->data = new clip_image_f32[res_imgs->size];
+                int idx = 0;
+
+                for (size_t i = 0; i < imgs.size(); ++i){
+                    for (size_t j = 0; j < imgs[i].size(); ++j) {
+                        LOG_TEE("%s: %d %d\n", __func__,imgs[i][j]->nx,imgs[i][j]->ny);
+                        clip_image_f32_batch img_res_v_batch;
+                        img_res_v_batch.size = 1;
+                        img_res_v_batch.data = nullptr;
+                        clip_image_preprocess(ctx, imgs[i][j], &img_res_v_batch);
+                        res_imgs->data[idx++] = img_res_v_batch.data[0];
+                    }
+                }
+                return true;
+            }
+        }
     }
-        return true;
-    }
-
+
     bool pad_to_square = true;
     if (!ctx->has_vision_encoder) {
         LOG_TEE("This gguf file seems to have no vision encoder\n");
@@ -2164,7 +2204,10 @@ int clip_n_patches(const struct clip_ctx * ctx) {
     if (ctx->proj_type == PROJECTOR_TYPE_LDP || ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
         n_patches /= 4;
     } else if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
-        if (ctx->minicpmv_version == 2) {
+        if (ctx->minicpmv_version == 1) {
+            n_patches = 64;
+        }
+        else if (ctx->minicpmv_version == 2) {
             n_patches = 96;
         }
         else if (ctx->minicpmv_version == 3) {
@@ -2310,7 +2353,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }
     const int pos_w = ctx->load_image_size->width/patch_size;
     const int pos_h = ctx->load_image_size->height/patch_size;
-
+
     {
         struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
         float * data = (float *)malloc(ggml_nbytes(inp_raw));
@@ -2337,8 +2380,19 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         ggml_backend_tensor_set(inp_raw, data, 0, ggml_nbytes(inp_raw));
         free(data);
     }
-    if (ctx->has_minicpmv_projector) {
+    if (ctx->minicpmv_version) {
+        if (ctx->minicpmv_version == 1)
         {
+            struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
+
+            int* positions_data = (int*)malloc(ggml_nbytes(positions));
+            for (int i = 0; i < num_positions; i++) {
+                positions_data[i] = i;
+            }
+            ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
+            free(positions_data);
+        }
+        else {
             // inspired from siglip:
             // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit
             // -> https://huggingface.co/HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit/blob/d66538faeba44480d0bfaa42145eef26f9423199/modeling_siglip.py#L316
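
Note: version 1 fills the "positions" tensor with plain sequential indices, while the else branch keeps the siglip-style bucketed coordinates used by the later versions. A standalone restatement of the v1 fill (illustration only; in the file the buffer is copied into the graph via ggml_backend_tensor_set):

    #include <cstdlib>

    int main() {
        const int num_positions = 16;    // e.g. (w/patch_size)*(h/patch_size)
        int * positions_data = (int *) malloc(num_positions * sizeof(int));
        for (int i = 0; i < num_positions; i++) {
            positions_data[i] = i;       // identity position ids
        }
        // ... ggml_backend_tensor_set(positions, positions_data, ...) here
        free(positions_data);
        return 0;
    }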
@@ -2360,14 +2414,17 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             ggml_backend_tensor_set(positions, positions_data, 0, ggml_nbytes(positions));
             free(positions_data);
         }
-
+
         {
             // inspired from resampler of Qwen-VL:
             // -> https://huggingface.co/Qwen/Qwen-VL/tree/main
             // -> https://huggingface.co/Qwen/Qwen-VL/blob/0547ed36a86561e2e42fecec8fd0c4f6953e33c4/visual.py#L23
             struct ggml_tensor * pos_embed = ggml_graph_get_tensor(gf, "pos_embed");
             int embed_dim = 4096;
-            if (ctx->minicpmv_version == 2) {
+            if (ctx->minicpmv_version == 1) {
+                embed_dim = 2304;
+            }
+            else if (ctx->minicpmv_version == 2) {
                 embed_dim = 4096;
             }
             else if (ctx->minicpmv_version == 3) {
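
Note: embed_dim here sizes the 2D sincos table that fills pos_embed, which is why it must track the per-version width. A sketch of the MAE-style construction the Qwen-VL references describe; the exact channel layout is an assumption, and the file's own get_2d_sincos_pos_embed is authoritative:

    #include <cmath>
    #include <vector>

    // Per axis, MAE-style 1D sincos: first half sin, second half cos.
    static void encode_axis(float * dst, int dim, float pos) {
        for (int i = 0; i < dim / 2; i++) {
            const double omega = 1.0 / std::pow(10000.0, (double) i / (dim / 2));
            dst[i]           = (float) std::sin(pos * omega);
            dst[dim / 2 + i] = (float) std::cos(pos * omega);
        }
    }

    // 2D table: first embed_dim/2 channels encode y, the rest encode x.
    std::vector<float> sincos_2d(int embed_dim, int pos_w, int pos_h) {
        std::vector<float> pe((size_t) pos_w * pos_h * embed_dim);
        for (int y = 0; y < pos_h; y++) {
            for (int x = 0; x < pos_w; x++) {
                float * row = pe.data() + ((size_t) y * pos_w + x) * embed_dim;
                encode_axis(row, embed_dim / 2, (float) y);
                encode_axis(row + embed_dim / 2, embed_dim / 2, (float) x);
            }
        }
        return pe;
    }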
@@ -2588,7 +2645,10 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         return ctx->vision_model.mm_3_b->ne[0];
     }
     if (ctx->proj_type == PROJECTOR_TYPE_RESAMPLER) {
-        if (ctx->minicpmv_version == 2) {
+        if (ctx->minicpmv_version == 1) {
+            return 2304;
+        }
+        else if (ctx->minicpmv_version == 2) {
             return 4096;
         }
         else if (ctx->minicpmv_version == 3) {
