diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 4087187c19834..710779e814bdd 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -603,6 +603,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "855059429035d75a914d1eda9f10a876752e281a054a7a3d421ef0533e5b6249":
             # ref: https://huggingface.co/HuggingFaceTB/SmolLM-135M
             res = "smollm"
+        if chkhsh == "c7699093ba4255a91e702aa38a596aa81669f3525dae06c2953267dde580f448":
+            # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-zh
+            res = "jina-v2-zh"
 
         if res is None:
             logger.warning("\n")
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index d5a2d925eaef5..11702b5043580 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -94,6 +94,7 @@ class TOKENIZER_TYPE(IntEnum):
    {"name": "codeshell",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B", },
    {"name": "tekken",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407", },
    {"name": "smollm",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M", },
+   {"name": "jina-v2-zh",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-zh", },
 ]
diff --git a/include/llama.h b/include/llama.h
index 413070d95a5c4..433f0d6f3bad9 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -95,6 +95,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_TEKKEN         = 20,
         LLAMA_VOCAB_PRE_TYPE_SMOLLM         = 21,
         LLAMA_VOCAB_PRE_TYPE_CODESHELL      = 22,
+        LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH     = 23,
     };
 
     // note: these values should be synchronized with ggml_rope
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index c482b36899a1c..1696b86b34e89 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include <regex>
 
 //
 // helpers
@@ -446,6 +447,9 @@ struct llm_tokenizer_bpe {
                     "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
+                regex_exprs = {"\\w+|[^\\w\\s]+"};
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -498,7 +502,20 @@ struct llm_tokenizer_bpe {
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
         int final_prev_index = -1;
 
-        const auto word_collection = unicode_regex_split(text, regex_exprs);
+        std::vector<std::string> word_collection;
+        if (vocab.type_pre == LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH) {
+            // jina-v2-zh lowercases its input before splitting
+            std::string lowercase_text = lowercase(text);
+
+            std::regex regexPattern(regex_exprs[0]);
+            std::sregex_token_iterator it(lowercase_text.begin(), lowercase_text.end(), regexPattern);
+            std::sregex_token_iterator end;
+
+            while (it != end) {
+                word_collection.push_back(*it++);
+            }
+        } else {
+            word_collection = unicode_regex_split(text, regex_exprs);
+        }
 
         symbols_final.clear();
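For illustration, here is a minimal standalone sketch (not part of the patch) of the split performed by the LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH branch above. Note that std::regex operates on bytes, so \w only matches ASCII word characters; multi-byte UTF-8 sequences (such as Chinese text) fall through to the [^\w\s]+ alternative and are kept as contiguous runs.

// standalone sketch of the jina-v2-zh pre-tokenizer split
#include <iostream>
#include <regex>
#include <string>
#include <vector>

int main() {
    // input is assumed to be lowercased already, as tokenize() does via lowercase()
    const std::string text = "hello, world! 你好";
    const std::regex pattern("\\w+|[^\\w\\s]+"); // same pattern as regex_exprs[0] above

    std::vector<std::string> words;
    for (std::sregex_token_iterator it(text.begin(), text.end(), pattern), end; it != end; ++it) {
        words.push_back(*it);
    }

    for (const std::string & w : words) {
        std::cout << "[" << w << "]"; // prints: [hello][,][world][!][你好]
    }
    std::cout << "\n";
}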
diff --git a/src/llama.cpp b/src/llama.cpp
index bc0183741db4e..51cf3c6a2b78a 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -5385,8 +5385,8 @@ static void llm_load_vocab(
                 tokenizer_pre == "jina-v2-de" ||
                 tokenizer_pre == "jina-v2-code") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
-            } else if (
-                tokenizer_pre == "refact") {
+
+            } else if (tokenizer_pre == "refact") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
             } else if (
                 tokenizer_pre == "command-r") {
@@ -5436,6 +5436,9 @@ static void llm_load_vocab(
             } else if (
                 tokenizer_pre == "codeshell") {
                 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_CODESHELL;
+            } else if (
+                tokenizer_pre == "jina-v2-zh") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -5486,8 +5489,7 @@ static void llm_load_vocab(
         for (uint32_t i = 0; i < n_vocab; i++) {
             std::string word = gguf_get_arr_str(ctx, token_idx, i);
-            GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0);
-
+            //GGML_ASSERT(unicode_cpts_from_utf8(word).size() > 0); // check removed: some vocabs (e.g. jinaai/jina-embeddings-v2-base-zh) mistakenly contain NUL in the vocab (not ideal if it happens more than once)
             vocab.token_to_id[word] = i;
             vocab.max_token_len = std::max(vocab.max_token_len, (int) word.size());
@@ -5560,9 +5562,18 @@ static void llm_load_vocab(
         } else if (vocab.type == LLAMA_VOCAB_TYPE_WPM) {
             vocab.linefeed_id = vocab.special_pad_id;
         } else {
-            const std::vector<llama_vocab::id> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
-            GGML_ASSERT(!ids.empty() && "model vocab missing newline token");
-            vocab.linefeed_id = ids[0];
+            try {
+                const std::vector<llama_vocab::id> ids = llama_tokenize_internal(vocab, "\xC4\x8A", false); // U+010A
+                if (ids.empty()) {
+                    LLAMA_LOG_WARN("%s: %s vocabulary, but newline token not found: %s! Using special_pad_id instead.\n", __func__, llama_model_vocab_type_name(vocab.type), "\xC4\x8A");
+                    vocab.linefeed_id = vocab.special_pad_id;
+                } else {
+                    vocab.linefeed_id = ids[0];
+                }
+            } catch (const std::exception & e) {
+                LLAMA_LOG_WARN("%s: %s vocabulary, but newline token not found: %s! Using special_pad_id instead.\n", __func__, llama_model_vocab_type_name(vocab.type), e.what());
+                vocab.linefeed_id = vocab.special_pad_id;
+            }
         }
 
         // special tokens
diff --git a/src/unicode.cpp b/src/unicode.cpp
index 46650bff06d15..2a95a01bec974 100644
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@@ -816,3 +816,14 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
 
     return unicode_byte_encoding_process(bpe_words);
 }
+
+std::string lowercase(const std::string & text) {
+    std::string result;
+    const std::vector<uint32_t> cpts = unicode_cpts_from_utf8(text);
+
+    for (const char32_t cpt : cpts) {
+        result += unicode_cpt_to_utf8(unicode_tolower(cpt)); // fold each code point and append it
+    }
+
+    return result;
+}
diff --git a/src/unicode.h b/src/unicode.h
index 008532a242ab8..ec2bc3e1820e4 100644
--- a/src/unicode.h
+++ b/src/unicode.h
@@ -65,3 +65,5 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8);
 uint32_t unicode_tolower(uint32_t cp);
 
 std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);
+
+std::string lowercase(const std::string & text);
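For completeness, a usage sketch for the new lowercase() helper (not part of the patch; assumes building against src/unicode.cpp and unicode.h from this change). Because the fold is applied per Unicode code point via unicode_tolower, non-ASCII letters are lowered too, which a byte-wise std::tolower would miss.

// usage sketch: compile and link together with src/unicode.cpp
#include <cstdio>
#include <string>

#include "unicode.h"

int main() {
    // 'G' and "WORLD" are folded byte-wise; 'ü', 'ß' and the Chinese
    // characters are handled per code point and pass through unchanged
    const std::string folded = lowercase("Grüße, WORLD! 你好");
    printf("%s\n", folded.c_str()); // expected: grüße, world! 你好
}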