Skip to content

Commit

Permalink
Merge branch 'feat-jina-embeddings-v2-zh' of https://github.com/JoanFM/llama.cpp into feat-jina-embeddings-v2-zh
Browse files: browse the repository at this point in the history
  • Loading branch information
JoanFM committed Jul 8, 2024
2 parents 175391d + 841b9a5 commit 0699a4c
Showing 1 changed file with 0 additions and 9 deletions.
9 changes: 0 additions & 9 deletions src/llama.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15468,15 +15468,6 @@ struct llm_tokenizer_bpe {
case LLAMA_VOCAB_PRE_TYPE_JINA_V2_ZH:
regex_exprs = {"\\w+|[^\\w\\s]+"};
break;
default:
// default regex for BPE tokenization pre-processing
regex_exprs = {
"[\\p{P}\\$\\+<=>\\^~\\|]+",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
"\\p{N}+",
"[0-9][0-9][0-9]",
};
break;
}
}

Expand Down

0 comments on commit 0699a4c

Please sign in to comment.