add conversion for bge-m3; small fix in unigram tokenizer
iamlemec committed Jul 23, 2024
1 parent c4dd11d commit 59ebf7a
Showing 2 changed files with 102 additions and 1 deletion.
convert_hf_to_gguf.py (98 additions, 0 deletions)

@@ -2343,6 +2343,104 @@ def set_gguf_parameters(self):
        self.gguf_writer.add_rope_freq_base(self.hparams["rotary_emb_base"])


@Model.register("XLMRobertaModel")
class XLMRobertaModel(BertModel):
    model_arch = gguf.MODEL_ARCH.BERT

    def set_vocab(self):
        # to avoid TypeError: Descriptors cannot be created directly
        # exception when importing sentencepiece_model_pb2
        os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
        from sentencepiece import SentencePieceProcessor
        from sentencepiece import sentencepiece_model_pb2 as model

        tokenizer_path = self.dir_model / 'sentencepiece.bpe.model'
        if not tokenizer_path.is_file():
            raise FileNotFoundError(f"File not found: {tokenizer_path}")

        sentencepiece_model = model.ModelProto()  # pyright: ignore[reportAttributeAccessIssue]
        sentencepiece_model.ParseFromString(open(tokenizer_path, "rb").read())
        assert sentencepiece_model.trainer_spec.model_type == 1  # UNIGRAM

        add_prefix = sentencepiece_model.normalizer_spec.add_dummy_prefix
        remove_whitespaces = sentencepiece_model.normalizer_spec.remove_extra_whitespaces
        precompiled_charsmap = sentencepiece_model.normalizer_spec.precompiled_charsmap

        tokenizer = SentencePieceProcessor()
        tokenizer.LoadFromFile(str(tokenizer_path))

        vocab_size = self.hparams.get('vocab_size', tokenizer.vocab_size())

        tokens: list[bytes] = [f"[PAD{i}]".encode("utf-8") for i in range(vocab_size)]
        scores: list[float] = [-10000.0] * vocab_size
        toktypes: list[int] = [SentencePieceTokenTypes.UNKNOWN] * vocab_size

        for token_id in range(tokenizer.vocab_size()):
            piece = tokenizer.IdToPiece(token_id)
            text = piece.encode("utf-8")
            score = tokenizer.GetScore(token_id)

            toktype = SentencePieceTokenTypes.NORMAL
            if tokenizer.IsUnknown(token_id):
                toktype = SentencePieceTokenTypes.UNKNOWN
            elif tokenizer.IsControl(token_id):
                toktype = SentencePieceTokenTypes.CONTROL
            elif tokenizer.IsUnused(token_id):
                toktype = SentencePieceTokenTypes.UNUSED
            elif tokenizer.IsByte(token_id):
                toktype = SentencePieceTokenTypes.BYTE

            tokens[token_id] = text
            scores[token_id] = score
            toktypes[token_id] = toktype

        if vocab_size > len(tokens):
            pad_count = vocab_size - len(tokens)
            logger.debug(f"Padding vocab with {pad_count} token(s) - [PAD1] through [PAD{pad_count}]")
            for i in range(1, pad_count + 1):
                tokens.append(bytes(f"[PAD{i}]", encoding="utf-8"))
                scores.append(-1000.0)
                toktypes.append(SentencePieceTokenTypes.UNUSED)

        # realign tokens (see HF tokenizer code)
        tokens = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + tokens[3:-1]
        scores = [0.0, -10000.0, 0.0, -10000.0] + scores[3:-1]
        toktypes = [
            SentencePieceTokenTypes.CONTROL,
            SentencePieceTokenTypes.CONTROL,
            SentencePieceTokenTypes.CONTROL,
            SentencePieceTokenTypes.UNKNOWN,
        ] + toktypes[3:-1]

        self.gguf_writer.add_tokenizer_model("t5")
        self.gguf_writer.add_tokenizer_pre("default")
        self.gguf_writer.add_token_list(tokens)
        self.gguf_writer.add_token_scores(scores)
        self.gguf_writer.add_token_types(toktypes)
        self.gguf_writer.add_add_space_prefix(add_prefix)
        self.gguf_writer.add_token_type_count(1)  # not sure about this
        self.gguf_writer.add_remove_extra_whitespaces(remove_whitespaces)
        if precompiled_charsmap:
            self.gguf_writer.add_precompiled_charsmap(precompiled_charsmap)

        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
        special_vocab.add_to_gguf(self.gguf_writer)

        self.gguf_writer.add_add_bos_token(True)  # looks like we need this
        self.gguf_writer.add_add_eos_token(True)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        # position embeddings start at 2 (padding_idx = 1), so just chop down the weight tensor
        if name == "embeddings.position_embeddings.weight":
            del bid  # unused

            data_torch[:-2, :] = data_torch[2:, :].clone()
            data_torch[-2:, :] = 0.0

            return [(self.map_tensor_name(name), data_torch)]

        return super().modify_tensors(data_torch, name, bid)


@Model.register("GemmaForCausalLM")
class GemmaModel(Model):
    model_arch = gguf.MODEL_ARCH.GEMMA
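For context, the token realignment and the position-embedding shift above mirror what HF's XLMRobertaTokenizer and XLMRobertaModel do: regular SentencePiece pieces are shifted up by one id so the fairseq-order specials <s>, <pad>, </s>, <unk> occupy ids 0-3, and real positions start at row padding_idx + 1 = 2. A minimal sketch of both remappings on toy data (illustrative only, not part of the commit):

    # toy stand-in for the piece list built in set_vocab (index = SentencePiece id);
    # in the real model, <unk>, <s>, </s> sit at SentencePiece ids 0-2
    spm = [b'<unk>', b'<s>', b'</s>', b'a', b'b', b'c']

    hf = [b'<s>', b'<pad>', b'</s>', b'<unk>'] + spm[3:-1]
    assert hf[4] == spm[3]  # regular pieces land one id higher, as in HF

    import torch

    pos = torch.arange(10.0).reshape(5, 2)  # stand-in for position_embeddings.weight
    pos[:-2, :] = pos[2:, :].clone()        # real positions start at row 2, so shift down
    pos[-2:, :] = 0.0                       # zero the now-unused tail rows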
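With the class registered, a bge-m3 checkpoint should convert like any other HF model, along these lines (paths and flags illustrative; check the script's --help):

    python convert_hf_to_gguf.py /path/to/bge-m3 --outfile bge-m3-f16.gguf --outtype f16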
src/llama.cpp (4 additions, 1 deletion)

@@ -15859,6 +15859,9 @@ struct llm_tokenizer_ugm {
     * the best tokenization.
     */
    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+       // get current size of output (for reversal later)
+       size_t output_size = output.size();
+
        // normalize the input first
        std::string normalized;
        normalize(text, &normalized);
@@ -15938,7 +15941,7 @@ struct llm_tokenizer_ugm {
        }

        // reverse the output since we added tokens starting from the end of the input
-       std::reverse(output.begin(), output.end());
+       std::reverse(output.begin() + output_size, output.end());
    }

private:
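The one-line llama.cpp fix matters when tokenize() is called with a non-empty output vector (for example, when the caller has already pushed a BOS token): the unigram backtrace appends tokens back-to-front, and reversing the whole vector would also flip the caller's existing prefix. Reversing only the suffix added by this call avoids that. The same idea in a few lines of Python (illustrative, not the actual C++):

    def tokenize_into(output, new_tokens_reversed):
        # mimic llm_tokenizer_ugm::tokenize - tokens arrive back-to-front
        output_size = len(output)           # size before this call
        output.extend(new_tokens_reversed)  # backtrace appends in reverse
        # old code reversed all of output, scrambling the existing prefix;
        # the fix reverses only the part appended by this call
        output[output_size:] = output[output_size:][::-1]

    out = [1]  # BOS already added by the caller
    tokenize_into(out, [30, 20, 10])
    assert out == [1, 10, 20, 30]  # prefix intact, new tokens in order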
