convert : refactor vocab selection logic (ggerganov#6355)
cebtenzzre authored Mar 28, 2024
1 parent 66ba560 commit be55134
Showing 4 changed files with 204 additions and 176 deletions.
27 changes: 9 additions & 18 deletions convert-hf-to-gguf.py
@@ -23,7 +23,7 @@
 sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
 import gguf
 
-from convert import HfVocab
+from convert import LlamaHfVocab
 
 
 ###### MODEL DEFINITIONS ######
@@ -230,7 +230,7 @@ def _get_part_names(self):
     def _set_vocab_gpt2(self):
         dir_model = self.dir_model
         hparams = self.hparams
-        tokens: list[bytearray] = []
+        tokens: list[str] = []
         toktypes: list[int] = []
 
         from transformers import AutoTokenizer
@@ -243,8 +243,7 @@ def _set_vocab_gpt2(self):
 
         for i in range(vocab_size):
             if i not in reverse_vocab:
-                pad_token = f"[PAD{i}]".encode('utf-8')
-                tokens.append(bytearray(pad_token))
+                tokens.append(f"[PAD{i}]")
                 toktypes.append(gguf.TokenType.USER_DEFINED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
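
Note on the hunk above: the bytearray round-trip for pad fillers was redundant, since the GGUF writer encodes token strings to UTF-8 itself; that is what lets the annotation narrow from list[bytearray] to list[str]. A minimal standalone sketch of the equivalence (illustrative only, not part of the diff):

    # Both forms carry the same UTF-8 payload once serialized.
    i = 0
    old_style = bytearray(f"[PAD{i}]".encode('utf-8'))  # pre-refactor construction
    new_style = f"[PAD{i}]"                             # post-refactor construction
    assert bytes(old_style) == new_style.encode('utf-8')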
@@ -266,7 +265,7 @@ def _set_vocab_gpt2(self):
     def _set_vocab_qwen(self):
         dir_model = self.dir_model
         hparams = self.hparams
-        tokens: list[bytearray] = []
+        tokens: list[str] = []
         toktypes: list[int] = []
 
         from transformers import AutoTokenizer
@@ -291,8 +290,7 @@ def _set_vocab_qwen(self):
 
         for i in range(vocab_size):
             if i not in reverse_vocab:
-                pad_token = f"[PAD{i}]".encode("utf-8")
-                tokens.append(bytearray(pad_token))
+                tokens.append(f"[PAD{i}]")
                 toktypes.append(gguf.TokenType.USER_DEFINED)
             elif reverse_vocab[i] in added_vocab:
                 tokens.append(reverse_vocab[i])
@@ -372,12 +370,8 @@ def _set_vocab_sentencepiece(self):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-    def _set_vocab_hf(self):
-        path = self.dir_model
-        added_tokens_path = self.dir_model
-        vocab = HfVocab(
-            path, added_tokens_path if added_tokens_path.exists() else None
-        )
+    def _set_vocab_llama_hf(self):
+        vocab = LlamaHfVocab(self.dir_model)
         tokens = []
         scores = []
         toktypes = []
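
Note on the hunk above: besides the rename, the refactor removes a latent quirk at the old call site — added_tokens_path was assigned self.dir_model itself, so the added_tokens_path.exists() guard tested the model directory rather than any added-tokens file, making the conditional meaningless. Resolving added tokens now happens inside LlamaHfVocab. A hypothetical call-site sketch (the directory name is illustrative; the all_tokens() shape matches the BertModel hunk further down):

    from pathlib import Path
    from convert import LlamaHfVocab

    vocab = LlamaHfVocab(Path("models/my-model"))        # the model dir is the only argument now
    tokens, scores, toktypes = zip(*vocab.all_tokens())  # yields (text, score, toktype) triples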
@@ -1099,7 +1093,7 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_file_type(self.ftype)
 
     def set_vocab(self):
-        self._set_vocab_hf()
+        self._set_vocab_llama_hf()
 
     def _reverse_hf_permute(self, weights: Tensor, n_head: int, n_kv_head: int | None = None) -> Tensor:
         if n_kv_head is not None and n_head != n_kv_head:
@@ -1700,11 +1694,8 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_pooling_type(pooling_type)
 
     def set_vocab(self):
-        path = self.dir_model
-        added_tokens_path = self.dir_model if self.dir_model.exists() else None
-
-        # use huggingface vocab to get all tokens
-        vocab = HfVocab(path, added_tokens_path)
+        vocab = LlamaHfVocab(self.dir_model, ignore_nonllama=True)
         tokens, scores, toktypes = zip(*vocab.all_tokens())
         assert len(tokens) == vocab.vocab_size
         self.vocab_size = vocab.vocab_size
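
Note on ignore_nonllama=True above: BERT ships a WordPiece tokenizer rather than a llama-style BPE one, so the flag presumably tells the constructor to skip a tokenizer-family check. The convert.py side of this refactor did not load on this page, so the gate below is an assumed sketch, not the actual implementation:

    import json
    from pathlib import Path

    class LlamaHfVocab:
        # Hypothetical constructor check; every name here except
        # ignore_nonllama is an assumption.
        def __init__(self, base_path: Path, ignore_nonllama: bool = False):
            with open(base_path / "tokenizer.json", encoding="utf-8") as f:
                tokenizer = json.load(f)
            if not ignore_nonllama and tokenizer["model"]["type"] != "BPE":
                raise TypeError("expected a llama-compatible fast tokenizer")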
6 changes: 3 additions & 3 deletions convert-persimmon-to-gguf.py
@@ -106,12 +106,12 @@ def main():
     tensor_map = gguf.get_tensor_name_map(arch, block_count)
     print(tensor_map)
     for name in tensors.keys():
-        data = tensors[name]
+        data_torch = tensors[name]
         if name.endswith(".self_attention.rotary_emb.inv_freq"):
             continue
-        old_dtype = data.dtype
+        old_dtype = data_torch.dtype
         # TODO: FP16 conversion produces garbage outputs. (Q8_0 does not, so..?)
-        data = data.to(torch.float32).squeeze().numpy()
+        data = data_torch.to(torch.float32).squeeze().numpy()
         new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
         if new_name is None:
             print("Can not map tensor '" + name + "'")
(Diffs for the remaining two changed files did not load.)