diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py
index 35be92a3fa6327..86b209f6b74ba8 100755
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -2603,12 +2603,12 @@ class XLMRobertaModel(BertModel):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
+        self.pad_token_id = self.hparams["pad_token_id"]
 
     def set_gguf_parameters(self):
         # set this and pop it so super doesn't write it too
         context_length_train = self.hparams.pop("max_position_embeddings")
-        pad_token_id = self.hparams["pad_token_id"]
-        context_length = context_length_train - pad_token_id - 1  # since padding_idx=1
+        context_length = context_length_train - self.pad_token_id - 1  # since padding_idx=1
         self.gguf_writer.add_context_length(context_length)
         super().set_gguf_parameters()
@@ -2700,7 +2700,8 @@ def write_tensors(self):
             # chop off position embeddings by two to handle padding_idx offset (1 + padding_token_id)
             if name == "embeddings.position_embeddings.weight":
-                data_torch = data_torch[2:]
+                context_chop = self.pad_token_id + 1
+                data_torch = data_torch[context_chop:]
 
             # convert any unsupported data types to float32
             if data_torch.dtype not in (torch.float16, torch.float32):
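
For context (not part of the patch): a minimal sketch of why the chop size is `pad_token_id + 1`. Hugging Face's RoBERTa-style models build position ids offset by `padding_idx`, so real tokens only ever use rows `pad_token_id + 1` onward of `embeddings.position_embeddings.weight`, and the usable context is `max_position_embeddings - pad_token_id - 1`. The helper below mirrors `create_position_ids_from_input_ids` from `transformers`; the example token ids are made up.

```python
# Illustrative sketch, not part of the patch: mirrors how RoBERTa/XLM-RoBERTa
# assign position ids in Hugging Face transformers (create_position_ids_from_input_ids).
import torch

def create_position_ids_from_input_ids(input_ids: torch.Tensor, padding_idx: int) -> torch.Tensor:
    mask = input_ids.ne(padding_idx).int()                         # 1 for real tokens, 0 for padding
    incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask
    return incremental_indices.long() + padding_idx                # real tokens start at padding_idx + 1

pad_token_id = 1                                                   # padding_idx used by XLM-RoBERTa checkpoints
input_ids = torch.tensor([[0, 2054, 2003, 2, 1, 1]])               # made-up ids; last two are padding
print(create_position_ids_from_input_ids(input_ids, pad_token_id))
# tensor([[2, 3, 4, 5, 1, 1]])
# Rows 0..pad_token_id of position_embeddings are never used by real tokens,
# which is why the converter drops the first pad_token_id + 1 rows and reports
# a context length of max_position_embeddings - pad_token_id - 1.
```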