b2998 #136

Merged: 11 commits, May 25, 2024
gguf-py : fix and simplify quantized shape round-trip (ggml-org#7483)
* gguf-py : fix and simplify quantized shape round-trip

* gguf-py : remove unused import
compilade authored May 25, 2024

commit b83bab15a5d2a1e7807d09613a9b34309d86cfaa
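For context, GGUF stores quantized tensors as flat bytes, so every quantized tensor has two shapes: a logical element shape and a byte-level uint8 shape, related by the per-type block size and bytes-per-block. A minimal sketch of the arithmetic this commit centralizes, using the Q4_0 entry from gguf.constants.GGML_QUANT_SIZES (32-element blocks, 18 bytes each):

```python
# Round-trip between element count and byte count for one Q4_0 row;
# the block_size/type_size values come from gguf.constants.GGML_QUANT_SIZES.
block_size, type_size = 32, 18  # GGML_QUANT_SIZES[GGMLQuantizationType.Q4_0]

n_elements = 4096                               # logical row length
n_bytes = n_elements // block_size * type_size  # 128 blocks * 18 bytes
assert n_bytes == 2304

# ...and back again, which is the direction the fixed helpers validate:
assert n_bytes % type_size == 0
assert n_bytes // type_size * block_size == n_elements
```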
7 changes: 3 additions & 4 deletions convert-hf-to-gguf.py
@@ -313,11 +313,10 @@ def write_tensors(self):
                     data = data.astype(np.float32)
                     data_qtype = gguf.GGMLQuantizationType.F32

-                block_size, type_size = gguf.GGML_QUANT_SIZES[data_qtype]
+                shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
+
                 # reverse shape to make it similar to the internal ggml dimension order
-                shape_str = f"""{{{', '.join(str(n) for n in reversed(
-                    (*data.shape[:-1], data.shape[-1] * data.dtype.itemsize // type_size * block_size))
-                )}}}"""
+                shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"

                 # n_dims is implicit in the shape
                 logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")
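For illustration, here is roughly what the simplified logging path produces for a Q8_0 tensor already packed as uint8. This is a sketch with made-up dimensions; Q8_0 packs 32 elements into 34 bytes:

```python
import numpy as np
import gguf

# A hypothetical Q8_0 tensor in byte-level shape: 320 rows of 4096 elements,
# each row packed into 4096 // 32 * 34 == 4352 bytes.
data = np.zeros((320, 4352), dtype=np.uint8)
data_qtype = gguf.GGMLQuantizationType.Q8_0

shape = gguf.quant_shape_from_byte_shape(data.shape, data_qtype) if data.dtype == np.uint8 else data.shape
shape_str = f"{{{', '.join(str(n) for n in reversed(shape))}}}"
print(shape_str)  # {4096, 320} -- reversed to match ggml's dimension order
```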
6 changes: 5 additions & 1 deletion gguf-py/gguf/gguf_reader.py
@@ -12,6 +12,8 @@
 import numpy as np
 import numpy.typing as npt

+from .quants import quant_shape_to_byte_shape
+
 if __name__ == "__main__":
     import sys
     from pathlib import Path
@@ -251,6 +253,7 @@ def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None:
             tensor_names.add(tensor_name)
             ggml_type = GGMLQuantizationType(raw_dtype[0])
             n_elems = int(np.prod(dims))
+            np_dims = tuple(reversed(dims.tolist()))
             block_size, type_size = GGML_QUANT_SIZES[ggml_type]
             n_bytes = n_elems * type_size // block_size
             data_offs = int(start_offs + offset_tensor[0])
@@ -279,14 +282,15 @@ def _build_tensors(self, start_offs: int, fields: list[ReaderField]) -> None:
             else:
                 item_count = n_bytes
                 item_type = np.uint8
+                np_dims = quant_shape_to_byte_shape(np_dims, ggml_type)
             tensors.append(ReaderTensor(
                 name = tensor_name,
                 tensor_type = ggml_type,
                 shape = dims,
                 n_elements = n_elems,
                 n_bytes = n_bytes,
                 data_offset = data_offs,
-                data = self._get(data_offs, item_type, item_count),
+                data = self._get(data_offs, item_type, item_count).reshape(np_dims),
                 field = field,
             ))
         self.tensors = tensors
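The reader-side effect of this hunk: ReaderTensor.data used to come back as a flat 1-D array, and now arrives reshaped to the reversed dimensions, byte-level for quantized types. A minimal sketch, assuming some model.gguf is at hand:

```python
from gguf import GGUFReader

reader = GGUFReader("model.gguf")  # hypothetical input file
for tensor in reader.tensors:
    # tensor.shape keeps the GGUF-order dims; tensor.data is now reversed
    # and, for quantized types, expressed in bytes along the last axis.
    print(tensor.name, list(tensor.shape), "->", tensor.data.shape)
```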
8 changes: 3 additions & 5 deletions gguf-py/gguf/gguf_writer.py
@@ -13,7 +13,6 @@
 import numpy as np

 from .constants import (
-    GGML_QUANT_SIZES,
     GGUF_DEFAULT_ALIGNMENT,
     GGUF_MAGIC,
     GGUF_VERSION,
@@ -26,6 +25,8 @@
     TokenType,
 )

+from .quants import quant_shape_from_byte_shape
+
 logger = logging.getLogger(__name__)


@@ -229,10 +230,7 @@ def add_tensor_info(
         else:
             dtype = raw_dtype
             if tensor_dtype == np.uint8:
-                block_size, type_size = GGML_QUANT_SIZES[raw_dtype]
-                if tensor_shape[-1] % type_size != 0:
-                    raise ValueError(f"Quantized tensor row size ({tensor_shape[-1]}) is not a multiple of {dtype.name} type size ({type_size})")
-                tensor_shape = tuple(tensor_shape[:-1]) + (tensor_shape[-1] // type_size * block_size,)
+                tensor_shape = quant_shape_from_byte_shape(tensor_shape, raw_dtype)
         n_dims = len(tensor_shape)
         self.ti_data += self._pack("I", n_dims)
         for i in range(n_dims):
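On the writer side, the inline row-size check moves into quant_shape_from_byte_shape, which still raises ValueError when a byte row does not divide evenly by the type size. A hedged sketch of the call, with a hypothetical tensor name and output path:

```python
import numpy as np
from gguf import GGUFWriter, GGMLQuantizationType

writer = GGUFWriter("out.gguf", arch="llama")  # assumed output path and arch

# Q8_0 data in byte-level shape: 320 rows, 4096 elements -> 4352 bytes each
data = np.zeros((320, 4352), dtype=np.uint8)
writer.add_tensor_info("blk.0.ffn_up.weight", data.shape, data.dtype,
                       data.nbytes, raw_dtype=GGMLQuantizationType.Q8_0)
```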
16 changes: 15 additions & 1 deletion gguf-py/gguf/quants.py
@@ -1,5 +1,5 @@
 from __future__ import annotations
-from typing import Callable
+from typing import Callable, Sequence

 from numpy.typing import DTypeLike

@@ -9,6 +9,20 @@
 import numpy as np


+def quant_shape_to_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
+    block_size, type_size = GGML_QUANT_SIZES[quant_type]
+    if shape[-1] % block_size != 0:
+        raise ValueError(f"Quantized tensor row size ({shape[-1]}) is not a multiple of {quant_type.name} block size ({block_size})")
+    return (*shape[:-1], shape[-1] // block_size * type_size)
+
+
+def quant_shape_from_byte_shape(shape: Sequence[int], quant_type: GGMLQuantizationType):
+    block_size, type_size = GGML_QUANT_SIZES[quant_type]
+    if shape[-1] % type_size != 0:
+        raise ValueError(f"Quantized tensor bytes per row ({shape[-1]}) is not a multiple of {quant_type.name} type size ({type_size})")
+    return (*shape[:-1], shape[-1] // type_size * block_size)
+
+
 # same as ggml_compute_fp32_to_bf16 in ggml-impl.h
 def __compute_fp32_to_bf16(n: np.ndarray) -> np.ndarray:
     n = n.astype(np.float32, copy=False).view(np.int32)
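The two helpers are exact inverses whenever the row length is block-aligned; a quick round-trip check using the real Q4_0 sizes (block_size 32, type_size 18):

```python
from gguf.constants import GGMLQuantizationType
from gguf.quants import quant_shape_from_byte_shape, quant_shape_to_byte_shape

shape = (320, 4096)                 # logical element shape
byte_shape = quant_shape_to_byte_shape(shape, GGMLQuantizationType.Q4_0)
assert byte_shape == (320, 2304)    # 4096 // 32 * 18 bytes per row
assert quant_shape_from_byte_shape(byte_shape, GGMLQuantizationType.Q4_0) == shape

# misaligned rows fail fast instead of silently corrupting the shape
try:
    quant_shape_to_byte_shape((320, 4100), GGMLQuantizationType.Q4_0)
except ValueError as e:
    print(e)  # ... not a multiple of Q4_0 block size (32)
```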
4 changes: 1 addition & 3 deletions gguf-py/scripts/gguf-new-metadata.py
@@ -118,9 +118,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new

     for tensor in reader.tensors:
         total_bytes += tensor.n_bytes
-        # Dimensions are written in reverse order, so flip them first
-        shape = np.flipud(tensor.shape).tolist()
-        writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
+        writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)

     bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
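Combined with the reader change, tensor shapes now round-trip from file to file with no manual flipping; a minimal sketch of the copy flow, with assumed file names:

```python
import gguf

reader = gguf.GGUFReader("in.gguf")                 # hypothetical input
writer = gguf.GGUFWriter("out.gguf", arch="llama")  # hypothetical output

for tensor in reader.tensors:
    # tensor.data.shape is already reversed (and byte-level when quantized),
    # which is exactly what add_tensor_info expects for raw uint8 data.
    writer.add_tensor_info(tensor.name, tensor.data.shape, tensor.data.dtype,
                           tensor.data.nbytes, tensor.tensor_type)
```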