Skip to content

Commit 69050a1

Browse files
authored
Refactor gguf scripts to improve metadata handling (#11909)
* Refactor gguf scripts to improve metadata handling: added a contents method to the ReaderField class; added an endianess property to the GGUFReader class * update scripts * fix import * remove unused import * attempt to work around flake and pyright errors * second attempt * give up, ignore type * bump version * apply newbyteorder fixes
1 parent 3567ee3 commit 69050a1

File tree

6 files changed

+88
-81
lines changed

6 files changed

+88
-81
lines changed

gguf-py/examples/reader.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,14 @@
22
import logging
33
import sys
44
from pathlib import Path
5-
from gguf.gguf_reader import GGUFReader
65

76
logger = logging.getLogger("reader")
87

8+
# Necessary to load the local gguf package
99
sys.path.insert(0, str(Path(__file__).parent.parent))
1010

11+
from gguf.gguf_reader import GGUFReader
12+
1113

1214
def read_gguf_file(gguf_file_path):
1315
"""

gguf-py/gguf/gguf_reader.py

+56-7
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66

77
import logging
88
import os
9+
import sys
910
from collections import OrderedDict
1011
from typing import Any, Literal, NamedTuple, TypeVar, Union
1112

@@ -15,7 +16,6 @@
1516
from .quants import quant_shape_to_byte_shape
1617

1718
if __name__ == "__main__":
18-
import sys
1919
from pathlib import Path
2020

2121
# Allow running file in package as a script.
@@ -28,6 +28,7 @@
2828
GGUF_VERSION,
2929
GGMLQuantizationType,
3030
GGUFValueType,
31+
GGUFEndian,
3132
)
3233

3334
logger = logging.getLogger(__name__)
@@ -53,6 +54,48 @@ class ReaderField(NamedTuple):
5354

5455
types: list[GGUFValueType] = []
5556

57+
def contents(self, index_or_slice: int | slice = slice(None)) -> Any:
58+
if self.types:
59+
to_string = lambda x: str(x.tobytes(), encoding='utf-8') # noqa: E731
60+
main_type = self.types[0]
61+
62+
if main_type == GGUFValueType.ARRAY:
63+
sub_type = self.types[-1]
64+
65+
if sub_type == GGUFValueType.STRING:
66+
indices = self.data[index_or_slice]
67+
68+
if isinstance(index_or_slice, int):
69+
return to_string(self.parts[indices]) # type: ignore
70+
else:
71+
return [to_string(self.parts[idx]) for idx in indices] # type: ignore
72+
else:
73+
# FIXME: When/if _get_field_parts() support multi-dimensional arrays, this must do so too
74+
75+
# Check if it's unsafe to perform slice optimization on data
76+
# if any(True for idx in self.data if len(self.parts[idx]) != 1):
77+
# optim_slice = slice(None)
78+
# else:
79+
# optim_slice = index_or_slice
80+
# index_or_slice = slice(None)
81+
82+
# if isinstance(optim_slice, int):
83+
# return self.parts[self.data[optim_slice]].tolist()[0]
84+
# else:
85+
# return [pv for idx in self.data[optim_slice] for pv in self.parts[idx].tolist()][index_or_slice]
86+
87+
if isinstance(index_or_slice, int):
88+
return self.parts[self.data[index_or_slice]].tolist()[0]
89+
else:
90+
return [pv for idx in self.data[index_or_slice] for pv in self.parts[idx].tolist()]
91+
92+
if main_type == GGUFValueType.STRING:
93+
return to_string(self.parts[-1])
94+
else:
95+
return self.parts[-1].tolist()[0]
96+
97+
return None
98+
5699

57100
class ReaderTensor(NamedTuple):
58101
name: str
@@ -101,10 +144,19 @@ def __init__(self, path: os.PathLike[str] | str, mode: Literal['r', 'r+', 'c'] =
101144
# If we get 0 here that means it's (probably) a GGUF file created for
102145
# the opposite byte order of the machine this script is running on.
103146
self.byte_order = 'S'
104-
temp_version = temp_version.newbyteorder(self.byte_order)
147+
temp_version = temp_version.view(temp_version.dtype.newbyteorder(self.byte_order))
105148
version = temp_version[0]
106149
if version not in READER_SUPPORTED_VERSIONS:
107150
raise ValueError(f'Sorry, file appears to be version {version} which we cannot handle')
151+
if sys.byteorder == "little":
152+
# Host is little endian
153+
host_endian = GGUFEndian.LITTLE
154+
swapped_endian = GGUFEndian.BIG
155+
else:
156+
# Sorry PDP or other weird systems that don't use BE or LE.
157+
host_endian = GGUFEndian.BIG
158+
swapped_endian = GGUFEndian.LITTLE
159+
self.endianess = swapped_endian if self.byte_order == "S" else host_endian
108160
self.fields: OrderedDict[str, ReaderField] = OrderedDict()
109161
self.tensors: list[ReaderTensor] = []
110162
offs += self._push_field(ReaderField(offs, 'GGUF.version', [temp_version], [0], [GGUFValueType.UINT32]))
@@ -146,11 +198,7 @@ def _get(
146198
itemsize = int(np.empty([], dtype = dtype).itemsize)
147199
end_offs = offset + itemsize * count
148200
arr = self.data[offset:end_offs].view(dtype=dtype)[:count]
149-
if override_order is not None:
150-
return arr.view(arr.dtype.newbyteorder(override_order))
151-
if self.byte_order == 'S':
152-
return arr.view(arr.dtype.newbyteorder(self.byte_order))
153-
return arr
201+
return arr.view(arr.dtype.newbyteorder(self.byte_order if override_order is None else override_order))
154202

155203
def _push_field(self, field: ReaderField, skip_sum: bool = False) -> int:
156204
if field.name in self.fields:
@@ -192,6 +240,7 @@ def _get_field_parts(
192240
offs += int(alen.nbytes)
193241
aparts: list[npt.NDArray[Any]] = [raw_itype, alen]
194242
data_idxs: list[int] = []
243+
# FIXME: Handle multi-dimensional arrays properly instead of flattening
195244
for idx in range(alen[0]):
196245
curr_size, curr_parts, curr_idxs, curr_types = self._get_field_parts(offs, raw_itype[0])
197246
if idx == 0:

gguf-py/gguf/scripts/gguf_convert_endian.py

+8-15
Original file line numberDiff line numberDiff line change
@@ -20,22 +20,15 @@
2020

2121

2222
def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None:
23-
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
24-
# Host is little endian
25-
host_endian = "little"
26-
swapped_endian = "big"
23+
file_endian = reader.endianess.name
24+
if reader.byte_order == 'S':
25+
host_endian = 'BIG' if file_endian == 'LITTLE' else 'LITTLE'
2726
else:
28-
# Sorry PDP or other weird systems that don't use BE or LE.
29-
host_endian = "big"
30-
swapped_endian = "little"
31-
if reader.byte_order == "S":
32-
file_endian = swapped_endian
33-
else:
34-
file_endian = host_endian
35-
order = host_endian if args.order == "native" else args.order
36-
logger.info(f"* Host is {host_endian.upper()} endian, GGUF file seems to be {file_endian.upper()} endian")
27+
host_endian = file_endian
28+
order = host_endian if args.order == "native" else args.order.upper()
29+
logger.info(f"* Host is {host_endian} endian, GGUF file seems to be {file_endian} endian")
3730
if file_endian == order:
38-
logger.info(f"* File is already {order.upper()} endian. Nothing to do.")
31+
logger.info(f"* File is already {order} endian. Nothing to do.")
3932
sys.exit(0)
4033
logger.info("* Checking tensors for conversion compatibility")
4134
for tensor in reader.tensors:
@@ -47,7 +40,7 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
4740
gguf.GGMLQuantizationType.Q6_K,
4841
):
4942
raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}")
50-
logger.info(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}")
43+
logger.info(f"* Preparing to convert from {file_endian} to {order}")
5144
if args.dry_run:
5245
return
5346
logger.warning("*** Warning *** Warning *** Warning **")

gguf-py/gguf/scripts/gguf_dump.py

+17-17
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,6 @@
99
from pathlib import Path
1010
from typing import Any
1111

12-
import numpy as np
13-
1412
# Necessary to load the local gguf package
1513
if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent.parent / 'gguf-py').exists():
1614
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
@@ -21,11 +19,11 @@
2119

2220

2321
def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]:
24-
host_endian = 'LITTLE' if np.uint32(1) == np.uint32(1).newbyteorder("<") else 'BIG'
22+
file_endian = reader.endianess.name
2523
if reader.byte_order == 'S':
26-
file_endian = 'BIG' if host_endian == 'LITTLE' else 'LITTLE'
24+
host_endian = 'BIG' if file_endian == 'LITTLE' else 'LITTLE'
2725
else:
28-
file_endian = host_endian
26+
host_endian = file_endian
2927
return (host_endian, file_endian)
3028

3129

@@ -45,12 +43,20 @@ def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
4543
pretty_type = str(field.types[-1].name)
4644

4745
log_message = f' {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}'
48-
if len(field.types) == 1:
46+
if field.types:
4947
curr_type = field.types[0]
5048
if curr_type == GGUFValueType.STRING:
51-
log_message += ' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf-8')[:60]))
52-
elif field.types[0] in reader.gguf_scalar_to_np:
53-
log_message += ' = {0}'.format(field.parts[-1][0])
49+
content = field.contents()
50+
if len(content) > 60:
51+
content = content[:57] + '...'
52+
log_message += ' = {0}'.format(repr(content))
53+
elif curr_type in reader.gguf_scalar_to_np:
54+
log_message += ' = {0}'.format(field.contents())
55+
else:
56+
content = repr(field.contents(slice(6)))
57+
if len(field.data) > 6:
58+
content = content[:-1] + ', ...]'
59+
log_message += ' = {0}'.format(content)
5460
print(log_message) # noqa: NP100
5561
if args.no_tensors:
5662
return
@@ -82,15 +88,9 @@ def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None:
8288
curr["array_types"] = [t.name for t in field.types][1:]
8389
if not args.json_array:
8490
continue
85-
itype = field.types[-1]
86-
if itype == GGUFValueType.STRING:
87-
curr["value"] = [str(bytes(field.parts[idx]), encoding="utf-8") for idx in field.data]
88-
else:
89-
curr["value"] = [pv for idx in field.data for pv in field.parts[idx].tolist()]
90-
elif field.types[0] == GGUFValueType.STRING:
91-
curr["value"] = str(bytes(field.parts[-1]), encoding="utf-8")
91+
curr["value"] = field.contents()
9292
else:
93-
curr["value"] = field.parts[-1].tolist()[0]
93+
curr["value"] = field.contents()
9494
if not args.no_tensors:
9595
for idx, tensor in enumerate(reader.tensors):
9696
tensors[tensor.name] = {

gguf-py/gguf/scripts/gguf_new_metadata.py

+3-40
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
import json
99
from pathlib import Path
1010

11-
import numpy as np
1211
from tqdm import tqdm
1312
from typing import Any, Sequence, NamedTuple
1413

@@ -27,45 +26,10 @@ class MetadataDetails(NamedTuple):
2726
description: str = ''
2827

2928

30-
def get_byteorder(reader: gguf.GGUFReader) -> gguf.GGUFEndian:
31-
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
32-
# Host is little endian
33-
host_endian = gguf.GGUFEndian.LITTLE
34-
swapped_endian = gguf.GGUFEndian.BIG
35-
else:
36-
# Sorry PDP or other weird systems that don't use BE or LE.
37-
host_endian = gguf.GGUFEndian.BIG
38-
swapped_endian = gguf.GGUFEndian.LITTLE
39-
40-
if reader.byte_order == "S":
41-
return swapped_endian
42-
else:
43-
return host_endian
44-
45-
46-
def decode_field(field: gguf.ReaderField | None) -> Any:
47-
if field and field.types:
48-
main_type = field.types[0]
49-
50-
if main_type == gguf.GGUFValueType.ARRAY:
51-
sub_type = field.types[-1]
52-
53-
if sub_type == gguf.GGUFValueType.STRING:
54-
return [str(bytes(field.parts[idx]), encoding='utf-8') for idx in field.data]
55-
else:
56-
return [pv for idx in field.data for pv in field.parts[idx].tolist()]
57-
if main_type == gguf.GGUFValueType.STRING:
58-
return str(bytes(field.parts[-1]), encoding='utf-8')
59-
else:
60-
return field.parts[-1][0]
61-
62-
return None
63-
64-
6529
def get_field_data(reader: gguf.GGUFReader, key: str) -> Any:
6630
field = reader.get_field(key)
6731

68-
return decode_field(field)
32+
return field.contents() if field else None
6933

7034

7135
def find_token(token_list: Sequence[int], token: str) -> Sequence[int]:
@@ -93,7 +57,7 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
9357
logger.debug(f'Removing {field.name}')
9458
continue
9559

96-
old_val = MetadataDetails(field.types[0], decode_field(field))
60+
old_val = MetadataDetails(field.types[0], field.contents())
9761
val = new_metadata.get(field.name, old_val)
9862

9963
if field.name in new_metadata:
@@ -192,7 +156,6 @@ def main() -> None:
192156
reader = gguf.GGUFReader(args.input, 'r')
193157

194158
arch = get_field_data(reader, gguf.Keys.General.ARCHITECTURE)
195-
endianess = get_byteorder(reader)
196159

197160
token_list = get_field_data(reader, gguf.Keys.Tokenizer.LIST) or []
198161

@@ -230,7 +193,7 @@ def main() -> None:
230193
sys.exit(0)
231194

232195
logger.info(f'* Writing: {args.output}')
233-
writer = gguf.GGUFWriter(args.output, arch=arch, endianess=endianess)
196+
writer = gguf.GGUFWriter(args.output, arch=arch, endianess=reader.endianess)
234197

235198
alignment = get_field_data(reader, gguf.Keys.General.ALIGNMENT)
236199
if alignment is not None:

gguf-py/pyproject.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "gguf"
3-
version = "0.15.0"
3+
version = "0.16.0"
44
description = "Read and write ML models in GGUF for GGML"
55
authors = ["GGML <[email protected]>"]
66
packages = [

0 commit comments

Comments
 (0)