Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for gguf Q*_K #448

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
152 changes: 138 additions & 14 deletions auto_round/export/export_to_gguf/quant.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,53 +19,57 @@
"bf16": (1, 2),
"q4_0": (32, 2 + 16),
"q4_1": (32, 2 + 2 + 16),
"q4_k": (256, 2 + 2 + QK_K // 2 + 12),
"q4_k": (256, 2 + 2 + QK_K//2 + 12),
}

GGML_QUANT_BLOCK = {}


def register_block(name):

def register(cls):
GGML_QUANT_BLOCK[name] = cls
return cls

return register


def ggml_quant(data: np.array, ggml_type, scale = None, zp = None):
def ggml_quant(data: np.array, ggml_type, scale=None, zp=None):
block_size, type_size = GGML_QUANT_SIZES[ggml_type]

data = data.astype(np.float32, copy=False)
shape = data.shape
n_blocks = data.size // block_size
blocks = data.reshape((n_blocks, block_size))

new_data = GGML_QUANT_BLOCK[ggml_type](blocks, scale, zp)
new_data = new_data.reshape(*shape[:-1], shape[-1] // block_size * type_size)
return new_data


@register_block("bf16")
def bf16_quant_block(blocks: np.array, scale = None, zp = None):
def bf16_quant_block(blocks: np.array, scale=None, zp=None):
n = blocks.view(np.uint32)
# force nan to quiet
n = np.where((n & 0x7fffffff) > 0x7f800000, (n & np.uint32(0xffff0000)) | np.uint32(64 << 16), n)
# round to nearest even
n = (np.uint64(n) + (0x7fff + ((n >> 16) & 1))) >> 16
return n.astype(np.uint16).view(np.uint8)


@register_block("q4_0")
def q4_0_quant_block(blocks: np.array, scale = None, zp = None):
def q4_0_quant_block(blocks: np.array, scale=None, zp=None):
if scale is not None:
d = scale.reshape((-1,1))
d = scale.reshape((-1, 1))
else:
imax = abs(blocks).argmax(axis=-1, keepdims=True)
max = np.take_along_axis(blocks, imax, axis=-1)
d = max / -8
with np.errstate(divide="ignore"):
id = np.where(d == 0, 0, 1 / d)

qs = np.trunc(
(np.float64(blocks) * np.float64(id)) + np.float64(8.5),
dtype=np.float32).astype(np.uint8).clip(0, 15)
qs = np.trunc((np.float64(blocks) * np.float64(id)) + np.float64(8.5),
dtype=np.float32).astype(np.uint8).clip(0, 15)

n_blocks = blocks.shape[0]
block_size = GGML_QUANT_SIZES["q4_0"][0]
Expand All @@ -78,18 +82,18 @@ def q4_0_quant_block(blocks: np.array, scale = None, zp = None):


@register_block("q4_1")
def q4_1_quant_block(blocks: np.array, scale = None, zp = None):
def q4_1_quant_block(blocks: np.array, scale=None, zp=None):
if scale is not None:
d = scale.reshape((-1,1))
min = zp.reshape((-1,1)) * d * -1
d = scale.reshape((-1, 1))
min = zp.reshape((-1, 1)) * d * -1
else:
max = blocks.max(axis=-1, keepdims=True)
min = blocks.min(axis=-1, keepdims=True)
d = (max - min) / 15
d = (max-min) / 15
with np.errstate(divide="ignore"):
id = np.where(d == 0, 0, 1 / d)

qs = np.trunc((blocks - min) * id + np.float32(0.5), dtype=np.float32).astype(np.uint8).clip(0, 15)
qs = np.trunc((blocks-min) * id + np.float32(0.5), dtype=np.float32).astype(np.uint8).clip(0, 15)

n_blocks = blocks.shape[0]
block_size = GGML_QUANT_SIZES["q4_1"][0]
Expand All @@ -98,4 +102,124 @@ def q4_1_quant_block(blocks: np.array, scale = None, zp = None):

d = d.astype(np.float16).view(np.uint8)
m = min.astype(np.float16).view(np.uint8)
breakpoint()
return np.concatenate([d, m, qs], axis=-1)


def make_qkx2_quants(data, weight, nmax, group_size, rmin=-1, rdelta=0.1, nstep=20):
group_min = np.min(data)
group_max = np.max(data)

sum_w = np.sum(weight)
sum_x = np.sum(weight * data)

if group_min > 0:
group_min = 0
if group_min == group_max:
L = [0] * group_size
the_min = -group_min
return 0.0, L, the_min

iscale = nmax / (group_max-group_min)
scale = 1 / iscale
best_mad = 0
L = []
for i in range(group_size):
l = np.round(iscale * (data[i] - group_min))
L.append(int(max(0, min(nmax, l))))
diff = scale * L[-1] + group_min - data[i]
diff = diff**2
w = weight[i]
best_mad += w * diff
if nstep < 1:
the_min = -group_min
return scale, L, the_min

Laux = []
for step in range(nstep):
iscale = (rmin + rdelta*step + nmax) / (group_max-group_min)
sum_l, sum_l2, sum_xl = 0, 0, 0
for i in range(group_size):
l = round(iscale * (data[i] - group_min))
l = max(0, min(nmax, l))
Laux.append(l)
sum_l += weight[i] * l
sum_l2 += weight[i] * l * l
sum_xl += weight[i] * l * data[i]
D = sum_w*sum_l2 - sum_l*sum_l
if D > 0:
this_scale = (sum_w*sum_xl - sum_x*sum_l) / D
this_min = (sum_l2*sum_x - sum_l*sum_xl) / D
if this_min > 0:
this_min = 0
this_scale = sum_xl / sum_l2
mad = 0
for i in range(group_size):
diff = this_scale * Laux[i] + this_min - data[i]
diff = diff**2
mad += w * diff
if mad < best_mad:
for i in range(group_size):
L[i] = Laux[i]
best_mad = mad
scale = this_scale
group_min = this_min

the_min = -group_min
return scale, the_min


@register_block("q4_k")
def q4_k_quant_block(blocks: np.array, scale=None, zp=None):
nb = blocks.shape[0]
output_scale = np.empty((nb, QK_K//32 + 4), dtype=np.uint8)
output_d = np.empty(nb, dtype=np.float32)
output_dmin = np.empty(nb, dtype=np.float32)
output_qs = np.empty((nb, QK_K // 64, 32), dtype=np.uint8)

blocks = blocks.reshape((nb, QK_K // 32, 32))
sum_x2 = np.sum(np.power(blocks, 2), axis=-1)
av_x = np.sqrt(sum_x2 / 32)
weight = blocks + av_x.reshape((*av_x.shape, 1))
scales = np.empty(QK_K // 32, dtype=np.float32)
mins = np.empty(QK_K // 32, dtype=np.float32)
for i in range(nb):
if scale is not None:
pass
else:
for j in range(QK_K // 32):
d_scale, the_min = make_qkx2_quants(
blocks[i][j], weight[i][j], nmax=15, group_size=32, rmin=-1, rdelta=0.1, nstep=20)
scales[j] = d_scale
mins[j] = the_min

max_scale = max(scales)
max_min = max(mins)
inv_scale = 63. / max_scale if max_scale > 0 else 0.
inv_min = 63. / max_min if max_min > 0 else 0.

ls = np.round(inv_scale * scales).astype(np.uint8)
lm = np.round(inv_min * mins).astype(np.uint8)
output_scale[i][:4] = ls[:4]
output_scale[i][4:8] = lm[:4]

output_scale[i][8:] = (ls[4:] & 0xF) | ((lm[4:] & 0xF) << 4)
output_scale[i][:4] |= ((ls[4:] >> 4) << 6)
output_scale[i][4:8] |= ((lm[4:] >> 4) << 6)

output_d[i] = max_scale / 63
output_dmin[i] = max_min / 63

d_tmp = output_d[i] * ls
dm_tmp = output_dmin[i] * lm

all_L = np.round((blocks[i] + dm_tmp.reshape(-1, 1)) / d_tmp.reshape(-1, 1)).astype(np.uint8)
all_L = np.clip(all_L, 0, 15)

for j in range(QK_K // 64):
output_qs[i][j] = all_L[j] | (all_L[j + 1] << 4)

output_d = output_d.reshape(-1, 1).astype(np.float16).view(np.uint8)
output_dmin = output_dmin.reshape(-1, 1).astype(np.float16).view(np.uint8)
output_qs = output_qs.reshape(nb, QK_K // 2)
return np.concatenate([output_d, output_dmin, output_scale, output_qs], axis=-1)