Skip to content

Commit a2c10e0

Browse files
committed
Merge branch 'master' into concedo
# Conflicts:
#   .devops/full.Dockerfile
#   README.md
#   main.cpp
2 parents f128838 + 074bea2 commit a2c10e0

15 files changed

+561
-293
lines changed

Makefile

+1-1
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ endif
3131
#
3232

3333
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
34-
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
34+
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++17 -fPIC
3535
LDFLAGS =
3636

3737
# OS specific

alpaca.sh

+6
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,6 @@
1+
#!/bin/bash
2+
#
3+
# Temporary script - will be removed in the future
4+
#
5+
6+
./main -m ./models/ggml-alpaca-7b-q4.bin --color -f ./prompts/alpaca.txt -ins --top_k 10000 --temp 0.96 --repeat_penalty 1 -t 7

convert-pth-to-ggml.py

+78-109
Original file line number | Diff line number | Diff line change
@@ -16,144 +16,91 @@
1616
# At the start of the ggml file we write the model parameters
1717
# and vocabulary.
1818
#
19-
import os
19+
import argparse
2020
import sys
2121
import json
2222
import struct
2323
import numpy as np
2424
import torch
2525
from sentencepiece import SentencePieceProcessor
2626

27-
if len(sys.argv) < 3:
28-
print("Usage: convert-ckpt-to-ggml.py dir-model ftype\n")
29-
print(" ftype == 0 -> float32")
30-
print(" ftype == 1 -> float16")
31-
sys.exit(1)
27+
def parse_args():
3228

33-
# output in the same directory as the model
34-
dir_model = sys.argv[1]
35-
36-
fname_hparams = sys.argv[1] + "/params.json"
37-
fname_tokenizer = sys.argv[1] + "/../tokenizer.model"
29+
parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
30+
parser.add_argument('dir_model', help='directory containing the model checkpoint')
31+
parser.add_argument('ftype', type=int, choices=[0, 1], default=1, help='file type (0: float32, 1: float16)')
32+
return parser.parse_args()
3833

3934
def get_n_parts(dim):
40-
if dim == 4096:
41-
return 1
42-
elif dim == 5120:
43-
return 2
44-
elif dim == 6656:
45-
return 4
46-
elif dim == 8192:
47-
return 8
48-
else:
49-
print("Invalid dim: " + str(dim))
50-
sys.exit(1)
5135

52-
# possible data types
53-
# ftype == 0 -> float32
54-
# ftype == 1 -> float16
55-
#
56-
# map from ftype to string
57-
ftype_str = ["f32", "f16"]
58-
59-
ftype = 1
60-
if len(sys.argv) > 2:
61-
ftype = int(sys.argv[2])
62-
if ftype < 0 or ftype > 1:
63-
print("Invalid ftype: " + str(ftype))
36+
mappings = {4096: 1, 5120: 2, 6656: 4, 8192: 8}
37+
n_parts = mappings.get(dim)
38+
if n_parts is None:
39+
print(f"Invalid dim: {dim}")
6440
sys.exit(1)
65-
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
66-
67-
if os.path.exists(fname_out):
68-
print(f"Skip conversion, it already exists: {fname_out}")
69-
sys.exit(0)
7041

71-
with open(fname_hparams, "r") as f:
72-
hparams = json.load(f)
42+
print(f"n_parts = {n_parts}\n")
43+
return n_parts
7344

74-
tokenizer = SentencePieceProcessor(fname_tokenizer)
45+
def load_hparams_and_tokenizer(dir_model):
7546

76-
hparams.update({"vocab_size": tokenizer.vocab_size()})
47+
fname_hparams = f"{dir_model}/params.json"
48+
fname_tokenizer = f"{dir_model}/../tokenizer.model"
7749

78-
n_parts = get_n_parts(hparams["dim"])
50+
with open(fname_hparams, "r") as f:
51+
hparams = json.load(f)
52+
print(hparams)
7953

80-
print(hparams)
81-
print('n_parts = ', n_parts)
54+
tokenizer = SentencePieceProcessor(fname_tokenizer)
55+
hparams.update({"vocab_size": tokenizer.vocab_size()})
8256

83-
for p in range(n_parts):
84-
print('Processing part ', p)
57+
return hparams, tokenizer
8558

86-
#fname_model = sys.argv[1] + "/consolidated.00.pth"
87-
fname_model = sys.argv[1] + "/consolidated.0" + str(p) + ".pth"
88-
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
89-
if (p > 0):
90-
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin" + "." + str(p)
59+
def write_header(fout, hparams, ftype):
9160

92-
model = torch.load(fname_model, map_location="cpu")
61+
keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
62+
values = [
63+
0x67676d66, # magic: "ggmf" in hex (the previous magic 0x67676d6c was "ggml")
64+
1, # file version
65+
*[hparams[key] for key in keys],
66+
hparams["dim"] // hparams["n_heads"], # rot (obsolete)
67+
ftype
68+
]
69+
fout.write(struct.pack("i" * len(values), *values))
9370

94-
fout = open(fname_out, "wb")
71+
def write_tokens(fout, tokenizer):
9572

96-
fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
97-
fout.write(struct.pack("i", hparams["vocab_size"]))
98-
fout.write(struct.pack("i", hparams["dim"]))
99-
fout.write(struct.pack("i", hparams["multiple_of"]))
100-
fout.write(struct.pack("i", hparams["n_heads"]))
101-
fout.write(struct.pack("i", hparams["n_layers"]))
102-
fout.write(struct.pack("i", hparams["dim"] // hparams["n_heads"])) # rot (obsolete)
103-
fout.write(struct.pack("i", ftype))
104-
105-
# Is this correct??
10673
for i in range(tokenizer.vocab_size()):
10774
if tokenizer.is_unknown(i):
108-
# "<unk>" token (translated as ??)
10975
text = " \u2047 ".encode("utf-8")
110-
fout.write(struct.pack("i", len(text)))
111-
fout.write(text)
11276
elif tokenizer.is_control(i):
113-
# "<s>"/"</s>" tokens
114-
fout.write(struct.pack("i", 0))
77+
text = b""
11578
elif tokenizer.is_byte(i):
116-
# "<U+XX>" tokens (which may be invalid UTF-8)
11779
piece = tokenizer.id_to_piece(i)
11880
if len(piece) != 6:
119-
print("Invalid token: " + piece)
81+
print(f"Invalid token: {piece}")
12082
sys.exit(1)
12183
byte_value = int(piece[3:-1], 16)
122-
fout.write(struct.pack("i", 1))
123-
fout.write(struct.pack("B", byte_value))
84+
text = struct.pack("B", byte_value)
12485
else:
125-
# normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces.
12686
text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
127-
fout.write(struct.pack("i", len(text)))
128-
fout.write(text)
87+
fout.write(struct.pack("i", len(text)))
88+
fout.write(text)
89+
fout.write(struct.pack("f", tokenizer.get_score(i)))
12990

130-
for k, v in model.items():
131-
name = k
132-
shape = v.shape
91+
def process_and_write_variables(fout, model, ftype):
13392

134-
# skip layers.X.attention.inner_attention.rope.freqs
135-
if name[-5:] == "freqs":
136-
continue
93+
for name, datao in model.items():
13794

138-
print("Processing variable: " + name + " with shape: ", shape, " and type: ", v.dtype)
95+
if name.endswith("freqs"):
96+
continue
13997

140-
#data = tf.train.load_variable(dir_model, name).squeeze()
141-
data = v.numpy().squeeze()
142-
n_dims = len(data.shape);
98+
shape = datao.shape
14399

144-
# for efficiency - transpose some matrices
145-
# "model/h.*/attn/c_attn/w"
146-
# "model/h.*/attn/c_proj/w"
147-
# "model/h.*/mlp/c_fc/w"
148-
# "model/h.*/mlp/c_proj/w"
149-
#if name[-14:] == "/attn/c_attn/w" or \
150-
# name[-14:] == "/attn/c_proj/w" or \
151-
# name[-11:] == "/mlp/c_fc/w" or \
152-
# name[-13:] == "/mlp/c_proj/w":
153-
# print(" Transposing")
154-
# data = data.transpose()
100+
print(f"Processing variable: {name} with shape: {shape} and type: {datao.dtype}")
155101

156-
dshape = data.shape
102+
data = datao.numpy().squeeze()
103+
n_dims = len(shape)
157104

158105
# default type is fp16
159106
ftype_cur = 1
@@ -164,18 +111,40 @@ def get_n_parts(dim):
164111

165112
# header
166113
sname = name.encode('utf-8')
167-
fout.write(struct.pack("iii", n_dims, len(sname), ftype_cur))
168-
for i in range(n_dims):
169-
fout.write(struct.pack("i", dshape[n_dims - 1 - i]))
170-
fout.write(sname);
114+
fout.write(struct.pack("iii", len(data.shape), len(sname), ftype_cur))
115+
for dim in reversed(data.shape):
116+
fout.write(struct.pack("i", dim))
117+
fout.write(sname)
171118

172-
# data
119+
# data output to file
173120
data.tofile(fout)
174121

175-
# I hope this deallocates the memory ..
176-
model = None
122+
def main():
123+
124+
args = parse_args()
125+
dir_model = args.dir_model
126+
ftype = args.ftype
127+
ftype_str = ["f32", "f16"]
128+
129+
hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
130+
n_parts = get_n_parts(hparams["dim"])
131+
132+
for p in range(n_parts):
133+
134+
print(f"Processing part {p}\n")
135+
136+
fname_model = f"{dir_model}/consolidated.0{p}.pth"
137+
fname_out = f"{dir_model}/ggml-model-{ftype_str[ftype]}.bin{'' if p == 0 else '.' + str(p)}"
138+
139+
model = torch.load(fname_model, map_location="cpu")
140+
141+
with open(fname_out, "wb") as fout:
142+
write_header(fout, hparams, ftype)
143+
write_tokens(fout, tokenizer)
144+
process_and_write_variables(fout, model, ftype)
177145

178-
fout.close()
146+
del model
147+
print(f"Done. Output file: {fname_out}, (part {p})\n")
179148

180-
print("Done. Output file: " + fname_out + ", (part ", p, ")")
181-
print("")
149+
if __name__ == "__main__":
150+
main()

expose.cpp

+2-3
Original file line number | Diff line number | Diff line change
@@ -55,7 +55,7 @@ extern "C" {
5555

5656
int n_parts_overwrite = inputs.n_parts_overwrite;
5757

58-
if (!llama_model_load(api_params.model, api_model, api_vocab, api_params.n_ctx, n_parts_overwrite)) {
58+
if (!llama_model_load(api_params.model, api_model, api_vocab, api_params.n_ctx, GGML_TYPE_F16, n_parts_overwrite)) {
5959
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, api_params.model.c_str());
6060
return false;
6161
}
@@ -101,7 +101,6 @@ extern "C" {
101101
if(reset_state)
102102
{
103103
api_params.prompt.insert(0, 1, ' ');
104-
mem_per_token = 0;
105104
}
106105
// tokenize the prompt
107106
std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(api_vocab, api_params.prompt, true);
@@ -164,7 +163,7 @@ extern "C" {
164163

165164
{
166165
// set the logit of the eos token (2) to zero to avoid sampling it
167-
api_logits[api_logits.size() - n_vocab + 2] = 0;
166+
api_logits[api_logits.size() - n_vocab + EOS_TOKEN_ID] = 0;
168167
//set logits of opening square bracket to zero.
169168
api_logits[api_logits.size() - n_vocab + 518] = 0;
170169
api_logits[api_logits.size() - n_vocab + 29961] = 0;

llamacpp.dll

6.55 KB
Binary file not shown.

0 commit comments

Comments
 (0)