
Commit 074bea2

eiz and slaren authored
sentencepiece bpe compatible tokenizer (#252)
* potential out of bounds read
* fix quantize
* style
* Update convert-pth-to-ggml.py
* mild cleanup
* don't need the space-prefixing here rn since main.cpp already does it
* new file magic + version header field
* readme notice
* missing newlines

Co-authored-by: slaren <[email protected]>
1 parent 5cb63e2 commit 074bea2

7 files changed: +182 -46 lines

Makefile

+1 -1

@@ -31,7 +31,7 @@ endif
 #
 
 CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
+CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++17 -fPIC
 LDFLAGS  =
 
 # OS specific

README.md

+3 -0

@@ -11,6 +11,9 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 - Cache input prompts for faster initialization: https://github.com/ggerganov/llama.cpp/issues/64
 - Create a `llama.cpp` logo: https://github.com/ggerganov/llama.cpp/issues/105
 
+**TEMPORARY NOTICE:**
+If you're updating to the latest master, you will need to regenerate your model files as the format has changed.
+
 ## Description
 
 The main goal is to run the model using 4-bit quantization on a MacBook

convert-pth-to-ggml.py

+3 -1

@@ -60,7 +60,8 @@ def write_header(fout, hparams, ftype):
 
     keys = ["vocab_size", "dim", "multiple_of", "n_heads", "n_layers"]
     values = [
-        0x67676d6c, # magic: ggml in hex
+        0x67676d66, # magic: ggml in hex
+        1, # file version
         *[hparams[key] for key in keys],
         hparams["dim"] // hparams["n_heads"], # rot (obsolete)
         ftype
@@ -85,6 +86,7 @@ def write_tokens(fout, tokenizer):
         text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
         fout.write(struct.pack("i", len(text)))
         fout.write(text)
+        fout.write(struct.pack("f", tokenizer.get_score(i)))
 
 def process_and_write_variables(fout, model, ftype):
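Taken together, write_header and write_tokens above give the regenerated model files this on-disk prefix: nine 32-bit integers (magic 0x67676d66, file version 1, the five hparams, rot, ftype), followed by one record per vocabulary entry consisting of a 32-bit length, the UTF-8 piece, and now a 32-bit float score. The standalone C++ sketch below shows that layout from the reader's side; it is illustrative only (ggmf_header and read_header_and_vocab are not names from the repository) and assumes the file was produced on a machine with the same endianness.

// Minimal sketch of a reader for the new file prefix written by convert-pth-to-ggml.py.
// The struct and function names are hypothetical, for illustration only.
#include <cstdint>
#include <cstdio>
#include <fstream>
#include <string>

struct ggmf_header {
    uint32_t magic;    // expected 0x67676d66
    uint32_t version;  // expected 1
    int32_t  vocab_size, dim, multiple_of, n_heads, n_layers, rot, ftype;
};

bool read_header_and_vocab(const std::string & path) {
    std::ifstream fin(path, std::ios::binary);
    if (!fin) return false;

    ggmf_header h;
    fin.read((char *) &h, sizeof(h));
    if (h.magic != 0x67676d66 || h.version != 1) return false;

    // each vocab entry: int32 length, `length` bytes of UTF-8 text, float score
    for (int32_t i = 0; i < h.vocab_size; ++i) {
        int32_t len = 0;
        fin.read((char *) &len, sizeof(len));
        std::string word(len, 0);
        fin.read(word.data(), len);
        float score = 0.0f;
        fin.read((char *) &score, sizeof(score));
        printf("token %d: '%s' (score %f)\n", i, word.c_str(), score);
    }
    return true;
}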

main.cpp

+20 -1

@@ -3,6 +3,7 @@
 #include "utils.h"
 
 #include <cassert>
+#include <cinttypes>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
@@ -105,10 +106,24 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
     {
         uint32_t magic;
         fin.read((char *) &magic, sizeof(magic));
-        if (magic != 0x67676d6c) {
+        if (magic == 0x67676d6c) {
+            fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
+                    __func__, fname.c_str());
+            return false;
+        }
+        if (magic != 0x67676d66) {
             fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
             return false;
         }
+
+        uint32_t format_version;
+        fin.read((char *) &format_version, sizeof(format_version));
+
+        if (format_version != 1) {
+            fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ")\n",
+                    __func__, fname.c_str(), format_version);
+            return false;
+        }
     }
 
     int n_ff = 0;
@@ -154,8 +169,12 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
             word.resize(len);
             fin.read((char *) word.data(), len);
 
+            float score;
+            fin.read((char *) &score, sizeof(score));
+
             vocab.token_to_id[word] = i;
             vocab.id_to_token[i] = word;
+            vocab.score[i] = score;
 
             //if (i < 30000) {
             //    fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
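Both magic constants checked above are just ASCII packed into a 32-bit literal: the bytes 0x67, 0x67, 0x6d, 0x6c spell "ggml" (the old files) and 0x67, 0x67, 0x6d, 0x66 spell "ggmf" (the new ones), which is why the loader can distinguish an outdated file from a corrupt one. A throwaway snippet makes that visible; print_magic is hypothetical and not part of the codebase.

// Illustrative only: decode the magic literals byte by byte, most significant first.
#include <cstdint>
#include <cstdio>

static void print_magic(uint32_t magic) {
    printf("%c%c%c%c\n",
           (int) ((magic >> 24) & 0xff), (int) ((magic >> 16) & 0xff),
           (int) ((magic >>  8) & 0xff), (int) ( magic        & 0xff));
}

int main() {
    print_magic(0x67676d6c); // prints "ggml" (old format)
    print_magic(0x67676d66); // prints "ggmf" (new format)
    return 0;
}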

quantize.cpp

+23 -1

@@ -3,6 +3,7 @@
 #include "utils.h"
 
 #include <cassert>
+#include <cinttypes>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
@@ -63,12 +64,28 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
     {
         uint32_t magic;
         finp.read((char *) &magic, sizeof(magic));
-        if (magic != 0x67676d6c) {
+        if (magic == 0x67676d6c) {
+            fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
+                    __func__, fname_inp.c_str());
+            return false;
+        }
+        if (magic != 0x67676d66) {
             fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
             return false;
         }
 
         fout.write((char *) &magic, sizeof(magic));
+
+        uint32_t format_version;
+        finp.read((char *) &format_version, sizeof(format_version));
+
+        if (format_version != 1) {
+            fprintf(stderr, "%s: invalid model file '%s' (unsupported format version %" PRIu32 ")\n",
+                    __func__, fname_inp.c_str(), format_version);
+            return false;
+        }
+
+        fout.write((char *) &format_version, sizeof(format_version));
     }
 
     llama_hparams hparams;
@@ -122,8 +139,13 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
             finp.read ((char *) word.data(), len);
             fout.write((char *) word.data(), len);
 
+            float score;
+            finp.read ((char *) &score, sizeof(score));
+            fout.write((char *) &score, sizeof(score));
+
             vocab.token_to_id[word] = i;
             vocab.id_to_token[i] = word;
+            vocab.score[i] = score;
         }
     }
 

utils.cpp

+130 -41

@@ -6,6 +6,7 @@
 #include <regex>
 #include <iostream>
 #include <iterator>
+#include <queue>
 #include <string>
 #include <math.h>
 
@@ -294,58 +295,146 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
     return tokens;
 }
 
-// TODO: Calculate this constant from the vocabulary
-#define MAX_TOKEN_LEN 18
-// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece
-std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
-    std::vector<gpt_vocab::id> res;
-    std::vector<int> score;
-    std::vector<gpt_vocab::id> prev;
-    int len = text.length();
-
-    score.resize(len + 1);
-    prev.resize(len + 1);
-
-    // Forward pass
-    for (int i = 0; i < len; i++) {
-        int max_len = std::min(len - i, MAX_TOKEN_LEN);
-        for (int sub_len = 1; sub_len <= max_len; sub_len++) {
-            auto sub = text.substr(i, sub_len);
-            auto token = vocab.token_to_id.find(sub);
-            if (token != vocab.token_to_id.end()) {
-                int token_score = sub.length() * sub.length();
-                int local_score = score[i] + token_score;
-                int next = i + sub_len;
-                if (score[next] < local_score) {
-                    score[next] = local_score;
-                    prev[next] = (*token).second;
+static size_t utf8_len(char src) {
+    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
+    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
+    return lookup[highbits];
+}
+
+struct llama_sp_symbol {
+    using index = int;
+    index prev;
+    index next;
+    std::string_view text;
+};
+
+struct llama_sp_bigram {
+    struct comparator {
+        bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
+            return (l.score < r.score) || (l.score == r.score && l.left > r.left);
+        }
+    };
+    using queue_storage = std::vector<llama_sp_bigram>;
+    using queue = std::priority_queue<llama_sp_bigram, queue_storage, comparator>;
+    llama_sp_symbol::index left;
+    llama_sp_symbol::index right;
+    float score;
+    size_t size;
+};
+
+struct llama_tokenizer {
+    llama_tokenizer(const gpt_vocab & vocab): vocab_(vocab) {}
+
+    void tokenize(std::string_view text, std::vector<gpt_vocab::id> & output) {
+        // split string into utf8 chars
+        int index = 0;
+        while (!text.empty()) {
+            llama_sp_symbol sym;
+            size_t char_len = std::min(text.size(), utf8_len(text.data()[0]));
+            sym.text = std::string_view(text.data(), char_len);
+            sym.prev = index - 1;
+            text.remove_prefix(char_len);
+            sym.next = text.empty() ? -1 : index + 1;
+            index++;
+            symbols_.emplace_back(std::move(sym));
+        }
+
+        // seed the work queue with all possible 2-character tokens.
+        for (size_t i = 1; i < symbols_.size(); ++i) {
+            try_add_bigram(i - 1, i);
+        }
+
+        // keep substituting the highest frequency pairs for as long as we can.
+        while (!work_queue_.empty()) {
+            auto bigram = work_queue_.top();
+            work_queue_.pop();
+
+            auto & left_sym = symbols_[bigram.left];
+            auto & right_sym = symbols_[bigram.right];
+
+            // if one of the symbols already got merged, skip it.
+            if (left_sym.text.empty() || right_sym.text.empty() ||
+                left_sym.text.size() + right_sym.text.size() != bigram.size) {
+                continue;
+            }
+
+            // merge the right sym into the left one
+            left_sym.text = std::string_view(left_sym.text.data(), left_sym.text.size() + right_sym.text.size());
+            right_sym.text = std::string_view("");
+
+            // remove the right sym from the chain
+            left_sym.next = right_sym.next;
+            if (right_sym.next >= 0) {
+                symbols_[right_sym.next].prev = bigram.left;
+            }
+
+            // find more substitutions
+            try_add_bigram(left_sym.prev, bigram.left);
+            try_add_bigram(bigram.left, left_sym.next);
+        }
+
+        for (int i = 0; i != -1; i = symbols_[i].next) {
+            auto& symbol = symbols_[i];
+            auto token = vocab_.token_to_id.find(std::string(symbol.text));
+
+            if (token == vocab_.token_to_id.end()) {
+                // output any symbols that did not form tokens as bytes.
+                for (int j = 0; j < symbol.text.size(); ++j) {
+                    gpt_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
+                    output.push_back(token_id);
                 }
+            } else {
+                output.push_back((*token).second);
             }
         }
     }
 
-    // Backward pass
-    int i = len;
-    while (i > 0) {
-        gpt_vocab::id token_id = prev[i];
-        if (token_id == 0) {
-            // TODO: Return error or something more meaningful
-            printf("failed to tokenize string!\n");
-            break;
+private:
+    void try_add_bigram(int left, int right) {
+        if (left == -1 || right == -1) {
+            return;
+        }
+
+        std::string_view text(symbols_[left].text.data(), symbols_[left].text.size() + symbols_[right].text.size());
+        auto token = vocab_.token_to_id.find(std::string(text));
+
+        if (token == vocab_.token_to_id.end()) {
+            return;
         }
-        res.push_back(token_id);
-        auto token = (*vocab.id_to_token.find(token_id)).second;
-        i -= token.length();
+
+        auto score = vocab_.score.find((*token).second);
+
+        if (score == vocab_.score.end()) {
+            return;
+        }
+
+        llama_sp_bigram bigram;
+        bigram.left = left;
+        bigram.right = right;
+        bigram.score = (*score).second;
+        bigram.size = text.size();
+        work_queue_.push(bigram);
    }
 
-    if (bos) {
-        res.push_back(1); // TODO: replace with vocab.bos
+    const gpt_vocab & vocab_;
+    std::vector<llama_sp_symbol> symbols_;
+    llama_sp_bigram::queue work_queue_;
+};
+
+std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, std::string_view text, bool bos) {
+    llama_tokenizer tokenizer(vocab);
+    std::vector<gpt_vocab::id> output;
+
+    if (text.size() == 0) {
+        return output;
     }
 
-    // Pieces are in reverse order so correct that
-    std::reverse(res.begin(), res.end());
+    if (bos) {
+        output.push_back(1);
+    }
 
-    return res;
+    tokenizer.tokenize(text, output);
+    return output;
 }
 
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
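A hypothetical usage sketch of the reworked tokenizer (not part of this commit): it hand-builds a tiny scored vocabulary instead of loading a model file, then calls llama_tokenize so the bigram merging is easy to observe. It assumes gpt_vocab::token is std::string, as declared in utils.h, and would need to be linked against utils.cpp.

// Hypothetical example, for illustration only: drive llama_tokenize() with a toy vocabulary.
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

#include "utils.h"

int main() {
    gpt_vocab vocab;

    // toy pieces with made-up scores; in practice both come from the model file
    const std::pair<std::string, float> pieces[] = {
        {"h", 0.0f}, {"e", 0.0f}, {"l", 0.0f}, {"o", 0.0f},
        {"he", 1.0f}, {"ll", 1.0f}, {"hell", 2.0f}, {"hello", 3.0f},
    };
    gpt_vocab::id next_id = 3; // low ids are reserved (1 is used as BOS above)
    for (const auto & p : pieces) {
        vocab.token_to_id[p.first] = next_id;
        vocab.id_to_token[next_id] = p.first;
        vocab.score[next_id] = p.second;
        ++next_id;
    }

    // merges proceed h+e -> he, l+l -> ll, he+ll -> hell, hell+o -> hello,
    // so the whole word comes back as one token after the BOS token (id 1)
    std::vector<gpt_vocab::id> ids = llama_tokenize(vocab, "hello", /*bos=*/true);
    for (gpt_vocab::id id : ids) {
        printf("%d -> '%s'\n", id, id == 1 ? "<bos>" : vocab.id_to_token[id].c_str());
    }
    return 0;
}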

utils.h

+2 -1

@@ -58,6 +58,7 @@ struct gpt_vocab {
 
     std::map<token, id> token_to_id;
     std::map<id, token> id_to_token;
+    std::map<id, float> score;
 };
 
 void replace(std::string & str, const std::string & needle, const std::string & replacement);
@@ -79,7 +80,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
 
 // TODO: this is probably wrong, but I cannot figure out how this tokenizer works ..
 // ref: https://github.com/google/sentencepiece
-std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);
+std::vector<gpt_vocab::id> llama_tokenize(const gpt_vocab & vocab, std::string_view text, bool bos);
 
 // load the tokens from encoder.json
 bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
