
Commit 8d39365

Updated license, added backwards compatibility with both ggml model formats, fixed context length issues.
1 parent a2c10e0 commit 8d39365

11 files changed: +807 -15 lines

LICENSE.md (+661)

Large diff not rendered by default.

LICENSE → MIT_LICENSE_GGML_LLAMACPP_ONLY (+5)

@@ -19,3 +19,8 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
+
+===================================
+
+Note that the above license applies ONLY to the GGML library and llama.cpp by ggerganov which are licensed under the MIT License
+Kobold Lite by Concedo and the provided python ctypes bindings in llamacpp.dll are licensed under the AGPL v3.0 License

Makefile (+7 -4)

@@ -188,15 +188,18 @@ ggml.o: ggml.c ggml.h
 utils.o: utils.cpp utils.h
 	$(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o
 
+extra.o: extra.cpp extra.h
+	$(CXX) $(CXXFLAGS) -c extra.cpp -o extra.o
+
 clean:
 	rm -f *.o main quantize
 
-main: main.cpp ggml.o utils.o
-	$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS)
+main: main.cpp ggml.o utils.o extra.o
+	$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o extra.o -o main $(LDFLAGS)
 	./main -h
 
-llamalib: expose.cpp ggml.o utils.o
-	$(CXX) $(CXXFLAGS) expose.cpp ggml.o utils.o -shared -o llamacpp.dll $(LDFLAGS)
+llamalib: expose.cpp ggml.o utils.o extra.o
+	$(CXX) $(CXXFLAGS) expose.cpp ggml.o utils.o extra.o -shared -o llamacpp.dll $(LDFLAGS)
 
 quantize: quantize.cpp ggml.o utils.o
 	$(CXX) $(CXXFLAGS) quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS)

README.md (+7 -1)

@@ -16,4 +16,10 @@ If you care, **please contribute to [this discussion](https://github.com/ggergan
 ## Usage
 - Windows binaries are provided in the form of **llamacpp.dll** but if you feel worried go ahead and rebuild it yourself.
 - Weights are not included, you can use the llama.cpp quantize.exe to generate them from your official weight files (or download them from...places).
-- To run, simply clone the repo and run `llama_for_kobold.py [ggml_quant_model.bin] [port]`, and then connect with Kobold or Kobold Lite (for example, https://lite.koboldai.net/?local=1&port=5001).
+- To run, simply clone the repo and run `llama_for_kobold.py [ggml_quant_model.bin] [port]`, and then connect with Kobold or Kobold Lite.
+- By default, you can connect to http://localhost:5001 (you can also use https://lite.koboldai.net/?local=1&port=5001).
+
+## License
+- The original GGML library and llama.cpp by ggerganov are licensed under the MIT License
+- However, Kobold Lite is licensed under the AGPL v3.0 License
+- The provided python ctypes bindings in llamacpp.dll are also under the AGPL v3.0 License

expose.cpp (+25 -3)

@@ -8,6 +8,7 @@
 //Python will ALWAYS provide the memory, we just write to it.
 
 #include "main.cpp"
+#include "extra.h"
 
 extern "C" {
 
@@ -45,6 +46,7 @@ extern "C" {
     std::vector<float> api_logits;
     std::vector<gpt_vocab::id> last_n_tokens;
     size_t mem_per_token = 0;
+    bool legacy_format = false;
 
     bool load_model(const load_model_inputs inputs)
     {
@@ -55,10 +57,16 @@ extern "C" {
 
        int n_parts_overwrite = inputs.n_parts_overwrite;
 
-       if (!llama_model_load(api_params.model, api_model, api_vocab, api_params.n_ctx, GGML_TYPE_F16, n_parts_overwrite)) {
+       int loadresult = llama_model_load(api_params.model, api_model, api_vocab, api_params.n_ctx, GGML_TYPE_F16, n_parts_overwrite);
+       if (!loadresult) {
           fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, api_params.model.c_str());
           return false;
       }
+       legacy_format = (loadresult==2?true:false);
+       if(legacy_format)
+       {
+           printf("\n---\nWarning: Your model is using an OUTDATED format. Please reconvert it for better results!\n");
+       }
 
       return true;
   }
@@ -103,8 +111,20 @@ extern "C" {
           api_params.prompt.insert(0, 1, ' ');
       }
       // tokenize the prompt
-      std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(api_vocab, api_params.prompt, true);
-      api_params.n_predict = std::min(api_params.n_predict, api_model.hparams.n_ctx - (int)embd_inp.size());
+      std::vector<gpt_vocab::id> embd_inp;
+      if(legacy_format)
+      {
+          embd_inp = ::legacy_llama_tokenize(api_vocab, api_params.prompt, true);
+      }else{
+          embd_inp = ::llama_tokenize(api_vocab, api_params.prompt, true);
+      }
+
+      //api_params.n_predict = std::min(api_params.n_predict, api_model.hparams.n_ctx - (int)embd_inp.size());
+      //truncate to front of the prompt if its too long
+      if (embd_inp.size() + api_params.n_predict > api_model.hparams.n_ctx) {
+          int offset = embd_inp.size() - api_model.hparams.n_ctx + api_params.n_predict;
+          embd_inp = std::vector<gpt_vocab::id>(embd_inp.begin() + offset, embd_inp.end());
+      }
       std::vector<gpt_vocab::id> embd;
 
       int last_n_size = api_params.repeat_last_n;
@@ -131,6 +151,7 @@ extern "C" {
       std::mt19937 api_rng(api_params.seed);
       std::string concat_output = "";
 
+      printf("\nProcessing: ");
       while (remaining_tokens > 0)
       {
           gpt_vocab::id id = 0;
@@ -141,6 +162,7 @@ extern "C" {
           // std::cout << i << ',';
           // }
           //printf("\nnp:%d embd:%d mem:%d",api_n_past,embd.size(),mem_per_token);
+          printf("|");
           if (!llama_eval(api_model, api_params.n_threads, api_n_past, embd, api_logits, mem_per_token))
           {
               fprintf(stderr, "Failed to predict\n");
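The context-length fix above drops the old clamp on n_predict and instead truncates the prompt from the front: when the tokenized prompt plus the requested generation budget would exceed the model's context window, the oldest tokens are discarded so the most recent ones are kept. A minimal self-contained sketch of that arithmetic (truncate_front and the plain int token type are placeholders, not names from the patch):

    #include <cassert>
    #include <vector>

    // Keep only the newest tokens so that prompt + n_predict fits inside n_ctx.
    // Hypothetical helper mirroring the truncation added to expose.cpp above.
    static std::vector<int> truncate_front(std::vector<int> tokens, int n_ctx, int n_predict) {
        if ((int)tokens.size() + n_predict > n_ctx) {
            int offset = (int)tokens.size() - n_ctx + n_predict; // number of oldest tokens to drop
            tokens.erase(tokens.begin(), tokens.begin() + offset);
        }
        return tokens;
    }

    int main() {
        std::vector<int> prompt(600);                          // 600 prompt tokens
        std::vector<int> kept = truncate_front(prompt, 512, 128);
        assert(kept.size() == 384);                            // only 512 - 128 of them survive
        return 0;
    }

With a 512-token context and a 128-token generation budget, for example, at most 384 prompt tokens ever reach the model.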

extra.cpp (+72, new file)

@@ -0,0 +1,72 @@
+
+#include "extra.h"
+
+#include <cassert>
+#include <cstring>
+#include <fstream>
+#include <regex>
+#include <iostream>
+#include <iterator>
+#include <queue>
+#include <string>
+#include <math.h>
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include <malloc.h> // using malloc.h with MSC/MINGW
+#elif !defined(__FreeBSD__) && !defined(__NetBSD__)
+#include <alloca.h>
+#endif
+
+// TODO: Calculate this constant from the vocabulary
+#define MAX_TOKEN_LEN 18
+// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece
+std::vector<gpt_vocab::id> legacy_llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
+    std::vector<gpt_vocab::id> res;
+    std::vector<int> score;
+    std::vector<gpt_vocab::id> prev;
+    int len = text.length();
+
+    score.resize(len + 1);
+    prev.resize(len + 1);
+
+    // Forward pass
+    for (int i = 0; i < len; i++) {
+        int max_len = std::min(len - i, MAX_TOKEN_LEN);
+        for (int sub_len = 1; sub_len <= max_len; sub_len++) {
+            auto sub = text.substr(i, sub_len);
+            auto token = vocab.token_to_id.find(sub);
+            if (token != vocab.token_to_id.end()) {
+                int token_score = sub.length() * sub.length();
+                int local_score = score[i] + token_score;
+                int next = i + sub_len;
+                if (score[next] < local_score) {
+                    score[next] = local_score;
+                    prev[next] = (*token).second;
+                }
+            }
+        }
+    }
+
+    // Backward pass
+    int i = len;
+    while (i > 0) {
+        gpt_vocab::id token_id = prev[i];
+        if (token_id == 0) {
+            // TODO: Return error or something more meaningful
+            printf("failed to tokenize string!\n");
+            break;
+        }
+        res.push_back(token_id);
+        auto token = (*vocab.id_to_token.find(token_id)).second;
+        i -= token.length();
+    }
+
+    if (bos) {
+        res.push_back(1); // TODO: replace with vocab.bos
+    }
+
+    // Pieces are in reverse order so correct that
+    std::reverse(res.begin(), res.end());
+
+    return res;
+}
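legacy_llama_tokenize scores every vocabulary piece matching at a given position by the square of its length, keeps the best-scoring path to each position in a forward pass, then walks the chosen pieces backwards (the approach described in the linked SentencePiece write-up). A toy, self-contained version with a hypothetical std::map vocabulary standing in for gpt_vocab shows why the squared score prefers one long piece over several short ones:

    #include <algorithm>
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    // Toy re-implementation of the forward/backward pass in legacy_llama_tokenize,
    // using a hypothetical std::map vocabulary instead of gpt_vocab.
    static std::vector<int> toy_tokenize(const std::map<std::string, int> & vocab, const std::string & text) {
        const int len = (int)text.length();
        std::vector<int> score(len + 1, 0);   // best score of any segmentation ending at position i
        std::vector<int> prev(len + 1, 0);    // token id that produced that best score

        // Forward pass: longer matches score length^2, so "there" beats "the" + "re".
        for (int i = 0; i < len; i++) {
            for (int sub_len = 1; i + sub_len <= len; sub_len++) {
                auto it = vocab.find(text.substr(i, sub_len));
                if (it == vocab.end()) continue;
                int local_score = score[i] + sub_len * sub_len;
                if (score[i + sub_len] < local_score) {
                    score[i + sub_len] = local_score;
                    prev[i + sub_len]  = it->second;
                }
            }
        }

        // Backward pass: walk the chosen pieces from the end of the string.
        std::map<int, std::string> id_to_token;
        for (const auto & kv : vocab) id_to_token[kv.second] = kv.first;
        std::vector<int> res;
        int i = len;
        while (i > 0) {
            int id = prev[i];
            if (id == 0) break;               // no valid segmentation reaches this point
            res.push_back(id);
            i -= (int)id_to_token[id].length();
        }
        std::reverse(res.begin(), res.end());
        return res;
    }

    int main() {
        // Hypothetical vocabulary; ids start at 1 because 0 marks "no token".
        std::map<std::string, int> vocab = {{"t", 1}, {"h", 2}, {"e", 3}, {"the", 4}, {"re", 5}, {"there", 6}};
        for (int id : toy_tokenize(vocab, "there")) std::cout << id << ' ';   // prints "6"
        std::cout << '\n';
        return 0;
    }

Here "there" scores 25, while "the" + "re" only reaches 9 + 4 = 13, so the single id 6 is emitted.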

extra.h (+14, new file)

@@ -0,0 +1,14 @@
+#include "utils.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <string>
+#include <vector>
+
+std::vector<gpt_vocab::id> legacy_llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);

llama_for_kobold.py (+3 -2)

@@ -74,8 +74,8 @@ def generate(prompt,max_length=20,temperature=0.8,top_k=100,top_p=0.85,rep_pen=1
 # global vars
 global friendlymodelname
 friendlymodelname = ""
-maxctx = 1024
-maxlen = 256
+maxctx = 512
+maxlen = 128
 modelbusy = False
 port = 5001
 last_context = ""
@@ -151,6 +151,7 @@ def do_POST(self):
        if last_context!="" and newprompt.startswith(last_context):
            fresh_state = False
            newprompt = newprompt[len(last_context):]
+            print("Resuming state, new input len: " + str(len(newprompt)))
        #print("trimmed: " + newprompt)
        recvtxt = generate(
            prompt=newprompt,
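The do_POST change reuses the previous request's state: when the incoming prompt starts with the context that was already processed (last_context), only the unseen suffix is handed to generate. The same prefix check, sketched in C++ purely for illustration (trim_seen_prefix is a hypothetical name; the real logic lives in the Python server):

    #include <iostream>
    #include <string>

    // If the incoming prompt extends the previously processed context, return only
    // the unseen suffix and clear fresh_state so the caller resumes instead of resetting.
    static std::string trim_seen_prefix(const std::string & new_prompt,
                                        const std::string & last_context,
                                        bool & fresh_state) {
        fresh_state = true;
        if (!last_context.empty() && new_prompt.rfind(last_context, 0) == 0) {
            fresh_state = false;                           // resume from the previous state
            return new_prompt.substr(last_context.size());
        }
        return new_prompt;
    }

    int main() {
        bool fresh = true;
        std::string suffix = trim_seen_prefix("Once upon a time, there was", "Once upon a time,", fresh);
        std::cout << (fresh ? "fresh" : "resume") << ": '" << suffix << "'\n";  // resume: ' there was'
        return 0;
    }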

llamacpp.dll (5.67 KB)

Binary file not shown.

main.cpp (+13 -5)

@@ -89,8 +89,8 @@ struct llama_model {
     std::map<std::string, struct ggml_tensor *> tensors;
 };
 
-// load the model's weights from a file
-bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, ggml_type memory_type = GGML_TYPE_F32, int n_parts_overwrite=-1) {
+// load the model's weights from a file. return val: 0=fail, 1=newformat, 2=legacy
+int llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, ggml_type memory_type = GGML_TYPE_F32, int n_parts_overwrite=-1) {
     fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     std::vector<char> f_buf(1024*1024);
@@ -102,15 +102,18 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         return false;
     }
 
+    bool legacy_file_format = false;
     // verify magic
     {
         uint32_t magic;
         fin.read((char *) &magic, sizeof(magic));
         if (magic == 0x67676d6c) {
            fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
                    __func__, fname.c_str());
-            return false;
+            legacy_file_format = true;
        }
+        else
+        {
        if (magic != 0x67676d66) {
            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
            return false;
@@ -124,6 +127,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
                    __func__, fname.c_str(), format_version);
            return false;
        }
+        }
     }
 
     int n_ff = 0;
@@ -173,12 +177,16 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
            word.resize(len);
            fin.read((char *) word.data(), len);
 
+            if(!legacy_file_format)
+            {
            float score;
            fin.read((char *) &score, sizeof(score));
+            vocab.score[i] = score;
+            }
 
            vocab.token_to_id[word] = i;
            vocab.id_to_token[i] = word;
-            vocab.score[i] = score;
+
 
            //if (i < 30000) {
            //    fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
@@ -531,7 +539,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
        fin.close();
    }
 
-    return true;
+    return (legacy_file_format?2:true);
 }
 
 // evaluate the transformer
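llama_model_load now returns an int instead of a bool: 0 for failure, 1 (true) for the current format, and 2 for the legacy format, decided by the leading magic value 0x67676d6c ("ggml", the old unversioned files) versus 0x67676d66 ("ggmf", the newer files that carry a version field and per-token scores). A standalone sketch of just that classification step (classify_model_format is a hypothetical helper, not part of the repository):

    #include <cstdint>
    #include <cstdio>
    #include <fstream>
    #include <string>

    // Classify a model file by its leading magic value, mirroring the check in
    // llama_model_load. Return codes follow the patch: 0 = unknown, 1 = current, 2 = legacy.
    static int classify_model_format(const std::string & fname) {
        std::ifstream fin(fname, std::ios::binary);
        if (!fin) return 0;
        uint32_t magic = 0;
        fin.read(reinterpret_cast<char *>(&magic), sizeof(magic));
        if (magic == 0x67676d6c) return 2;   // 'ggml' - unversioned legacy format
        if (magic == 0x67676d66) return 1;   // 'ggmf' - current format with a version field
        return 0;                            // bad magic
    }

    int main(int argc, char ** argv) {
        if (argc < 2) { std::fprintf(stderr, "usage: %s model.bin\n", argv[0]); return 1; }
        std::printf("format code: %d\n", classify_model_format(argv[1]));
        return 0;
    }

This is why expose.cpp above can set legacy_format from the load result and route prompts through legacy_llama_tokenize when needed.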

main.exe (5.94 KB)

Binary file not shown.
