
Commit 8d39365

Updated license, added backwards compatibility with both ggml model formats, fixed context length issues.
1 parent a2c10e0 commit 8d39365

11 files changed: +807 -15 lines

LICENSE.md (+661)

Large diff not rendered by default.

LICENSE → MIT_LICENSE_GGML_LLAMACPP_ONLY (+5)

@@ -19,3 +19,8 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
+
+===================================
+
+Note that the above license applies ONLY to the GGML library and llama.cpp by ggerganov which are licensed under the MIT License
+Kobold Lite by Concedo and the provided python ctypes bindings in llamacpp.dll are licensed under the AGPL v3.0 License

Makefile (+7 -4)

@@ -188,15 +188,18 @@ ggml.o: ggml.c ggml.h
 utils.o: utils.cpp utils.h
 	$(CXX) $(CXXFLAGS) -c utils.cpp -o utils.o
 
+extra.o: extra.cpp extra.h
+	$(CXX) $(CXXFLAGS) -c extra.cpp -o extra.o
+
 clean:
 	rm -f *.o main quantize
 
-main: main.cpp ggml.o utils.o
-	$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS)
+main: main.cpp ggml.o utils.o extra.o
+	$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o extra.o -o main $(LDFLAGS)
 	./main -h
 
-llamalib: expose.cpp ggml.o utils.o
-	$(CXX) $(CXXFLAGS) expose.cpp ggml.o utils.o -shared -o llamacpp.dll $(LDFLAGS)
+llamalib: expose.cpp ggml.o utils.o extra.o
+	$(CXX) $(CXXFLAGS) expose.cpp ggml.o utils.o extra.o -shared -o llamacpp.dll $(LDFLAGS)
 
 quantize: quantize.cpp ggml.o utils.o
 	$(CXX) $(CXXFLAGS) quantize.cpp ggml.o utils.o -o quantize $(LDFLAGS)

README.md (+7 -1)

@@ -16,4 +16,10 @@ If you care, **please contribute to [this discussion](https://github.com/ggergan
 ## Usage
 - Windows binaries are provided in the form of **llamacpp.dll** but if you feel worried go ahead and rebuild it yourself.
 - Weights are not included, you can use the llama.cpp quantize.exe to generate them from your official weight files (or download them from...places).
-- To run, simply clone the repo and run `llama_for_kobold.py [ggml_quant_model.bin] [port]`, and then connect with Kobold or Kobold Lite (for example, https://lite.koboldai.net/?local=1&port=5001).
+- To run, simply clone the repo and run `llama_for_kobold.py [ggml_quant_model.bin] [port]`, and then connect with Kobold or Kobold Lite.
+- By default, you can connect to http://localhost:5001 (you can also use https://lite.koboldai.net/?local=1&port=5001).
+
+## License
+- The original GGML library and llama.cpp by ggerganov are licensed under the MIT License
+- However, Kobold Lite is licensed under the AGPL v3.0 License
+- The provided python ctypes bindings in llamacpp.dll are also under the AGPL v3.0 License

expose.cpp (+25 -3)

@@ -8,6 +8,7 @@
 //Python will ALWAYS provide the memory, we just write to it.
 
 #include "main.cpp"
+#include "extra.h"
 
 extern "C" {
 
@@ -45,6 +46,7 @@ extern "C" {
     std::vector<float> api_logits;
     std::vector<gpt_vocab::id> last_n_tokens;
     size_t mem_per_token = 0;
+    bool legacy_format = false;
 
     bool load_model(const load_model_inputs inputs)
     {
@@ -55,10 +57,16 @@ extern "C" {
 
        int n_parts_overwrite = inputs.n_parts_overwrite;
 
-       if (!llama_model_load(api_params.model, api_model, api_vocab, api_params.n_ctx, GGML_TYPE_F16, n_parts_overwrite)) {
+       int loadresult = llama_model_load(api_params.model, api_model, api_vocab, api_params.n_ctx, GGML_TYPE_F16, n_parts_overwrite);
+       if (!loadresult) {
           fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, api_params.model.c_str());
           return false;
       }
+       legacy_format = (loadresult==2?true:false);
+       if(legacy_format)
+       {
+           printf("\n---\nWarning: Your model is using an OUTDATED format. Please reconvert it for better results!\n");
+       }
 
       return true;
   }
@@ -103,8 +111,20 @@ extern "C" {
           api_params.prompt.insert(0, 1, ' ');
       }
       // tokenize the prompt
-      std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(api_vocab, api_params.prompt, true);
-      api_params.n_predict = std::min(api_params.n_predict, api_model.hparams.n_ctx - (int)embd_inp.size());
+      std::vector<gpt_vocab::id> embd_inp;
+      if(legacy_format)
+      {
+          embd_inp = ::legacy_llama_tokenize(api_vocab, api_params.prompt, true);
+      }else{
+          embd_inp = ::llama_tokenize(api_vocab, api_params.prompt, true);
+      }
+
+      //api_params.n_predict = std::min(api_params.n_predict, api_model.hparams.n_ctx - (int)embd_inp.size());
+      //truncate to front of the prompt if its too long
+      if (embd_inp.size() + api_params.n_predict > api_model.hparams.n_ctx) {
+          int offset = embd_inp.size() - api_model.hparams.n_ctx + api_params.n_predict;
+          embd_inp = std::vector<gpt_vocab::id>(embd_inp.begin() + offset, embd_inp.end());
+      }
       std::vector<gpt_vocab::id> embd;
 
       int last_n_size = api_params.repeat_last_n;
@@ -131,6 +151,7 @@ extern "C" {
       std::mt19937 api_rng(api_params.seed);
       std::string concat_output = "";
 
+      printf("\nProcessing: ");
       while (remaining_tokens > 0)
       {
           gpt_vocab::id id = 0;
@@ -141,6 +162,7 @@ extern "C" {
           // std::cout << i << ',';
           // }
           //printf("\nnp:%d embd:%d mem:%d",api_n_past,embd.size(),mem_per_token);
+          printf("|");
           if (!llama_eval(api_model, api_params.n_threads, api_n_past, embd, api_logits, mem_per_token))
           {
               fprintf(stderr, "Failed to predict\n");
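The context-length fix above drops the old clamp on n_predict and instead truncates the prompt from the front: when the tokenized prompt plus the requested generation budget would exceed the model's context window, the oldest tokens are discarded so the most recent ones are kept. A minimal self-contained sketch of that arithmetic (truncate_front and the plain int token type are placeholders, not names from the patch):

    #include <cassert>
    #include <vector>

    // Keep only the newest tokens so that prompt + n_predict fits inside n_ctx.
    // Hypothetical helper mirroring the truncation added to expose.cpp above.
    static std::vector<int> truncate_front(std::vector<int> tokens, int n_ctx, int n_predict) {
        if ((int)tokens.size() + n_predict > n_ctx) {
            int offset = (int)tokens.size() - n_ctx + n_predict; // number of oldest tokens to drop
            tokens.erase(tokens.begin(), tokens.begin() + offset);
        }
        return tokens;
    }

    int main() {
        std::vector<int> prompt(600);                          // 600 prompt tokens
        std::vector<int> kept = truncate_front(prompt, 512, 128);
        assert(kept.size() == 384);                            // only 512 - 128 of them survive
        return 0;
    }

With a 512-token context and a 128-token generation budget, for example, at most 384 prompt tokens ever reach the model.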

extra.cpp (+72, new file)

@@ -0,0 +1,72 @@
+
+#include "extra.h"
+
+#include <cassert>
+#include <cstring>
+#include <fstream>
+#include <regex>
+#include <iostream>
+#include <iterator>
+#include <queue>
+#include <string>
+#include <math.h>
+
+#if defined(_MSC_VER) || defined(__MINGW32__)
+#include <malloc.h> // using malloc.h with MSC/MINGW
+#elif !defined(__FreeBSD__) && !defined(__NetBSD__)
+#include <alloca.h>
+#endif
+
+// TODO: Calculate this constant from the vocabulary
+#define MAX_TOKEN_LEN 18
+// SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece
+std::vector<gpt_vocab::id> legacy_llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos) {
+    std::vector<gpt_vocab::id> res;
+    std::vector<int> score;
+    std::vector<gpt_vocab::id> prev;
+    int len = text.length();
+
+    score.resize(len + 1);
+    prev.resize(len + 1);
+
+    // Forward pass
+    for (int i = 0; i < len; i++) {
+        int max_len = std::min(len - i, MAX_TOKEN_LEN);
+        for (int sub_len = 1; sub_len <= max_len; sub_len++) {
+            auto sub = text.substr(i, sub_len);
+            auto token = vocab.token_to_id.find(sub);
+            if (token != vocab.token_to_id.end()) {
+                int token_score = sub.length() * sub.length();
+                int local_score = score[i] + token_score;
+                int next = i + sub_len;
+                if (score[next] < local_score) {
+                    score[next] = local_score;
+                    prev[next] = (*token).second;
+                }
+            }
+        }
+    }
+
+    // Backward pass
+    int i = len;
+    while (i > 0) {
+        gpt_vocab::id token_id = prev[i];
+        if (token_id == 0) {
+            // TODO: Return error or something more meaningful
+            printf("failed to tokenize string!\n");
+            break;
+        }
+        res.push_back(token_id);
+        auto token = (*vocab.id_to_token.find(token_id)).second;
+        i -= token.length();
+    }
+
+    if (bos) {
+        res.push_back(1); // TODO: replace with vocab.bos
+    }
+
+    // Pieces are in reverse order so correct that
+    std::reverse(res.begin(), res.end());
+
+    return res;
+}
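legacy_llama_tokenize scores every vocabulary piece matching at a given position by the square of its length, keeps the best-scoring path to each position in a forward pass, then walks the chosen pieces backwards (the approach described in the linked SentencePiece write-up). A toy, self-contained version with a hypothetical std::map vocabulary standing in for gpt_vocab shows why the squared score prefers one long piece over several short ones:

    #include <algorithm>
    #include <iostream>
    #include <map>
    #include <string>
    #include <vector>

    // Toy re-implementation of the forward/backward pass in legacy_llama_tokenize,
    // using a hypothetical std::map vocabulary instead of gpt_vocab.
    static std::vector<int> toy_tokenize(const std::map<std::string, int> & vocab, const std::string & text) {
        const int len = (int)text.length();
        std::vector<int> score(len + 1, 0);   // best score of any segmentation ending at position i
        std::vector<int> prev(len + 1, 0);    // token id that produced that best score

        // Forward pass: longer matches score length^2, so "there" beats "the" + "re".
        for (int i = 0; i < len; i++) {
            for (int sub_len = 1; i + sub_len <= len; sub_len++) {
                auto it = vocab.find(text.substr(i, sub_len));
                if (it == vocab.end()) continue;
                int local_score = score[i] + sub_len * sub_len;
                if (score[i + sub_len] < local_score) {
                    score[i + sub_len] = local_score;
                    prev[i + sub_len]  = it->second;
                }
            }
        }

        // Backward pass: walk the chosen pieces from the end of the string.
        std::map<int, std::string> id_to_token;
        for (const auto & kv : vocab) id_to_token[kv.second] = kv.first;
        std::vector<int> res;
        int i = len;
        while (i > 0) {
            int id = prev[i];
            if (id == 0) break;               // no valid segmentation reaches this point
            res.push_back(id);
            i -= (int)id_to_token[id].length();
        }
        std::reverse(res.begin(), res.end());
        return res;
    }

    int main() {
        // Hypothetical vocabulary; ids start at 1 because 0 marks "no token".
        std::map<std::string, int> vocab = {{"t", 1}, {"h", 2}, {"e", 3}, {"the", 4}, {"re", 5}, {"there", 6}};
        for (int id : toy_tokenize(vocab, "there")) std::cout << id << ' ';   // prints "6"
        std::cout << '\n';
        return 0;
    }

Here "there" scores 25, while "the" + "re" only reaches 9 + 4 = 13, so the single id 6 is emitted.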

extra.h (+14, new file)

@@ -0,0 +1,14 @@
+#include "utils.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <string>
+#include <vector>
+
+std::vector<gpt_vocab::id> legacy_llama_tokenize(const gpt_vocab & vocab, const std::string & text, bool bos);

llama_for_kobold.py (+3 -2)

@@ -74,8 +74,8 @@ def generate(prompt,max_length=20,temperature=0.8,top_k=100,top_p=0.85,rep_pen=1
 # global vars
 global friendlymodelname
 friendlymodelname = ""
-maxctx = 1024
-maxlen = 256
+maxctx = 512
+maxlen = 128
 modelbusy = False
 port = 5001
 last_context = ""
@@ -151,6 +151,7 @@ def do_POST(self):
        if last_context!="" and newprompt.startswith(last_context):
            fresh_state = False
            newprompt = newprompt[len(last_context):]
+            print("Resuming state, new input len: " + str(len(newprompt)))
        #print("trimmed: " + newprompt)
        recvtxt = generate(
            prompt=newprompt,
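The do_POST change reuses the previous request's state: when the incoming prompt starts with the context that was already processed (last_context), only the unseen suffix is handed to generate. The same prefix check, sketched in C++ purely for illustration (trim_seen_prefix is a hypothetical name; the real logic lives in the Python server):

    #include <iostream>
    #include <string>

    // If the incoming prompt extends the previously processed context, return only
    // the unseen suffix and clear fresh_state so the caller resumes instead of resetting.
    static std::string trim_seen_prefix(const std::string & new_prompt,
                                        const std::string & last_context,
                                        bool & fresh_state) {
        fresh_state = true;
        if (!last_context.empty() && new_prompt.rfind(last_context, 0) == 0) {
            fresh_state = false;                           // resume from the previous state
            return new_prompt.substr(last_context.size());
        }
        return new_prompt;
    }

    int main() {
        bool fresh = true;
        std::string suffix = trim_seen_prefix("Once upon a time, there was", "Once upon a time,", fresh);
        std::cout << (fresh ? "fresh" : "resume") << ": '" << suffix << "'\n";  // resume: ' there was'
        return 0;
    }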

llamacpp.dll (5.67 KB)

Binary file not shown.

main.cpp (+13 -5)

@@ -89,8 +89,8 @@ struct llama_model {
     std::map<std::string, struct ggml_tensor *> tensors;
 };
 
-// load the model's weights from a file
-bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, ggml_type memory_type = GGML_TYPE_F32, int n_parts_overwrite=-1) {
+// load the model's weights from a file. return val: 0=fail, 1=newformat, 2=legacy
+int llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, ggml_type memory_type = GGML_TYPE_F32, int n_parts_overwrite=-1) {
     fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     std::vector<char> f_buf(1024*1024);
@@ -102,15 +102,18 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
         return false;
     }
 
+    bool legacy_file_format = false;
     // verify magic
     {
         uint32_t magic;
         fin.read((char *) &magic, sizeof(magic));
         if (magic == 0x67676d6c) {
            fprintf(stderr, "%s: invalid model file '%s' (too old, regenerate your model files!)\n",
                    __func__, fname.c_str());
-            return false;
+            legacy_file_format = true;
        }
+        else
+        {
        if (magic != 0x67676d66) {
            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
            return false;
@@ -124,6 +127,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
                    __func__, fname.c_str(), format_version);
            return false;
        }
+        }
     }
 
     int n_ff = 0;
@@ -173,12 +177,16 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
            word.resize(len);
            fin.read((char *) word.data(), len);
 
+            if(!legacy_file_format)
+            {
            float score;
            fin.read((char *) &score, sizeof(score));
+            vocab.score[i] = score;
+            }
 
            vocab.token_to_id[word] = i;
            vocab.id_to_token[i] = word;
-            vocab.score[i] = score;
+
 
            //if (i < 30000) {
            //    fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
@@ -531,7 +539,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
        fin.close();
    }
 
-    return true;
+    return (legacy_file_format?2:true);
 }
 
 // evaluate the transformer
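llama_model_load now returns an int instead of a bool: 0 for failure, 1 (true) for the current format, and 2 for the legacy format, decided by the leading magic value 0x67676d6c ("ggml", the old unversioned files) versus 0x67676d66 ("ggmf", the newer files that carry a version field and per-token scores). A standalone sketch of just that classification step (classify_model_format is a hypothetical helper, not part of the repository):

    #include <cstdint>
    #include <cstdio>
    #include <fstream>
    #include <string>

    // Classify a model file by its leading magic value, mirroring the check in
    // llama_model_load. Return codes follow the patch: 0 = unknown, 1 = current, 2 = legacy.
    static int classify_model_format(const std::string & fname) {
        std::ifstream fin(fname, std::ios::binary);
        if (!fin) return 0;
        uint32_t magic = 0;
        fin.read(reinterpret_cast<char *>(&magic), sizeof(magic));
        if (magic == 0x67676d6c) return 2;   // 'ggml' - unversioned legacy format
        if (magic == 0x67676d66) return 1;   // 'ggmf' - current format with a version field
        return 0;                            // bad magic
    }

    int main(int argc, char ** argv) {
        if (argc < 2) { std::fprintf(stderr, "usage: %s model.bin\n", argv[0]); return 1; }
        std::printf("format code: %d\n", classify_model_format(argv[1]));
        return 0;
    }

This is why expose.cpp above can set legacy_format from the load result and route prompts through legacy_llama_tokenize when needed.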

main.exe (5.94 KB)

Binary file not shown.
