Merge IK main 2023-08-12
Nexesenex committed Aug 12, 2024
1 parent 6e02327 commit ed8f227
Showing 43 changed files with 13,666 additions and 803 deletions.
2 changes: 1 addition & 1 deletion .devops/llama-server.Dockerfile
@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
FROM ubuntu:$UBUNTU_VERSION AS build

RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev
+    apt-get install -y build-essential git libcurl4-openssl-dev curl

WORKDIR /app

4 changes: 4 additions & 0 deletions CMakeLists.txt
@@ -7,6 +7,10 @@ set(CMAKE_WARN_UNUSED_CLI YES)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

+set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_INCLUDES 0)
+set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_LIBRARIES 0)
+set(CMAKE_CUDA_USE_RESPONSE_FILE_FOR_OBJECTS 0)
+
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
    set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
    set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
1 change: 1 addition & 0 deletions LICENSE
@@ -1,6 +1,7 @@
MIT License

Copyright (c) 2023-2024 The ggml authors
+Copyright (c) 2024 Iwan Kawrakow

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
28 changes: 24 additions & 4 deletions Makefile
@@ -249,7 +249,7 @@ endif
# keep standard at C11 and C++11
MK_CPPFLAGS = -Iggml/include -Iggml/src -Iinclude -Isrc -Icommon
MK_CFLAGS = -std=c11 -fPIC
-MK_CXXFLAGS = -std=c++11 -fPIC
+MK_CXXFLAGS = -std=c++17 -fPIC
MK_NVCCFLAGS = -std=c++11

ifdef LLAMA_NO_CCACHE
@@ -326,9 +326,9 @@ ifdef LLAMA_DEBUG
endif
else
MK_CPPFLAGS += -DNDEBUG
-MK_CFLAGS += -O3 -g
-MK_CXXFLAGS += -O3 -g
-MK_NVCCFLAGS += -O3 -g
+MK_CFLAGS += -O3
+MK_CXXFLAGS += -O3
+MK_NVCCFLAGS += -O3
endif

ifdef LLAMA_SANITIZE_THREAD
@@ -572,6 +572,12 @@ ifdef GGML_NVPL
OBJ_GGML += ggml/src/ggml-blas.o
endif # GGML_NVPL

+OBJ_GGML += ggml/src/iqk/iqk_quantize.o
+ifndef GGML_NO_IQKMULMAT
+MK_CPPFLAGS += -DGGML_USE_IQK_MULMAT
+OBJ_GGML += ggml/src/iqk/iqk_mul_mat.o
+endif

ifndef GGML_NO_LLAMAFILE
MK_CPPFLAGS += -DGGML_USE_LLAMAFILE
OBJ_GGML += ggml/src/llamafile/sgemm.o
@@ -1058,6 +1064,20 @@ ggml/src/ggml-blas.o: \
ggml/include/ggml-blas.h
$(CXX) $(CXXFLAGS) -c $< -o $@

+ggml/src/iqk/iqk_quantize.o: \
+	ggml/src/iqk/iqk_quantize.cpp \
+	ggml/src/iqk/iqk_quantize.h \
+	ggml/src/ggml-quants.h ggml/src/ggml-common.h ggml/include/ggml.h ggml/src/ggml-impl.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+ifndef GGML_NO_IQKMULMAT
+ggml/src/iqk/iqk_mul_mat.o: \
+	ggml/src/iqk/iqk_mul_mat.cpp \
+	ggml/src/iqk/iqk_mul_mat.h \
+	ggml/src/iqk/iqk_quantize.h \
+	ggml/src/ggml-quants.h ggml/src/ggml-common.h ggml/include/ggml.h ggml/src/ggml-impl.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+endif # GGML_NO_IQKMULMAT

ifndef GGML_NO_LLAMAFILE
ggml/src/llamafile/sgemm.o: \
ggml/src/llamafile/sgemm.cpp \
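Note: with the Makefile hunks above, ggml/src/iqk/iqk_quantize.o is always built into OBJ_GGML, while iqk_mul_mat.o and the -DGGML_USE_IQK_MULMAT define are skipped only when GGML_NO_IQKMULMAT is set. So a plain "make" enables the iqk matrix-multiplication path, and, by analogy with the existing GGML_NO_LLAMAFILE toggle, something like "make GGML_NO_IQKMULMAT=1" should opt out (an assumption; only the ifndef guard is visible in this diff).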
708 changes: 247 additions & 461 deletions README.md

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions common/common.cpp
@@ -1296,6 +1296,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
        params.process_output = true;
        return true;
    }
+    if (arg == "--output-tensor-name") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.output_tensor_name = argv[i];
+        return true;
+    }
    if (arg == "--no-ppl") {
        params.compute_ppl = false;
        return true;
1 change: 1 addition & 0 deletions common/common.h
@@ -246,6 +246,7 @@ struct gpt_params {

// imatrix params
std::string out_file = "imatrix.dat"; // save the resulting imatrix to this file
+    std::string output_tensor_name = "output.weight"; // name of the output tensor

int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
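Note: together with the common.cpp hunk above, this adds an --output-tensor-name option whose value lands in gpt_params::output_tensor_name, defaulting to "output.weight". Presumably this lets the imatrix tool locate the model's output tensor by name instead of hard-coding it; that use is an inference, since the imatrix changes themselves are not rendered here.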
49 changes: 33 additions & 16 deletions convert_hf_to_gguf.py
@@ -1617,23 +1617,40 @@ def weight_quant(self, weight):
return weight.type(dtype), scale.type(torch.float32)

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        new_name = self.map_tensor_name(name)
-
-        if any(self.match_model_tensor_name(new_name, key, bid) for key in [
-            gguf.MODEL_TENSOR.ATTN_Q,
-            gguf.MODEL_TENSOR.ATTN_K,
-            gguf.MODEL_TENSOR.ATTN_V,
-            gguf.MODEL_TENSOR.ATTN_OUT,
-            gguf.MODEL_TENSOR.FFN_UP,
-            gguf.MODEL_TENSOR.FFN_DOWN,
-            gguf.MODEL_TENSOR.FFN_GATE,
-        ]):
-            # transform weight into 1/0/-1 (in fp32)
+        # transform weight into 1/0/-1 (in fp32)
+        if name.endswith(("q_proj.weight", "k_proj.weight", "v_proj.weight",
+                          "down_proj.weight", "up_proj.weight", "gate_proj.weight",
+                          "o_proj.weight")):
            weight_torch, scale_torch = self.weight_quant(data_torch)
-            yield (new_name, weight_torch)
-            yield (new_name.removesuffix(".weight") + ".scale", scale_torch)
-        else:
-            yield (new_name, data_torch)
+
+        tensors: list[tuple[str, Tensor]] = []
+
+        if name.endswith("q_proj.weight"):
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), weight_torch))
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid, suffix=".scale"), scale_torch))
+        elif name.endswith("k_proj.weight"):
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), weight_torch))
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid, suffix=".scale"), scale_torch))
+        elif name.endswith("v_proj.weight"):
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid), weight_torch))
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_V, bid, suffix=".scale"), scale_torch))
+        elif name.endswith("o_proj.weight"):
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), weight_torch))
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid, suffix=".scale"), scale_torch))
+        elif name.endswith("up_proj.weight"):
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid), weight_torch))
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_UP, bid, suffix=".scale"), scale_torch))
+        elif name.endswith("down_proj.weight"):
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid), weight_torch))
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_DOWN, bid, suffix=".scale"), scale_torch))
+        elif name.endswith("gate_proj.weight"):
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), weight_torch))
+            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid, suffix=".scale"), scale_torch))
+
+        if len(tensors) == 0:
+            tensors.append((self.map_tensor_name(name), data_torch))
+
+        return tensors


@Model.register("GrokForCausalLM")
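Note: weight_quant (its tail is visible at the top of this hunk) returns a ternarized weight plus an fp32 scale, which modify_tensors then emits as paired .weight/.scale tensors. A minimal sketch of such a 1/0/-1 transform, assuming BitNet-style mean-absolute scaling (an assumption; the full body of weight_quant is not shown in this diff):

import torch

def ternarize(weight: torch.Tensor, eps: float = 1e-5):
    # Round a weight matrix to {-1, 0, +1} plus one fp32 scale per tensor.
    # Assumption: mean-abs scaling in the style of BitNet b1.58; the real
    # weight_quant in convert_hf_to_gguf.py may differ in detail.
    scale = weight.abs().mean().clamp(min=eps)     # per-tensor fp32 scale
    quant = (weight / scale).round().clamp(-1, 1)  # entries in {-1, 0, 1}
    return quant.to(torch.float32), scale.reshape(1).to(torch.float32)

# usage: q holds the ternary values; q * s approximately reconstructs the weight
q, s = ternarize(torch.randn(4, 4))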