From 9f43f347283fffcea0fb0d117653701dc0cda985 Mon Sep 17 00:00:00 2001
From: lr1729
Date: Sun, 11 Aug 2024 14:15:04 -0700
Subject: [PATCH] support compiling with Intel MKL via LLAMA_MKL=1

---
Review notes (ignored by git am):
 * Makefile: MKL_BUILD on Windows is only defined when LLAMA_MKL is set, the
   same way the non-Windows branch works. koboldcpp_mkl is part of the
   default goal, so without the guard every Windows build would try to link
   -lmkl_rt even when MKL is not installed.
 * koboldcpp.py, init_library: the --usemkl branch no longer requires
   libopenblas.dll and no longer tells the user that libopenblas is needed.
 * koboldcpp.py, export_vars: the GUI writes args.usemkl (the name argparse
   and init_library use) instead of the never-read args.use_mkl.
 * koboldcpp.py, import_vars: the MKL entry is selected only when mkl_option
   is available, instead of testing default_option.

 Makefile     | 34 +++++++++++++++++++++++++++++++---
 koboldcpp.py | 21 +++++++++++++++++++--
 2 files changed, 50 insertions(+), 5 deletions(-)

diff --git a/Makefile b/Makefile
index 00d7bbbba9133..1cb8f18bbe53b 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 # Add custom options to Makefile.local rather than editing this file.
 -include $(abspath $(lastword ${MAKEFILE_LIST})).local
 
-default: koboldcpp_default koboldcpp_failsafe koboldcpp_openblas koboldcpp_noavx2 koboldcpp_clblast koboldcpp_clblast_noavx2 koboldcpp_cublas koboldcpp_hipblas koboldcpp_vulkan koboldcpp_vulkan_noavx2
+default: koboldcpp_default koboldcpp_failsafe koboldcpp_openblas koboldcpp_mkl koboldcpp_noavx2 koboldcpp_clblast koboldcpp_clblast_noavx2 koboldcpp_cublas koboldcpp_hipblas koboldcpp_vulkan koboldcpp_vulkan_noavx2
 tools: quantize_gpt2 quantize_gptj quantize_gguf quantize_neox quantize_mpt quantize_clip whispermain sdmain gguf-split
 dev: koboldcpp_openblas
 dev2: koboldcpp_clblast
@@ -19,10 +19,11 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif
 
+ifndef LLAMA_MKL
 ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null),)
-ARCH_ADD = -lcblas
+    ARCH_ADD = -lcblas
+endif
 endif
-
 
 # Mac OS + Arm can report x86_64
 # ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
@@ -54,6 +55,7 @@ FULLCFLAGS =
 NONECFLAGS =
 
 OPENBLAS_FLAGS = -DGGML_USE_OPENBLAS -DGGML_USE_BLAS -I/usr/local/include/openblas
+MKL_FLAGS = -DGGML_USE_MKL -DGGML_USE_BLAS -I/opt/intel/oneapi/mkl/latest/include
 CLBLAST_FLAGS = -DGGML_USE_CLBLAST
 FAILSAFE_FLAGS = -DUSE_FAILSAFE
 VULKAN_FLAGS = -DGGML_USE_VULKAN -DSD_USE_VULKAN
@@ -329,6 +331,9 @@ ifeq ($(OS),Windows_NT)
 	DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
 	FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
 	OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o $@.dll $(LDFLAGS)
+	ifdef LLAMA_MKL
+	MKL_BUILD = $(CXX) $(CXXFLAGS) $^ -lmkl_rt -shared -o $@.dll $(LDFLAGS)
+	endif
 	NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o $@.dll $(LDFLAGS)
 	CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o $@.dll $(LDFLAGS)
 	VULKAN_BUILD = $(CXX) $(CXXFLAGS) $^ lib/vulkan-1.lib -shared -o $@.dll $(LDFLAGS)
@@ -349,6 +354,11 @@ else
 	ifdef LLAMA_OPENBLAS
 	OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
 	endif
+
+	ifdef LLAMA_MKL
+	MKL_BUILD = $(CXX) $(CXXFLAGS) $^ -lmkl_rt -shared -o $@.so $(LDFLAGS)
+	endif
+
 	ifdef LLAMA_CLBLAST
 		ifeq ($(UNAME_S),Darwin)
 			CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o $@.so $(LDFLAGS)
@@ -369,6 +379,7 @@ else
 	ifndef LLAMA_OPENBLAS
 	ifndef LLAMA_CLBLAST
 	ifndef LLAMA_CUBLAS
+	ifndef LLAMA_MKL
 	ifndef LLAMA_HIPBLAS
 	ifndef LLAMA_VULKAN
 	OPENBLAS_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. For faster speeds, install and link a BLAS library. Set LLAMA_OPENBLAS=1 to compile with OpenBLAS support or LLAMA_CLBLAST=1 to compile with ClBlast support. This is just a reminder, not an error.'
@@ -377,6 +388,7 @@ else
 	endif
 	endif
 	endif
+	endif
 endif
 
 CCV := $(shell $(CC) --version | head -n 1)
@@ -405,6 +417,8 @@ ggml.o: ggml/src/ggml.c ggml/include/ggml.h
 	$(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@
 ggml_v4_openblas.o: ggml/src/ggml.c ggml/include/ggml.h
 	$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
+ggml_v4_mkl.o: ggml/src/ggml.c ggml/include/ggml.h
+	$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(MKL_FLAGS) -c $< -o $@
 ggml_v4_failsafe.o: ggml/src/ggml.c ggml/include/ggml.h
 	$(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@
 ggml_v4_noavx2.o: ggml/src/ggml.c ggml/include/ggml.h
@@ -469,6 +483,8 @@ ggml_v3.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
 	$(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@
 ggml_v3_openblas.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
 	$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
+ggml_v3_mkl.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
+	$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(MKL_FLAGS) -c $< -o $@
 ggml_v3_failsafe.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
 	$(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@
 ggml_v3_noavx2.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
@@ -485,6 +501,8 @@ ggml_v2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
 	$(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@
 ggml_v2_openblas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
 	$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
+ggml_v2_mkl.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
+	$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(MKL_FLAGS) -c $< -o $@
 ggml_v2_failsafe.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
 	$(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@
 ggml_v2_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
@@ -553,6 +571,8 @@ gpttype_adapter.o: $(GPTTYPE_ADAPTER)
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 gpttype_adapter_openblas.o: $(GPTTYPE_ADAPTER)
 	$(CXX) $(CXXFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
+gpttype_adapter_mkl.o: $(GPTTYPE_ADAPTER)
+	$(CXX) $(CXXFLAGS) $(MKL_FLAGS) -c $< -o $@
 gpttype_adapter_clblast.o: $(GPTTYPE_ADAPTER)
 	$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
 gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER)
@@ -602,6 +622,14 @@ koboldcpp_openblas:
 	$(DONOTHING)
 endif
 
+ifdef MKL_BUILD
+koboldcpp_mkl: ggml_v4_mkl.o ggml_v3_mkl.o ggml_v2_mkl.o ggml_v1.o expose.o gpttype_adapter_mkl.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-blas.o $(OBJS_FULL) $(OBJS)
+	$(MKL_BUILD)
+else
+koboldcpp_mkl:
+	$(DONOTHING)
+endif
+
 ifdef FAILSAFE_BUILD
 koboldcpp_failsafe: ggml_v4_failsafe.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FAILSAFE) $(OBJS)
 	$(FAILSAFE_BUILD)
diff --git a/koboldcpp.py b/koboldcpp.py
index 0fd17b7f4b5e0..9187ae4d6faa0 100644
--- a/koboldcpp.py
+++ b/koboldcpp.py
@@ -291,6 +291,7 @@ def pick_existant_file(ntoption,nonntoption):
 lib_default = pick_existant_file("koboldcpp_default.dll","koboldcpp_default.so")
 lib_failsafe = pick_existant_file("koboldcpp_failsafe.dll","koboldcpp_failsafe.so")
 lib_openblas = pick_existant_file("koboldcpp_openblas.dll","koboldcpp_openblas.so")
+lib_mkl = pick_existant_file("koboldcpp_mkl.dll","koboldcpp_mkl.so")
 lib_noavx2 = pick_existant_file("koboldcpp_noavx2.dll","koboldcpp_noavx2.so")
 lib_clblast = pick_existant_file("koboldcpp_clblast.dll","koboldcpp_clblast.so")
 lib_clblast_noavx2 = pick_existant_file("koboldcpp_clblast_noavx2.dll","koboldcpp_clblast_noavx2.so")
@@ -299,6 +300,7 @@ def pick_existant_file(ntoption,nonntoption):
 lib_vulkan = pick_existant_file("koboldcpp_vulkan.dll","koboldcpp_vulkan.so")
 lib_vulkan_noavx2 = pick_existant_file("koboldcpp_vulkan_noavx2.dll","koboldcpp_vulkan_noavx2.so")
 libname = ""
+
 lib_option_pairs = [
     (lib_openblas, "Use OpenBLAS"),
     (lib_default, "Use No BLAS"),
@@ -309,8 +311,9 @@ def pick_existant_file(ntoption,nonntoption):
     (lib_noavx2, "NoAVX2 Mode (Old CPU)"),
     (lib_clblast_noavx2, "CLBlast NoAVX2 (Old CPU)"),
     (lib_vulkan_noavx2, "Vulkan NoAVX2 (Old CPU)"),
+    (lib_mkl, "Use Intel MKL"),
     (lib_failsafe, "Failsafe Mode (Old CPU)")]
-openblas_option, default_option, clblast_option, cublas_option, hipblas_option, vulkan_option, noavx2_option, clblast_noavx2_option, vulkan_noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs)
+openblas_option, default_option, clblast_option, cublas_option, hipblas_option, vulkan_option, noavx2_option, clblast_noavx2_option, vulkan_noavx2_option, mkl_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs)
 runopts = [opt for lib, opt in lib_option_pairs if file_exists(lib)]
 
 def init_library():
@@ -322,6 +325,7 @@ def init_library():
     use_clblast = False #uses CLBlast instead
     use_cublas = False #uses cublas instead
     use_hipblas = False #uses hipblas instead
+    use_mkl = False #uses intel mkl
     use_noavx2 = False #uses no avx2 instructions
     use_failsafe = False #uses no intrinsics, failsafe mode
     use_vulkan = False #uses vulkan (needs avx2)
@@ -370,6 +374,12 @@ def init_library():
             else:
                 print("Attempting to use CLBlast library for faster prompt ingestion. A compatible clblast will be required.")
                 use_clblast = True
+        elif args.usemkl:
+            if not file_exists(lib_mkl):
+                print("Warning: Intel MKL library file not found. Non-BLAS library will be used.")
+            else:
+                use_mkl = True
+                print("Attempting to use Intel MKL library for faster prompt ingestion. A compatible Intel MKL runtime (mkl_rt) will be required.")
         else:
             if not file_exists(lib_openblas) or (os.name=='nt' and not file_exists("libopenblas.dll")):
                 print("Warning: OpenBLAS library file not found. Non-BLAS library will be used.")
@@ -397,6 +407,8 @@ def init_library():
             libname = lib_cublas
         elif use_hipblas:
             libname = lib_hipblas
+        elif use_mkl:
+            libname = lib_mkl
         elif use_openblas:
             libname = lib_openblas
         elif use_vulkan:
@@ -2966,6 +2978,7 @@ def export_vars():
         args.nommap = disablemmap.get()==1
         args.smartcontext = smartcontext.get()==1
         args.flashattention = flashattention.get()==1
+        args.usemkl = runopts_var.get() == "Use Intel MKL"
         args.noshift = contextshift.get()==0
         args.remotetunnel = remotetunnel.get()==1
         args.foreground = keepforeground.get()==1
@@ -3157,6 +3170,9 @@ def import_vars(dict):
         elif "noavx2" in dict and dict["noavx2"]:
             if noavx2_option is not None:
                 runopts_var.set(noavx2_option)
+        elif "usemkl" in dict and dict["usemkl"]:
+            if mkl_option is not None:
+                runopts_var.set(mkl_option)
         elif "noblas" in dict and dict["noblas"]:
             if default_option is not None:
                 runopts_var.set(default_option)
@@ -4329,7 +4345,6 @@ def start_in_seperate_process(launch_args):
     return (output_queue, input_queue, p)
 
 if __name__ == '__main__':
-
     def check_range(value_type, min_value, max_value):
         def range_checker(arg: str):
             try:
@@ -4357,7 +4372,9 @@ def range_checker(arg: str):
     compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq] [rowsplit]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'mmq', 'rowsplit'])
     compatgroup.add_argument("--usevulkan", help="Use Vulkan for GPU Acceleration. Can optionally specify GPU Device ID (e.g. --usevulkan 0).", metavar=('[Device ID]'), nargs='*', type=int, default=None)
     compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
+    compatgroup.add_argument("--usemkl", help="Use Intel MKL for BLAS acceleration.", action='store_true')
     compatgroup.add_argument("--noblas", help="Do not use any accelerated prompt ingestion", action='store_true')
     parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 4096). Supported values are [256,512,1024,2048,3072,4096,6144,8192,12288,16384,24576,32768,49152,65536,98304,131072]. IF YOU USE ANYTHING ELSE YOU ARE ON YOUR OWN.",metavar=('[256,512,1024,2048,3072,4096,6144,8192,12288,16384,24576,32768,49152,65536,98304,131072]'), type=check_range(int,256,262144), default=4096)
+
     parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU. Set to -1 to try autodetect (experimental)",metavar=('[GPU layers]'), nargs='?', const=1, type=int, default=0)
     parser.add_argument("--tensor_split", help="For CUDA and Vulkan only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+')