Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support compiling with intel MKL with LLAMA_MKL=1 #1063

Open
wants to merge 1 commit into
base: concedo_experimental
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 29 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Add custom options to Makefile.local rather than editing this file.
-include $(abspath $(lastword ${MAKEFILE_LIST})).local

# Default goal builds every backend variant; prerequisite-only rules for the
# same target merge in GNU make, so keep a single `default:` line (the diff
# render had left both the old and the new line in place, duplicating most
# prerequisites). The updated list adds the new koboldcpp_mkl variant.
default: koboldcpp_default koboldcpp_failsafe koboldcpp_openblas koboldcpp_mkl koboldcpp_noavx2 koboldcpp_clblast koboldcpp_clblast_noavx2 koboldcpp_cublas koboldcpp_hipblas koboldcpp_vulkan koboldcpp_vulkan_noavx2
tools: quantize_gpt2 quantize_gptj quantize_gguf quantize_neox quantize_mpt quantize_clip whispermain sdmain gguf-split
dev: koboldcpp_openblas
dev2: koboldcpp_clblast
Expand All @@ -19,10 +19,11 @@ ifndef UNAME_M
UNAME_M := $(shell uname -m)
endif

# On Arch-like distros, linking OpenBLAS additionally needs -lcblas.
# Skip this when building with LLAMA_MKL: the MKL link line uses mkl_rt,
# which supplies its own CBLAS interface, so -lcblas must not be added.
# (The diff render had duplicated the assignment; one copy suffices —
# assigning the same value twice is harmless but misleading.)
ifndef LLAMA_MKL
ifneq ($(shell grep -e "Arch Linux" -e "ID_LIKE=arch" /etc/os-release 2>/dev/null),)
ARCH_ADD = -lcblas
endif
endif


# Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
Expand Down Expand Up @@ -54,6 +55,7 @@ FULLCFLAGS =
NONECFLAGS =

# Per-backend preprocessor flags. Each BLAS-style backend turns on its own
# GGML macro plus the generic -DGGML_USE_BLAS switch and its include path.
OPENBLAS_FLAGS = -DGGML_USE_OPENBLAS -DGGML_USE_BLAS -I/usr/local/include/openblas
# Intel oneAPI MKL headers; hard-codes the default oneAPI install prefix —
# NOTE(review): users with a non-standard MKL location must override this.
MKL_FLAGS = -DGGML_USE_MKL -DGGML_USE_BLAS -I/opt/intel/oneapi/mkl/latest/include
CLBLAST_FLAGS = -DGGML_USE_CLBLAST
FAILSAFE_FLAGS = -DUSE_FAILSAFE
VULKAN_FLAGS = -DGGML_USE_VULKAN -DSD_USE_VULKAN
Expand Down Expand Up @@ -329,6 +331,7 @@ ifeq ($(OS),Windows_NT)
DEFAULT_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o [email protected] $(LDFLAGS)
FAILSAFE_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o [email protected] $(LDFLAGS)
OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ lib/libopenblas.lib -shared -o [email protected] $(LDFLAGS)
MKL_BUILD = $(CXX) $(CXXFLAGS) $^ -lmkl_rt -shared -o [email protected] $(LDFLAGS)
NOAVX2_BUILD = $(CXX) $(CXXFLAGS) $^ -shared -o [email protected] $(LDFLAGS)
CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ lib/OpenCL.lib lib/clblast.lib -shared -o [email protected] $(LDFLAGS)
VULKAN_BUILD = $(CXX) $(CXXFLAGS) $^ lib/vulkan-1.lib -shared -o [email protected] $(LDFLAGS)
Expand All @@ -349,6 +352,11 @@ else
ifdef LLAMA_OPENBLAS
OPENBLAS_BUILD = $(CXX) $(CXXFLAGS) $^ $(ARCH_ADD) -lopenblas -shared -o [email protected] $(LDFLAGS)
endif

# Non-Windows path: define the MKL link command only when LLAMA_MKL=1 is set,
# linking the oneAPI single-dynamic-library dispatcher (mkl_rt). Note this
# intentionally omits $(ARCH_ADD)/-lcblas (see the LLAMA_MKL guard above).
ifdef LLAMA_MKL
MKL_BUILD = $(CXX) $(CXXFLAGS) $^ -lmkl_rt -shared -o [email protected] $(LDFLAGS)
endif

ifdef LLAMA_CLBLAST
ifeq ($(UNAME_S),Darwin)
CLBLAST_BUILD = $(CXX) $(CXXFLAGS) $^ -lclblast -framework OpenCL $(ARCH_ADD) -lopenblas -shared -o [email protected] $(LDFLAGS)
Expand All @@ -369,6 +377,7 @@ else
ifndef LLAMA_OPENBLAS
ifndef LLAMA_CLBLAST
ifndef LLAMA_CUBLAS
ifndef LLAMA_MKL
ifndef LLAMA_HIPBLAS
ifndef LLAMA_VULKAN
OPENBLAS_BUILD = @echo 'Your OS $(OS) does not appear to be Windows. For faster speeds, install and link a BLAS library. Set LLAMA_OPENBLAS=1 to compile with OpenBLAS support or LLAMA_CLBLAST=1 to compile with ClBlast support. This is just a reminder, not an error.'
Expand All @@ -377,6 +386,7 @@ else
endif
endif
endif
endif
endif

CCV := $(shell $(CC) --version | head -n 1)
Expand Down Expand Up @@ -405,6 +415,8 @@ ggml.o: ggml/src/ggml.c ggml/include/ggml.h
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@
ggml_v4_openblas.o: ggml/src/ggml.c ggml/include/ggml.h
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
# Current-generation ggml compiled with MKL BLAS enabled (see MKL_FLAGS).
ggml_v4_mkl.o: ggml/src/ggml.c ggml/include/ggml.h
	$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(MKL_FLAGS) -c $< -o $@
ggml_v4_failsafe.o: ggml/src/ggml.c ggml/include/ggml.h
$(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@
ggml_v4_noavx2.o: ggml/src/ggml.c ggml/include/ggml.h
Expand Down Expand Up @@ -469,6 +481,8 @@ ggml_v3.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@
ggml_v3_openblas.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
# Legacy v3 ggml compiled with MKL BLAS enabled (see MKL_FLAGS).
ggml_v3_mkl.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
	$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(MKL_FLAGS) -c $< -o $@
ggml_v3_failsafe.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
$(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@
ggml_v3_noavx2.o: otherarch/ggml_v3.c otherarch/ggml_v3.h
Expand All @@ -485,6 +499,8 @@ ggml_v2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) -c $< -o $@
ggml_v2_openblas.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
# Legacy v2 ggml compiled with MKL BLAS enabled (see MKL_FLAGS).
ggml_v2_mkl.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
	$(CC) $(FASTCFLAGS) $(FULLCFLAGS) $(MKL_FLAGS) -c $< -o $@
ggml_v2_failsafe.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
$(CC) $(FASTCFLAGS) $(NONECFLAGS) -c $< -o $@
ggml_v2_noavx2.o: otherarch/ggml_v2.c otherarch/ggml_v2.h
Expand Down Expand Up @@ -553,6 +569,8 @@ gpttype_adapter.o: $(GPTTYPE_ADAPTER)
$(CXX) $(CXXFLAGS) -c $< -o $@
gpttype_adapter_openblas.o: $(GPTTYPE_ADAPTER)
$(CXX) $(CXXFLAGS) $(OPENBLAS_FLAGS) -c $< -o $@
# C++ adapter compiled with MKL flags so its GGML_USE_* macros match the
# MKL-built ggml objects it is linked with.
gpttype_adapter_mkl.o: $(GPTTYPE_ADAPTER)
	$(CXX) $(CXXFLAGS) $(MKL_FLAGS) -c $< -o $@
gpttype_adapter_clblast.o: $(GPTTYPE_ADAPTER)
$(CXX) $(CXXFLAGS) $(CLBLAST_FLAGS) -c $< -o $@
gpttype_adapter_cublas.o: $(GPTTYPE_ADAPTER)
Expand Down Expand Up @@ -602,6 +620,14 @@ koboldcpp_openblas:
$(DONOTHING)
endif

# Link the MKL-enabled shared library. MKL_BUILD is only defined on Windows
# or when LLAMA_MKL=1 was set; otherwise the target degrades to a no-op so
# that the aggregate `default` goal still succeeds without MKL installed
# (same pattern as the other optional backends in this file).
ifdef MKL_BUILD
koboldcpp_mkl: ggml_v4_mkl.o ggml_v3_mkl.o ggml_v2_mkl.o ggml_v1.o expose.o gpttype_adapter_mkl.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o ggml-blas.o $(OBJS_FULL) $(OBJS)
	$(MKL_BUILD)
else
koboldcpp_mkl:
	$(DONOTHING)
endif

ifdef FAILSAFE_BUILD
koboldcpp_failsafe: ggml_v4_failsafe.o ggml_v3_failsafe.o ggml_v2_failsafe.o ggml_v1_failsafe.o expose.o gpttype_adapter_failsafe.o sdcpp_default.o whispercpp_default.o llavaclip_default.o llava.o ggml-backend_default.o $(OBJS_FAILSAFE) $(OBJS)
$(FAILSAFE_BUILD)
Expand Down
21 changes: 19 additions & 2 deletions koboldcpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,7 @@ def pick_existant_file(ntoption,nonntoption):
lib_default = pick_existant_file("koboldcpp_default.dll","koboldcpp_default.so")
lib_failsafe = pick_existant_file("koboldcpp_failsafe.dll","koboldcpp_failsafe.so")
lib_openblas = pick_existant_file("koboldcpp_openblas.dll","koboldcpp_openblas.so")
lib_mkl = pick_existant_file("koboldcpp_mkl.dll","koboldcpp_mkl.so")
lib_noavx2 = pick_existant_file("koboldcpp_noavx2.dll","koboldcpp_noavx2.so")
lib_clblast = pick_existant_file("koboldcpp_clblast.dll","koboldcpp_clblast.so")
lib_clblast_noavx2 = pick_existant_file("koboldcpp_clblast_noavx2.dll","koboldcpp_clblast_noavx2.so")
Expand All @@ -299,6 +300,7 @@ def pick_existant_file(ntoption,nonntoption):
lib_vulkan = pick_existant_file("koboldcpp_vulkan.dll","koboldcpp_vulkan.so")
lib_vulkan_noavx2 = pick_existant_file("koboldcpp_vulkan_noavx2.dll","koboldcpp_vulkan_noavx2.so")
libname = ""

lib_option_pairs = [
(lib_openblas, "Use OpenBLAS"),
(lib_default, "Use No BLAS"),
Expand All @@ -309,8 +311,9 @@ def pick_existant_file(ntoption,nonntoption):
(lib_noavx2, "NoAVX2 Mode (Old CPU)"),
(lib_clblast_noavx2, "CLBlast NoAVX2 (Old CPU)"),
(lib_vulkan_noavx2, "Vulkan NoAVX2 (Old CPU)"),
(lib_mkl, "Use Intel MKL"),
(lib_failsafe, "Failsafe Mode (Old CPU)")]
openblas_option, default_option, clblast_option, cublas_option, hipblas_option, vulkan_option, noavx2_option, clblast_noavx2_option, vulkan_noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs)
openblas_option, default_option, clblast_option, cublas_option, hipblas_option, vulkan_option, noavx2_option, clblast_noavx2_option, vulkan_noavx2_option, mkl_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs)
runopts = [opt for lib, opt in lib_option_pairs if file_exists(lib)]

def init_library():
Expand All @@ -322,6 +325,7 @@ def init_library():
use_clblast = False #uses CLBlast instead
use_cublas = False #uses cublas instead
use_hipblas = False #uses hipblas instead
use_mkl = False #uses intel mkl
use_noavx2 = False #uses no avx2 instructions
use_failsafe = False #uses no intrinsics, failsafe mode
use_vulkan = False #uses vulkan (needs avx2)
Expand Down Expand Up @@ -370,6 +374,12 @@ def init_library():
else:
print("Attempting to use CLBlast library for faster prompt ingestion. A compatible clblast will be required.")
use_clblast = True
elif args.usemkl:
if not file_exists(lib_mkl) or (os.name=='nt' and not file_exists("libopenblas.dll")):
print("Warning: Intel MKL library file not found. Non-BLAS library will be used.")
else:
use_mkl = True
print("Attempting to use Intel MKL library for faster prompt ingestion. A compatible libopenblas will be required.")
else:
if not file_exists(lib_openblas) or (os.name=='nt' and not file_exists("libopenblas.dll")):
print("Warning: OpenBLAS library file not found. Non-BLAS library will be used.")
Expand Down Expand Up @@ -397,6 +407,8 @@ def init_library():
libname = lib_cublas
elif use_hipblas:
libname = lib_hipblas
elif use_mkl:
libname = lib_mkl
elif use_openblas:
libname = lib_openblas
elif use_vulkan:
Expand Down Expand Up @@ -2966,6 +2978,7 @@ def export_vars():
args.nommap = disablemmap.get()==1
args.smartcontext = smartcontext.get()==1
args.flashattention = flashattention.get()==1
args.use_mkl = runopts_var.get() == "Use Intel MKL"
args.noshift = contextshift.get()==0
args.remotetunnel = remotetunnel.get()==1
args.foreground = keepforeground.get()==1
Expand Down Expand Up @@ -3157,6 +3170,9 @@ def import_vars(dict):
elif "noavx2" in dict and dict["noavx2"]:
if noavx2_option is not None:
runopts_var.set(noavx2_option)
elif "usemkl" in dict and dict["usemkl"]:
if default_option is not None:
runopts_var.set("Use Intel MKL")
elif "noblas" in dict and dict["noblas"]:
if default_option is not None:
runopts_var.set(default_option)
Expand Down Expand Up @@ -4329,7 +4345,6 @@ def start_in_seperate_process(launch_args):
return (output_queue, input_queue, p)

if __name__ == '__main__':

def check_range(value_type, min_value, max_value):
def range_checker(arg: str):
try:
Expand Down Expand Up @@ -4357,8 +4372,10 @@ def range_checker(arg: str):
compatgroup.add_argument("--usecublas", help="Use CuBLAS for GPU Acceleration. Requires CUDA. Select lowvram to not allocate VRAM scratch buffer. Enter a number afterwards to select and use 1 GPU. Leaving no number will use all GPUs. For hipBLAS binaries, please check YellowRoseCx rocm fork.", nargs='*',metavar=('[lowvram|normal] [main GPU ID] [mmq] [rowsplit]'), choices=['normal', 'lowvram', '0', '1', '2', '3', 'mmq', 'rowsplit'])
compatgroup.add_argument("--usevulkan", help="Use Vulkan for GPU Acceleration. Can optionally specify GPU Device ID (e.g. --usevulkan 0).", metavar=('[Device ID]'), nargs='*', type=int, default=None)
compatgroup.add_argument("--useclblast", help="Use CLBlast for GPU Acceleration. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
compatgroup.add_argument("--usemkl", help="Use Intel MKL for BLAS acceleration.", action='store_true')
compatgroup.add_argument("--noblas", help="Do not use any accelerated prompt ingestion", action='store_true')
parser.add_argument("--contextsize", help="Controls the memory allocated for maximum context size, only change if you need more RAM for big contexts. (default 4096). Supported values are [256,512,1024,2048,3072,4096,6144,8192,12288,16384,24576,32768,49152,65536,98304,131072]. IF YOU USE ANYTHING ELSE YOU ARE ON YOUR OWN.",metavar=('[256,512,1024,2048,3072,4096,6144,8192,12288,16384,24576,32768,49152,65536,98304,131072]'), type=check_range(int,256,262144), default=4096)

parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using GPU. Requires GPU. Set to -1 to try autodetect (experimental)",metavar=('[GPU layers]'), nargs='?', const=1, type=int, default=0)
parser.add_argument("--tensor_split", help="For CUDA and Vulkan only, ratio to split tensors across multiple GPUs, space-separated list of proportions, e.g. 7 3", metavar=('[Ratios]'), type=float, nargs='+')

Expand Down