forked from ggerganov/llama.cpp
-
Notifications
You must be signed in to change notification settings - Fork 364
/
koboldcpp.py
4720 lines (4276 loc) · 244 KB
/
koboldcpp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env python3
#-*- coding: utf-8 -*-
# KoboldCpp is an easy-to-use AI text-generation software for GGML models.
# It's a single self contained distributable from Concedo, that builds off llama.cpp,
# and adds a versatile Kobold API endpoint, additional format support,
# backward compatibility, as well as a fancy UI with persistent stories,
# editing tools, save formats, memory, world info, author's note, characters,
# scenarios and everything Kobold and KoboldAI Lite have to offer.
import ctypes
import os, math, re
import argparse
import platform
import base64
import json, sys, http.server, time, asyncio, socket, threading
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime, timezone
# constants
# sampler_order_max, tensor_split_max, images_max and logprobs_max size the
# fixed-length arrays inside the ctypes structures declared further down.
sampler_order_max = 7
tensor_split_max = 16
images_max = 4
bias_min_value = -100.0
bias_max_value = 100.0
logprobs_max = 5
# abuse prevention
stop_token_max = 256
ban_token_max = 512
logit_bias_max = 512
dry_seq_break_max = 128
# global vars
# handle is assigned the ctypes.CDLL object by init_library()
handle = None
friendlymodelname = "inactive"
friendlysdmodelname = "inactive"
fullsdmodelpath = "" #if empty, it's not initialized
mmprojpath = "" #if empty, it's not initialized
password = "" #if empty, no auth key required
fullwhispermodelpath = "" #if empty, it's not initialized
maxctx = 4096
maxhordectx = 4096
maxhordelen = 400
modelbusy = threading.Lock()
requestsinqueue = 0
defaultport = 5001
KcppVersion = "1.78"
showdebug = True
guimode = False
showsamplerwarning = True
showmaxctxwarning = True
# set to False by autoset_gpu_layers() once the used-VRAM note has been printed
showusedmemwarning = True
session_kudos_earned = 0
session_jobs = 0
session_starttime = None
exitcounter = -1
punishcounter = 0 #causes a timeout if too many errors
rewardcounter = 0 #reduces error counts for successful jobs
totalgens = 0
currentusergenkey = "" #store a special key so polled streaming works even in multiuser
pendingabortkey = "" #if an abort is received for the non-active request, remember it (at least 1) to cancel later
args = None #global args
runmode_untouched = True
# set by extract_modelfile_params(): None, or
# [gguf metadata, model file size, sd file size, whisper file size, mmproj file size]
modelfile_extracted_meta = None
importvars_in_progress = False
preloaded_story = None
chatcompl_adapter = None
embedded_kailite = None
embedded_kcpp_docs = None
embedded_kcpp_sdui = None
sslvalid = False
nocertify = False
start_time = time.time()
last_req_time = time.time()
last_non_horde_req_time = time.time()
currfinishreason = "null"
using_gui_launcher = False
using_outdated_flags = False
# the six values below are populated by suppress_stdout() and reset to None by restore_stdout()
saved_stdout = None
saved_stderr = None
saved_stdout_py = None
saved_stderr_py = None
stdout_nullfile = None
stdout_nullfile_py = None
CLDevices = ["1","2","3","4"]
CUDevices = ["1","2","3","4","All"]
CLDevicesNames = ["","","",""]
# entries 0-3 are overwritten by fetch_gpu_properties() with detected device names
CUDevicesNames = ["","","","",""]
VKDevicesNames = ["","","",""]
VKIsDGPU = [0,0,0,0]
# single-element lists; autoset_gpu_layers() reads element 0 as the total / free
# GPU memory and treats the numbers as byte counts
MaxMemory = [0]
MaxFreeMemory = [0]
class logit_bias(ctypes.Structure):
    """A (token id, bias) pair; generation_inputs.logit_biases points at elements of this type."""
    # NOTE(review): field order/types presumably mirror the native library's struct - confirm before editing.
    _fields_ = [("token_id", ctypes.c_int32),
                ("bias", ctypes.c_float)]
class token_count_outputs(ctypes.Structure):
    """Return type declared for handle.token_count in init_library(): a count plus a pointer to ints."""
    # NOTE(review): field order/types presumably mirror the native library's struct - confirm before editing.
    _fields_ = [("count", ctypes.c_int),
                ("ids", ctypes.POINTER(ctypes.c_int))]
# returns top 5 logprobs per token
class logprob_item(ctypes.Structure):
    """Element type referenced by last_logprobs_outputs.logprob_items."""
    # "tokens" is a fixed array of logprobs_max C strings; "logprobs" is a pointer to floats.
    # NOTE(review): field order/types presumably mirror the native library's struct - confirm before editing.
    _fields_ = [("option_count", ctypes.c_int),
                ("selected_token", ctypes.c_char_p),
                ("selected_logprob", ctypes.c_float),
                ("tokens", ctypes.c_char_p * logprobs_max),
                ("logprobs", ctypes.POINTER(ctypes.c_float))]
class last_logprobs_outputs(ctypes.Structure):
    """Return type declared for handle.last_logprobs in init_library(): a count plus a pointer to logprob_item."""
    # NOTE(review): field order/types presumably mirror the native library's struct - confirm before editing.
    _fields_ = [("count", ctypes.c_int),
                ("logprob_items", ctypes.POINTER(logprob_item))]
class load_model_inputs(ctypes.Structure):
    """Argument type declared for handle.load_model in init_library()."""
    # clblast_info, cublas_info and vulkan_info are the fields written by set_backend_props().
    # "tensor_split" is a fixed array of tensor_split_max floats.
    # NOTE(review): field order/types presumably mirror the native library's struct - confirm before editing.
    _fields_ = [("threads", ctypes.c_int),
                ("blasthreads", ctypes.c_int),
                ("max_context_length", ctypes.c_int),
                ("low_vram", ctypes.c_bool),
                ("use_mmq", ctypes.c_bool),
                ("use_rowsplit", ctypes.c_bool),
                ("executable_path", ctypes.c_char_p),
                ("model_filename", ctypes.c_char_p),
                ("lora_filename", ctypes.c_char_p),
                ("lora_base", ctypes.c_char_p),
                ("mmproj_filename", ctypes.c_char_p),
                ("use_mmap", ctypes.c_bool),
                ("use_mlock", ctypes.c_bool),
                ("use_smartcontext", ctypes.c_bool),
                ("use_contextshift", ctypes.c_bool),
                ("use_fastforward", ctypes.c_bool),
                ("clblast_info", ctypes.c_int),
                ("cublas_info", ctypes.c_int),
                ("vulkan_info", ctypes.c_char_p),
                ("blasbatchsize", ctypes.c_int),
                ("debugmode", ctypes.c_int),
                ("forceversion", ctypes.c_int),
                ("gpulayers", ctypes.c_int),
                ("rope_freq_scale", ctypes.c_float),
                ("rope_freq_base", ctypes.c_float),
                ("flash_attention", ctypes.c_bool),
                ("tensor_split", ctypes.c_float * tensor_split_max),
                ("quant_k", ctypes.c_int),
                ("quant_v", ctypes.c_int)]
class generation_inputs(ctypes.Structure):
    """Argument type declared for handle.generate in init_library()."""
    # "images" and "sampler_order" are fixed arrays sized by images_max / sampler_order_max.
    # The *_len integer fields sit directly before the pointer fields of the same name;
    # presumably each carries the element count of that array - verify against the callers.
    # NOTE(review): field order/types presumably mirror the native library's struct - confirm before editing.
    _fields_ = [("seed", ctypes.c_int),
                ("prompt", ctypes.c_char_p),
                ("memory", ctypes.c_char_p),
                ("images", ctypes.c_char_p * images_max),
                ("max_context_length", ctypes.c_int),
                ("max_length", ctypes.c_int),
                ("temperature", ctypes.c_float),
                ("top_k", ctypes.c_int),
                ("top_a", ctypes.c_float),
                ("top_p", ctypes.c_float),
                ("min_p", ctypes.c_float),
                ("typical_p", ctypes.c_float),
                ("tfs", ctypes.c_float),
                ("rep_pen", ctypes.c_float),
                ("rep_pen_range", ctypes.c_int),
                ("rep_pen_slope", ctypes.c_float),
                ("presence_penalty", ctypes.c_float),
                ("mirostat", ctypes.c_int),
                ("mirostat_tau", ctypes.c_float),
                ("mirostat_eta", ctypes.c_float),
                ("xtc_threshold", ctypes.c_float),
                ("xtc_probability", ctypes.c_float),
                ("sampler_order", ctypes.c_int * sampler_order_max),
                ("sampler_len", ctypes.c_int),
                ("allow_eos_token", ctypes.c_bool),
                ("bypass_eos_token", ctypes.c_bool),
                ("render_special", ctypes.c_bool),
                ("stream_sse", ctypes.c_bool),
                ("grammar", ctypes.c_char_p),
                ("grammar_retain_state", ctypes.c_bool),
                ("quiet", ctypes.c_bool),
                ("dynatemp_range", ctypes.c_float),
                ("dynatemp_exponent", ctypes.c_float),
                ("smoothing_factor", ctypes.c_float),
                ("dry_multiplier", ctypes.c_float),
                ("dry_base", ctypes.c_float),
                ("dry_allowed_length", ctypes.c_int),
                ("dry_penalty_last_n", ctypes.c_int),
                ("dry_sequence_breakers_len", ctypes.c_int),
                ("dry_sequence_breakers", ctypes.POINTER(ctypes.c_char_p)),
                ("stop_sequence_len", ctypes.c_int),
                ("stop_sequence", ctypes.POINTER(ctypes.c_char_p)),
                ("logit_biases_len", ctypes.c_int),
                ("logit_biases", ctypes.POINTER(logit_bias)),
                ("banned_tokens_len", ctypes.c_int),
                ("banned_tokens", ctypes.POINTER(ctypes.c_char_p))]
class generation_outputs(ctypes.Structure):
    """Return type declared for handle.generate in init_library()."""
    # NOTE(review): field order/types presumably mirror the native library's struct - confirm before editing.
    _fields_ = [("status", ctypes.c_int),
                ("stopreason", ctypes.c_int),
                ("prompt_tokens", ctypes.c_int),
                ("completion_tokens", ctypes.c_int),
                ("text", ctypes.c_char_p)]
class sd_load_model_inputs(ctypes.Structure):
    """Argument type declared for handle.sd_load_model in init_library()."""
    # NOTE(review): field order/types presumably mirror the native library's struct - confirm before editing.
    _fields_ = [("model_filename", ctypes.c_char_p),
                ("executable_path", ctypes.c_char_p),
                ("clblast_info", ctypes.c_int),
                ("cublas_info", ctypes.c_int),
                ("vulkan_info", ctypes.c_char_p),
                ("threads", ctypes.c_int),
                ("quant", ctypes.c_int),
                ("taesd", ctypes.c_bool),
                ("t5xxl_filename", ctypes.c_char_p),
                ("clipl_filename", ctypes.c_char_p),
                ("clipg_filename", ctypes.c_char_p),
                ("vae_filename", ctypes.c_char_p),
                ("lora_filename", ctypes.c_char_p),
                ("lora_multiplier", ctypes.c_float),
                ("debugmode", ctypes.c_int)]
class sd_generation_inputs(ctypes.Structure):
    """Argument type declared for handle.sd_generate in init_library()."""
    # NOTE(review): field order/types presumably mirror the native library's struct - confirm before editing.
    _fields_ = [("prompt", ctypes.c_char_p),
                ("negative_prompt", ctypes.c_char_p),
                ("init_images", ctypes.c_char_p),
                ("denoising_strength", ctypes.c_float),
                ("cfg_scale", ctypes.c_float),
                ("sample_steps", ctypes.c_int),
                ("width", ctypes.c_int),
                ("height", ctypes.c_int),
                ("seed", ctypes.c_int),
                ("sample_method", ctypes.c_char_p),
                ("clip_skip", ctypes.c_int),
                ("quiet", ctypes.c_bool)]
class sd_generation_outputs(ctypes.Structure):
    """Return type declared for handle.sd_generate in init_library()."""
    # NOTE(review): field order/types presumably mirror the native library's struct - confirm before editing.
    _fields_ = [("status", ctypes.c_int),
                ("data", ctypes.c_char_p)]
class whisper_load_model_inputs(ctypes.Structure):
    """Argument type declared for handle.whisper_load_model in init_library()."""
    # NOTE(review): field order/types presumably mirror the native library's struct - confirm before editing.
    _fields_ = [("model_filename", ctypes.c_char_p),
                ("executable_path", ctypes.c_char_p),
                ("clblast_info", ctypes.c_int),
                ("cublas_info", ctypes.c_int),
                ("vulkan_info", ctypes.c_char_p),
                ("debugmode", ctypes.c_int)]
class whisper_generation_inputs(ctypes.Structure):
    """Argument type declared for handle.whisper_generate in init_library()."""
    # NOTE(review): field order/types presumably mirror the native library's struct - confirm before editing.
    _fields_ = [("prompt", ctypes.c_char_p),
                ("audio_data", ctypes.c_char_p),
                ("quiet", ctypes.c_bool)]
class whisper_generation_outputs(ctypes.Structure):
    """Return type declared for handle.whisper_generate in init_library()."""
    # NOTE(review): field order/types presumably mirror the native library's struct - confirm before editing.
    _fields_ = [("status", ctypes.c_int),
                ("data", ctypes.c_char_p)]
def getdirpath():
    """Return the directory holding this script, with symbolic links resolved."""
    resolved_script = os.path.realpath(__file__)
    return os.path.dirname(resolved_script)
def getabspath():
    """Return the directory holding this script as an absolute path (links are not resolved)."""
    absolute_script = os.path.abspath(__file__)
    return os.path.dirname(absolute_script)
def file_exists(filename):
    """Return True when `filename` exists relative to the script directory given by getdirpath()."""
    candidate = os.path.join(getdirpath(), filename)
    return os.path.exists(candidate)
def suppress_stdout():
    """Silence stdout and stderr at both the file-descriptor and the Python-object level.

    The original descriptors and stream objects are stored in module globals so
    restore_stdout() can undo the redirection. The call is a no-op when any of
    those globals is already set.
    """
    global saved_stdout, saved_stderr, saved_stdout_py, saved_stderr_py, stdout_nullfile, stdout_nullfile_py
    already_redirected = (saved_stdout or saved_stderr or saved_stdout_py
                          or saved_stderr_py or stdout_nullfile or stdout_nullfile_py)
    if already_redirected:
        return

    # flush pending output before the descriptors are swapped
    sys.stdout.flush()
    sys.stderr.flush()

    # keep duplicates of the real descriptors and references to the stream objects
    out_fd = sys.stdout.fileno()
    saved_stdout = os.dup(out_fd)
    err_fd = sys.stderr.fileno()
    saved_stderr = os.dup(err_fd)
    saved_stderr_py = sys.stderr
    saved_stdout_py = sys.stdout

    # one raw descriptor and one Python file object, both opened on the null device
    stdout_nullfile = os.open(os.devnull, os.O_WRONLY)
    stdout_nullfile_py = open(os.devnull, 'w')

    os.dup2(stdout_nullfile, out_fd)
    os.dup2(stdout_nullfile, err_fd)
    sys.stderr = stdout_nullfile_py
    sys.stdout = stdout_nullfile_py
def restore_stdout():
    """Undo suppress_stdout(): put back the saved streams/descriptors and release the null-device handles.

    Nothing happens unless every one of the saved-state globals is set.
    """
    global saved_stdout, saved_stderr, saved_stdout_py, saved_stderr_py, stdout_nullfile, stdout_nullfile_py
    fully_saved = (saved_stdout and saved_stderr and saved_stdout_py
                   and saved_stderr_py and stdout_nullfile and stdout_nullfile_py)
    if not fully_saved:
        return

    # Python-level streams first, then point their descriptors back at the originals
    sys.stdout = saved_stdout_py
    sys.stderr = saved_stderr_py
    os.dup2(saved_stdout, sys.stdout.fileno())
    os.dup2(saved_stderr, sys.stderr.fileno())

    # release everything that suppress_stdout() opened or duplicated
    os.close(stdout_nullfile)
    stdout_nullfile_py.close()
    os.close(saved_stdout)
    os.close(saved_stderr)

    saved_stdout = None
    saved_stderr = None
    saved_stdout_py = None
    saved_stderr_py = None
    stdout_nullfile = None
    stdout_nullfile_py = None
def get_default_threads():
    """Return the default worker thread count derived from os.cpu_count().

    Half of the reported CPUs is taken as the core limit (1 when the count is
    unknown or 1). Up to 3 the limit is used as-is, above that one core is
    left free (never dropping below 3). When platform.processor() mentions
    'Intel' the result is capped at 8.
    """
    cpu_total = os.cpu_count()
    if cpu_total is not None and cpu_total > 1:
        core_limit = cpu_total // 2
    else:
        core_limit = 1

    if core_limit <= 3:
        thread_count = core_limit
    else:
        thread_count = max(3, core_limit - 1)

    if 'Intel' in platform.processor():
        thread_count = min(thread_count, 8)  #this helps avoid e-cores.
    return thread_count
def pick_existant_file(ntoption,nonntoption):
    """Choose which of two library filenames to use on the current platform.

    The platform's own name (`ntoption` when os.name is 'nt', otherwise
    `nonntoption`) wins when that file exists. Failing that, its
    "precompiled_"-prefixed variant is returned if present, then the other
    platform's name if that file exists. When nothing is found the platform's
    own name is returned anyway.
    """
    precompiled_prefix = "precompiled_"
    if os.name == 'nt':
        native_name, foreign_name = ntoption, nonntoption
    else:
        native_name, foreign_name = nonntoption, ntoption

    if file_exists(native_name):
        return native_name
    precompiled_name = precompiled_prefix + native_name
    if file_exists(precompiled_name):
        return precompiled_name
    if file_exists(foreign_name):
        return foreign_name
    return native_name
# Resolve, for every backend, which library filename to use on this platform
# (pick_existant_file falls back to a precompiled_ or other-platform name when needed).
lib_default = pick_existant_file("koboldcpp_default.dll","koboldcpp_default.so")
lib_failsafe = pick_existant_file("koboldcpp_failsafe.dll","koboldcpp_failsafe.so")
lib_noavx2 = pick_existant_file("koboldcpp_noavx2.dll","koboldcpp_noavx2.so")
lib_clblast = pick_existant_file("koboldcpp_clblast.dll","koboldcpp_clblast.so")
lib_clblast_noavx2 = pick_existant_file("koboldcpp_clblast_noavx2.dll","koboldcpp_clblast_noavx2.so")
lib_cublas = pick_existant_file("koboldcpp_cublas.dll","koboldcpp_cublas.so")
lib_hipblas = pick_existant_file("koboldcpp_hipblas.dll","koboldcpp_hipblas.so")
lib_vulkan = pick_existant_file("koboldcpp_vulkan.dll","koboldcpp_vulkan.so")
lib_vulkan_noavx2 = pick_existant_file("koboldcpp_vulkan_noavx2.dll","koboldcpp_vulkan_noavx2.so")
# name of the library actually chosen; filled in by init_library()
libname = ""
# (library filename, option label) pairs; the order here fixes the unpacking order below
lib_option_pairs = [
    (lib_default, "Use CPU"),
    (lib_clblast, "Use CLBlast"),
    (lib_cublas, "Use CuBLAS"),
    (lib_hipblas, "Use hipBLAS (ROCm)"),
    (lib_vulkan, "Use Vulkan"),
    (lib_noavx2, "Use CPU (Old CPU)"),
    (lib_clblast_noavx2, "Use CLBlast (Old CPU)"),
    (lib_vulkan_noavx2, "Use Vulkan (Old CPU)"),
    (lib_failsafe, "Failsafe Mode (Old CPU)")]
# Each *_option name receives its label when the library is considered available, else None.
# NOTE(review): the Windows-only check appends ".dll" to the option label (opt), not to the
# library filename - confirm whether that is intentional.
default_option, clblast_option, cublas_option, hipblas_option, vulkan_option, noavx2_option, clblast_noavx2_option, vulkan_noavx2_option, failsafe_option = (opt if file_exists(lib) or (os.name == 'nt' and file_exists(opt + ".dll")) else None for lib, opt in lib_option_pairs)
# labels of the backends whose library file exists
runopts = [opt for lib, opt in lib_option_pairs if file_exists(lib)]
def init_library():
    """Select the backend library from the global `args`, load it, and declare its function signatures.

    The chosen filename is stored in the global `libname` and the loaded
    ctypes.CDLL object in the global `handle`. Whenever a requested backend's
    library file is missing a warning is printed and the selection flags stay
    unset, so the CPU variant of the current branch is used.
    """
    global handle, args, libname
    global lib_default,lib_failsafe,lib_noavx2,lib_clblast,lib_clblast_noavx2,lib_cublas,lib_hipblas,lib_vulkan,lib_vulkan_noavx2

    libname = ""
    use_clblast = False #uses CLBlast instead
    use_cublas = False #uses cublas instead
    use_hipblas = False #uses hipblas instead
    use_noavx2 = False #uses no avx2 instructions
    use_failsafe = False #uses no intrinsics, failsafe mode
    use_vulkan = False #uses vulkan (needs avx2)

    # Step 1: translate the command line flags into the use_* selection flags.
    # args.noavx2 is checked first, so it takes priority over the cublas branch below.
    if args.noavx2:
        use_noavx2 = True
        if args.useclblast:
            if not file_exists(lib_clblast_noavx2) or (os.name=='nt' and not file_exists("clblast.dll")):
                print("Warning: NoAVX2 CLBlast library file not found. CPU library will be used.")
            else:
                print("Attempting to use NoAVX2 CLBlast library for faster prompt ingestion. A compatible clblast will be required.")
                use_clblast = True
        elif (args.usevulkan is not None):
            if not file_exists(lib_vulkan_noavx2):
                print("Warning: NoAVX2 Vulkan library file not found. CPU library will be used.")
            else:
                print("Attempting to use NoAVX2 Vulkan library for faster prompt ingestion. A compatible Vulkan will be required.")
                use_vulkan = True
        else:
            # NOTE(review): this warning announces the failsafe library, but use_failsafe is not
            # set here, so lib_noavx2 is still the name selected below - confirm intent.
            if not file_exists(lib_noavx2):
                print("Warning: NoAVX2 library file not found. Failsafe library will be used.")
            elif (args.usecpu and args.nommap):
                use_failsafe = True
                print("!!! Attempting to use FAILSAFE MODE !!!")
            else:
                print("Attempting to use non-avx2 compatibility library.")
    elif (args.usecublas is not None):
        if not file_exists(lib_cublas) and not file_exists(lib_hipblas):
            print("Warning: CuBLAS library file not found. CPU library will be used.")
        else:
            # the CuBLAS library is preferred over hipBLAS when both files exist
            if file_exists(lib_cublas):
                print("Attempting to use CuBLAS library for faster prompt ingestion. A compatible CuBLAS will be required.")
                use_cublas = True
            elif file_exists(lib_hipblas):
                print("Attempting to use hipBLAS library for faster prompt ingestion. A compatible AMD GPU will be required.")
                use_hipblas = True
    elif (args.usevulkan is not None):
        if not file_exists(lib_vulkan):
            print("Warning: Vulkan library file not found. CPU library will be used.")
        else:
            print("Attempting to use Vulkan library for faster prompt ingestion. A compatible Vulkan will be required.")
            use_vulkan = True
    elif args.useclblast:
        if not file_exists(lib_clblast) or (os.name=='nt' and not file_exists("clblast.dll")):
            print("Warning: CLBlast library file not found. CPU library will be used.")
        else:
            print("Attempting to use CLBlast library for faster prompt ingestion. A compatible clblast will be required.")
            use_clblast = True
    else:
        print("Attempting to use CPU library.")

    # Step 2: map the selection flags onto a concrete library filename.
    if use_noavx2:
        if use_failsafe:
            libname = lib_failsafe
        elif use_clblast:
            libname = lib_clblast_noavx2
        elif use_vulkan:
            libname = lib_vulkan_noavx2
        else:
            libname = lib_noavx2
    else:
        if use_clblast:
            libname = lib_clblast
        elif use_cublas:
            libname = lib_cublas
        elif use_hipblas:
            libname = lib_hipblas
        elif use_vulkan:
            libname = lib_vulkan
        else:
            libname = lib_default

    print("Initializing dynamic library: " + libname)
    dir_path = getdirpath()
    abs_path = getabspath()

    #add all potential paths
    # Step 3 (Windows only): register the directories searched for dependent DLLs,
    # including the CUDA / HIP "bin" folder when the matching backend was chosen.
    if os.name=='nt':
        os.add_dll_directory(dir_path)
        os.add_dll_directory(abs_path)
        os.add_dll_directory(os.getcwd())
        if libname == lib_cublas and "CUDA_PATH" in os.environ:
            newpath = os.path.join(os.environ["CUDA_PATH"], "bin")
            if os.path.exists(newpath):
                os.add_dll_directory(newpath)
        if libname == lib_hipblas and "HIP_PATH" in os.environ:
            newpath = os.path.join(os.environ["HIP_PATH"], "bin")
            if os.path.exists(newpath):
                os.add_dll_directory(newpath)

    # Step 4: load the library from the script directory.
    handle = ctypes.CDLL(os.path.join(dir_path, libname))

    # Step 5: declare argument and return types for the exported functions used by this script.
    handle.load_model.argtypes = [load_model_inputs]
    handle.load_model.restype = ctypes.c_bool
    handle.generate.argtypes = [generation_inputs]
    handle.generate.restype = generation_outputs
    handle.new_token.restype = ctypes.c_char_p
    handle.new_token.argtypes = [ctypes.c_int]
    handle.get_stream_count.restype = ctypes.c_int
    handle.has_finished.restype = ctypes.c_bool
    handle.get_last_eval_time.restype = ctypes.c_float
    handle.get_last_process_time.restype = ctypes.c_float
    handle.get_last_token_count.restype = ctypes.c_int
    handle.get_last_seed.restype = ctypes.c_int
    handle.get_total_gens.restype = ctypes.c_int
    handle.get_last_stop_reason.restype = ctypes.c_int
    handle.abort_generate.restype = ctypes.c_bool
    handle.token_count.restype = token_count_outputs
    handle.get_pending_output.restype = ctypes.c_char_p
    handle.sd_load_model.argtypes = [sd_load_model_inputs]
    handle.sd_load_model.restype = ctypes.c_bool
    handle.sd_generate.argtypes = [sd_generation_inputs]
    handle.sd_generate.restype = sd_generation_outputs
    handle.whisper_load_model.argtypes = [whisper_load_model_inputs]
    handle.whisper_load_model.restype = ctypes.c_bool
    handle.whisper_generate.argtypes = [whisper_generation_inputs]
    handle.whisper_generate.restype = whisper_generation_outputs
    handle.last_logprobs.restype = last_logprobs_outputs
def set_backend_props(inputs):
    """Fill the backend selection fields of `inputs` from the global `args` and return it.

    Sets `clblast_info`, `cublas_info` and `vulkan_info` on `inputs`. For
    CuBLAS, the first of the device ids "0".."3" present in `args.usecublas`
    is used: without a tensor split it is exported through the
    CUDA/HIP_VISIBLE_DEVICES environment variables, with a tensor split it is
    stored in `inputs.cublas_info`.
    """
    if args.useclblast:
        inputs.clblast_info = 100 + int(args.useclblast[0])*10 + int(args.useclblast[1])
    else:
        inputs.clblast_info = 0

    # we must force an explicit tensor split
    # otherwise the default will divide equally and multigpu crap will slow it down badly
    inputs.cublas_info = 0

    if args.usecublas:
        for device_id in ("0", "1", "2", "3"):
            if device_id not in args.usecublas:
                continue
            os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
            if args.tensor_split:
                inputs.cublas_info = int(device_id)
            else:
                os.environ["CUDA_VISIBLE_DEVICES"] = device_id
                os.environ["HIP_VISIBLE_DEVICES"] = device_id
            break  # only the lowest matching device id is applied

    if args.usevulkan: #is an empty array if using vulkan without defined gpu
        vulkan_ids = "".join(str(entry) for entry in args.usevulkan)
        inputs.vulkan_info = vulkan_ids.encode("UTF-8")
    else:
        inputs.vulkan_info = "".encode("UTF-8")
    return inputs
def end_trim_to_sentence(input_text):
    """Cut `input_text` after its last sentence-ending mark (or newline) and strip whitespace.

    When no mark is found, or the only one sits at index 0, the whole text is
    returned stripped.
    """
    boundary_marks = ('.', '!', '?', '*', '"', ')', '}', '`', ']', ';', '…', "\n")
    # rfind yields -1 for a missing mark, so the maximum is -1 when none occur
    cut_index = max(input_text.rfind(mark) for mark in boundary_marks)
    trimmed = input_text[:cut_index + 1] if cut_index > 0 else input_text
    return trimmed.strip()
def tryparseint(value):
    """Convert `value` to int when possible, otherwise return it unchanged.

    Besides unparsable strings (ValueError), values that int() rejects by
    type such as None or a list (TypeError) and infinite floats
    (OverflowError) are also handed back as-is instead of raising.
    """
    try:
        return int(value)
    except (ValueError, TypeError, OverflowError):
        return value
def unpack_to_dir(destpath = ""):
    """Copy the contents of this script's directory into an empty target folder.

    With a non-empty `destpath` the function runs in CLI mode and reports via
    print(). With the default empty string a tkinter folder chooser is shown
    and progress is reported through message boxes; cancelling the chooser
    returns silently. Files ending in '.pyd' are skipped. The target must be
    an existing, empty directory, otherwise only a message is produced.
    """
    import shutil
    srcpath = os.path.abspath(os.path.dirname(__file__))
    cliunpack = not (destpath == "")
    print("Attempt to unpack KoboldCpp into directory...")

    if not cliunpack:
        from tkinter.filedialog import askdirectory
        from tkinter import messagebox
        destpath = askdirectory(title='Select an empty folder to unpack KoboldCpp')
        if not destpath:
            return

    def announce(dialog_name, dialog_title, text):
        # CLI mode prints the text; GUI mode shows it in the named messagebox dialog
        if cliunpack:
            print(text)
        else:
            getattr(messagebox, dialog_name)(dialog_title, text)

    target_usable = os.path.isdir(srcpath) and os.path.isdir(destpath) and not os.listdir(destpath)
    if not target_usable:
        announce("showwarning", "Invalid Selection", "The target folder is not empty or invalid. Please select an empty folder.")
        return

    try:
        announce("showinfo", "Unpack Starting", f"KoboldCpp will be extracted to {destpath}\nThis process may take several seconds to complete.")
        for entry in os.listdir(srcpath):
            if entry.endswith('.pyd'): # Skip .pyd files
                continue
            source_item = os.path.join(srcpath, entry)
            target_item = os.path.join(destpath, entry)
            if os.path.isdir(source_item):
                shutil.copytree(source_item, target_item, symlinks=False, ignore=None)
            else:
                shutil.copy2(source_item, target_item)
        announce("showinfo", "KoboldCpp Unpack Success", f"KoboldCpp successfully extracted to {destpath}")
    except Exception as e:
        announce("showerror", "Error", f"An error occurred while unpacking: {e}")
def exit_with_error(code, message, title="Error"):
    """Report `message` and terminate the process with exit status `code`.

    In GUI mode (global `guimode`) the message goes to show_gui_msgbox with
    `title`; otherwise it is printed and flushed. Short sleeps surround the
    report before sys.exit() is called.
    """
    print("")
    time.sleep(1)
    if not guimode:
        print(message, flush=True)
    else:
        show_gui_msgbox(title, message)
    time.sleep(2)
    sys.exit(code)
def utfprint(str):
    """Print a string, truncating very long output and surviving console encoding errors.

    Output is limited to 32000 characters (64000 when args.debugmode >= 1);
    a suffix states how many characters were dropped. If printing raises
    UnicodeEncodeError, the text is reduced to ASCII with bell characters
    removed and printed again.
    """
    char_limit = 64000 if args.debugmode >= 1 else 32000
    total_chars = len(str)
    text = str
    if total_chars > char_limit: #limit max output len
        text = text[:char_limit] + f"... (+{total_chars-char_limit} chars)"
    try:
        print(text)
    except UnicodeEncodeError:
        # Drop every character the ASCII codec cannot represent
        ascii_only = text.encode('ascii', 'ignore').decode('ascii',"ignore")
        print(ascii_only.replace('\a', '')) #remove bell characters
def bring_terminal_to_foreground():
    """On Windows, show the console window and make it the foreground window; otherwise do nothing."""
    if os.name != 'nt':
        return
    user32 = ctypes.windll.user32
    kernel32 = ctypes.windll.kernel32
    user32.ShowWindow(kernel32.GetConsoleWindow(), 9)
    user32.SetForegroundWindow(kernel32.GetConsoleWindow())
def string_has_overlap(str_a, str_b, maxcheck):
    """Return True when some suffix of `str_a` equals a prefix of `str_b`.

    Only overlap lengths from 1 up to min(maxcheck, len(str_a), len(str_b))
    are considered.
    """
    longest = min(maxcheck, len(str_a), len(str_b))
    return any(str_a[-size:] == str_b[:size] for size in range(1, longest + 1))
def string_contains_or_overlaps_sequence_substring(inputstr, sequences):
    """Return True when `inputstr` contains, is contained in, or overlaps any entry of `sequences`.

    Containment is tested on whitespace-stripped text in both directions.
    The overlap test uses string_has_overlap on the unstripped strings with a
    limit of 10 characters. Blank sequences are ignored, and an empty
    `inputstr` never matches.
    """
    if inputstr == "":
        return False

    def matches(candidate):
        # blank candidates never match
        if candidate.strip() == "":
            return False
        if candidate.strip() in inputstr.strip() or inputstr.strip() in candidate.strip():
            return True
        return bool(string_has_overlap(inputstr, candidate, 10))

    return any(matches(candidate) for candidate in sequences)
import struct
def read_gguf_metadata(file_path):
    """Scan the start of a GGUF file for a few integer metadata values.

    Returns [block_count, head_count_kv, max(key_length, value_length)], where
    each value is 0 if its key was not found or failed validation. Returns
    None when the file is smaller than 10000 bytes, does not begin with
    b'GGUF', or any error occurs. Only the 8192 bytes following the 4-byte
    magic are inspected.
    """
    chunk_size = 8192 # read only first 8kb of file

    def read_gguf_key(keyname, data, maxval):
        # locate the first occurrence of the key; two uint32 values must follow it inside the chunk
        key_pos = data.find(keyname)
        if key_pos == -1:
            return 0 #not found
        value_pos = key_pos + len(keyname)
        if value_pos + 8 > chunk_size:
            return 0
        # little-endian pair: a type tag (4 means its a uint32) followed by the value itself
        type_tag, number = struct.unpack_from('<II', data, value_pos)
        if type_tag == 4 and 0 < number <= maxval:
            return number #contains the desired value
        return 0

    try:
        if os.path.getsize(file_path) < 10000: #ignore files under 10kb
            return None
        with open(file_path, 'rb') as gguf_file:
            if gguf_file.read(4) != b'GGUF': #file is not GGUF
                return None
            chunk = gguf_file.read(chunk_size)
            wanted_keys = (
                (b'.block_count', 512),
                (b'.attention.head_count_kv', 8192),
                (b'.attention.key_length', 8192),
                (b'.attention.value_length', 8192),
            )
            layercount, head_count_kv, key_length, val_length = [
                read_gguf_key(key, chunk, limit) for key, limit in wanted_keys
            ]
            return [layercount, head_count_kv, max(key_length, val_length)]
    except Exception:
        return None
def extract_modelfile_params(filepath,sdfilepath,whisperfilepath,mmprojfilepath):
    """Cache size and GGUF metadata of the selected model files in `modelfile_extracted_meta`.

    The global is first reset to None. If `filepath` exists and is larger
    than 10000000 bytes it becomes
    [gguf metadata, model size, sd size, whisper size, mmproj size], where the
    metadata comes from read_gguf_metadata() and may itself be None, and the
    three auxiliary sizes are 0 for paths that are empty or missing. Any
    exception while inspecting the main model leaves the global as None.
    """
    global modelfile_extracted_meta
    modelfile_extracted_meta = None

    def size_or_zero(path):
        # size in bytes of an existing file, 0 for an empty or missing path
        if path and os.path.exists(path):
            return os.path.getsize(path)
        return 0

    sdfsize = size_or_zero(sdfilepath)
    whisperfsize = size_or_zero(whisperfilepath)
    mmprojsize = size_or_zero(mmprojfilepath)

    if not (filepath and os.path.exists(filepath)):
        return
    try:
        fsize = os.path.getsize(filepath)
        if fsize > 10000000: #dont bother with models < 10mb as they are probably bad
            ggufmeta = read_gguf_metadata(filepath)
            modelfile_extracted_meta = [ggufmeta, fsize, sdfsize, whisperfsize, mmprojsize] #extract done. note that meta may be null
    except Exception:
        modelfile_extracted_meta = None
def autoset_gpu_layers(ctxsize,sdquanted,bbs): #shitty algo to determine how many layers to use
    """Estimate how many model layers to place on the GPU.

    Works from the cached `modelfile_extracted_meta` and the MaxMemory /
    MaxFreeMemory globals instead of re-reading the model file.

    ctxsize   -- context size; scales the per-layer and context memory estimates
    sdquanted -- truthy selects the smaller memory reservation for an image model
    bbs       -- batch size (blasbatchsize, per the inline comment); values above
                 512 enlarge the compute estimate

    Returns the layer count as an int. The result is 0 when no metadata is
    cached, when the model file is not larger than 10000000 bytes, when the
    computed limit is 2 or less, or when any exception occurs.
    """
    global showusedmemwarning, modelfile_extracted_meta # reference cached values instead
    gpumem = MaxMemory[0]
    usedmem = 0
    if MaxFreeMemory[0]>0:
        usedmem = MaxMemory[0]-MaxFreeMemory[0]
        # warn only once per run when more than 2.5 GiB is already in use
        if showusedmemwarning and usedmem > (2.5*1024*1024*1024):
            showusedmemwarning = False
            print(f"Note: KoboldCpp has detected that a significant amount of GPU VRAM ({usedmem/1024/1024} MB) is currently used by another application.\nFor best results, you may wish to close that application and then restart KoboldCpp.\n***")
    reservedmem = max(1.5*1024*1024*1024,(0.5*1024*1024*1024 + usedmem)) # determine vram overhead
    try:
        if not modelfile_extracted_meta:
            return 0
        layerlimit = 0
        fsize = modelfile_extracted_meta[1]
        if fsize>10000000: #dont bother with models < 10mb
            cs = ctxsize
            mem = gpumem
            # subtract fixed allowances for auxiliary models, chosen by their file sizes
            if modelfile_extracted_meta[2] > 1024*1024*1024*5: #sdxl tax
                mem -= 1024*1024*1024*(6 if sdquanted else 9)
            elif modelfile_extracted_meta[2] > 1024*1024*512: #normal sd tax
                mem -= 1024*1024*1024*(3.25 if sdquanted else 4.25)
            if modelfile_extracted_meta[3] > 1024*1024*10: #whisper tax
                mem -= 350*1024*1024
            if modelfile_extracted_meta[4] > 1024*1024*10: #mmproj tax
                mem -= 350*1024*1024

            # context-size multiplier: stepped up to 8192, proportional (cs/4096) from there on
            csmul = 1.0
            if cs:
                csmul = (cs/4096) if cs >= 8192 else 1.8 if cs > 4096 else 1.2 if cs > 2048 else 1.0
            ggufmeta = modelfile_extracted_meta[0]
            if not ggufmeta or ggufmeta[0]==0: #fail to read or no layers
                # no layer count available: estimate purely from file size, capped at 200 layers
                sizeperlayer = fsize*csmul*0.052
                layerlimit = int(min(200,(mem-usedmem)/sizeperlayer))
            else:
                layers = ggufmeta[0]
                headcount = ggufmeta[1]
                headkvlen = (ggufmeta[2] if ggufmeta[2] > 0 else 128)
                ratio = (mem-usedmem)/(fsize*csmul*1.6*(1.0 if bbs <= 512 else 1.2))
                computemem = layers*(4 if bbs <= 512 else (bbs/128))*headkvlen*cs*4*1.5 # apply blasbatchsize calculations if over 512
                contextmem = layers*headcount*headkvlen*cs*4*1.1
                # with a known head count, take the larger of the two ratio estimates
                if headcount > 0:
                    ratio = max(ratio, (mem - reservedmem - computemem) / (fsize + contextmem))
                layerlimit = min(int(ratio*layers), (layers + 3))
        # results of 2 layers or fewer are reported as 0
        layerlimit = (0 if layerlimit<=2 else layerlimit)
        return layerlimit
    except Exception as ex:
        return 0
def fetch_gpu_properties(testCL,testCU,testVK):
    """Probe installed GPUs via vendor command-line tools and cache what is found.

    Nothing is returned. Results are written into module-level lists:
    CUDevicesNames, VKDevicesNames, VKIsDGPU and CLDevicesNames receive per-device
    entries, while MaxMemory[0] and MaxFreeMemory[0] are set to the larger of their
    current value and the smallest per-device total / free memory detected.
    A probe whose tool is missing, fails, or prints unparsable output is skipped silently.

    Args:
        testCL: probe OpenCL devices with clinfo (bundled winclinfo.exe as a Windows fallback).
        testCU: probe NVIDIA devices with nvidia-smi, falling back to rocminfo / rocm-smi for AMD.
        testVK: probe Vulkan devices with vulkaninfo.
    """
    import subprocess

    def run_tool(cmd, **extra):
        # run a probe command and return its stdout; raises if the tool is missing or exits non-zero
        return subprocess.run(cmd, capture_output=True, text=True, check=True, encoding='utf-8', **extra).stdout

    def parse_mem(raw, scale):
        # a tool may print a placeholder (e.g. nvidia-smi's "[N/A]") instead of a number;
        # treat that as unknown rather than letting ValueError abort the whole probe
        try:
            return int(raw) * scale
        except ValueError:
            return None

    def lowest_nonzero(current, candidate):
        # running minimum where 0 means "nothing recorded yet"
        return candidate if current == 0 else min(candidate, current)

    if testCU:
        cu_names = []
        cu_total = []
        cu_free = []
        is_amd = None
        try: # Get NVIDIA GPU names
            rows = run_tool(['nvidia-smi','--query-gpu=name,memory.total,memory.free','--format=csv,noheader']).splitlines()
            # each row is "name, <total> <unit>, <free> <unit>"; keep only the numeric part of the memory columns
            cu_names = [row.split(",")[0].strip() for row in rows]
            cu_total = [row.split(",")[1].strip().split(" ")[0].strip() for row in rows]
            cu_free = [row.split(",")[2].strip().split(" ")[0].strip() for row in rows]
        except Exception:
            pass
        if not cu_names:
            try: # Get AMD ROCm GPU names
                device_name = None
                for line in run_tool(['rocminfo']).splitlines(): # read through the output line by line
                    line = line.strip()
                    if line.startswith("Marketing Name:"):
                        device_name = line.split(":", 1)[1].strip() # remember the name until we see its Device Type
                    elif line.startswith("Device Type:") and "GPU" in line and device_name is not None:
                        # the named device turned out to be a GPU (not a CPU), so keep it
                        cu_names.append(device_name)
                        is_amd = True
                    elif line.startswith("Device Type:") and "GPU" not in line:
                        device_name = None
                if cu_names:
                    amd_vram_csv = run_tool(['rocm-smi', '--showmeminfo', 'vram', '--csv']) # fetch VRAM of devices
                    if amd_vram_csv:
                        # skip the CSV header row, take the second column of every non-blank row
                        cu_total = [row.split(",")[1].strip() for row in amd_vram_csv.splitlines()[1:] if row.strip()]
            except Exception:
                pass
        lowestcumem = 0
        lowestfreecumem = 0
        for idx, name in enumerate(cu_names[:4]): # only the first 4 devices are recorded
            CUDevicesNames[idx] = name
            if idx < len(cu_total):
                # nvidia-smi values are scaled by 1024*1024; rocm-smi values are used as-is (presumably already bytes)
                total = parse_mem(cu_total[idx], 1 if is_amd else 1024*1024)
                if total is not None:
                    lowestcumem = lowest_nonzero(lowestcumem, total)
            if idx < len(cu_free):
                free = parse_mem(cu_free[idx], 1024*1024)
                if free is not None:
                    lowestfreecumem = lowest_nonzero(lowestfreecumem, free)
        MaxMemory[0] = max(lowestcumem,MaxMemory[0])
        MaxFreeMemory[0] = max(lowestfreecumem,MaxFreeMemory[0])

    if testVK:
        try: # Get Vulkan names
            vk_lines = run_tool(['vulkaninfo','--summary']).splitlines()
            devicelist = [line.split("=")[1].strip() for line in vk_lines if "deviceName" in line]
            devicetypes = [line.split("=")[1].strip() for line in vk_lines if "deviceType" in line]
            # zip against the slot range so we never write past the preallocated lists
            for idx, dname in zip(range(len(VKDevicesNames)), devicelist):
                VKDevicesNames[idx] = dname
            if len(devicetypes) == len(devicelist): # only trust the types if they pair up with the names
                for idx, dvtype in zip(range(len(VKIsDGPU)), devicetypes):
                    VKIsDGPU[idx] = (1 if dvtype=="PHYSICAL_DEVICE_TYPE_DISCRETE_GPU" else 0)
        except Exception:
            pass

    if testCL:
        try: # Get OpenCL GPU names on windows using a special binary. overwrite at known index if found.
            basepath = os.path.abspath(os.path.dirname(__file__))
            try:
                data = json.loads(run_tool(["clinfo","--json"]))
            except Exception:
                if os.name != 'nt':
                    raise # the bundled binary and the creation flags below exist only on Windows; skip the probe
                data = json.loads(run_tool([os.path.join(basepath, "winclinfo.exe"),"--json"], creationflags=subprocess.CREATE_NO_WINDOW | subprocess.DETACHED_PROCESS))
            lowestclmem = 0
            for plat, cl_platform in enumerate(data["devices"]):
                for dev, device in enumerate(cl_platform["online"]):
                    dname = device["CL_DEVICE_NAME"]
                    dmem = int(device["CL_DEVICE_GLOBAL_MEM_SIZE"])
                    idx = plat+dev*2 # slot index derived from the platform and device numbers
                    if idx<len(CLDevices):
                        CLDevicesNames[idx] = dname
                        lowestclmem = lowest_nonzero(lowestclmem, dmem)
            MaxMemory[0] = max(lowestclmem,MaxMemory[0])
        except Exception:
            pass
def auto_set_backend_cli():
    """Choose a GPU backend automatically from the detected hardware.

    Runs the CUDA/ROCm and Vulkan probes, then, while exitcounter is below 100:
    - selects CUDA (args.usecublas = ["normal","mmq"]) when MaxMemory[0] exceeds
      3500000000, a CuBLAS or hipBLAS option is present in runopts, and at least one
      CUDA device name was recorded;
    - otherwise selects Vulkan (args.usevulkan = []) when VKIsDGPU contains a 1 and
      "Use Vulkan" is present in runopts.
    Prints which backend was selected, or that none was found.
    """
    fetch_gpu_properties(False,True,True) # OpenCL is not probed here

    backend = None
    if exitcounter < 100:
        cuda_ok = (
            MaxMemory[0]>3500000000
            and (("Use CuBLAS" in runopts and CUDevicesNames[0]!="") or "Use hipBLAS (ROCm)" in runopts)
            and any(CUDevicesNames)
        )
        if cuda_ok:
            backend = "cuda"
        elif (1 in VKIsDGPU) and "Use Vulkan" in runopts:
            backend = "vulkan"

    if backend == "cuda":
        args.usecublas = ["normal","mmq"]
        print("Auto Selected CUDA Backend...\n")
    elif backend == "vulkan":
        args.usevulkan = []
        print("Auto Selected Vulkan Backend...\n")
    else:
        print("No GPU Backend found...\n")
def load_model(model_filename):
    """Populate a load_model_inputs struct from the launch arguments and load the model.

    Args:
        model_filename: path of the model file, as a str (it is UTF-8 encoded here).

    Returns:
        Whatever handle.load_model returns for the populated inputs.
    """
    def utf8(text):
        return text.encode("UTF-8")

    inputs = load_model_inputs()
    inputs.model_filename = utf8(model_filename)
    inputs.max_context_length = maxctx #initial value to use for ctx, can be overwritten
    inputs.threads = args.threads
    inputs.blasthreads = args.blasthreads

    # CuBLAS sub-options are only meaningful when --usecublas was given at all
    cublas_flags = args.usecublas or ()
    inputs.low_vram = "lowvram" in cublas_flags
    inputs.use_mmq = "mmq" in cublas_flags
    inputs.use_rowsplit = "rowsplit" in cublas_flags
    inputs.vulkan_info = utf8("0")

    # a LoRA adapter forces mmap off; its optional second entry is the base model
    lora_paths = args.lora or ()
    inputs.use_mmap = False if lora_paths else (not args.nommap)
    inputs.use_mlock = args.usemlock
    inputs.lora_filename = utf8(lora_paths[0]) if lora_paths else utf8("")
    inputs.lora_base = utf8(lora_paths[1]) if len(lora_paths) > 1 else utf8("")
    inputs.mmproj_filename = utf8(args.mmproj if args.mmproj else "")

    inputs.use_smartcontext = args.smartcontext
    inputs.use_contextshift = (0 if args.noshift else 1)
    inputs.use_fastforward = (0 if args.nofastforward else 1)
    inputs.flash_attention = args.flashattention
    kv_quantized = args.quantkv>0
    inputs.quant_k = inputs.quant_v = (args.quantkv if kv_quantized else 0)
    if kv_quantized:
        # quantized KV cache turns flash attention on and context shifting off
        inputs.flash_attention = True
        inputs.use_contextshift = 0

    inputs.blasbatchsize = args.blasbatchsize
    inputs.forceversion = args.forceversion
    inputs.gpulayers = args.gpulayers

    rope = args.ropeconfig
    inputs.rope_freq_scale = rope[0]
    inputs.rope_freq_base = rope[1] if len(rope)>1 else 10000

    # fill every tensor_split slot, using 0 where no value was supplied
    split_values = args.tensor_split or ()
    for n in range(tensor_split_max):
        inputs.tensor_split[n] = float(split_values[n]) if n < len(split_values) else 0

    inputs = set_backend_props(inputs) # runs after the fields above are filled in

    inputs.executable_path = utf8(getdirpath()+"/")
    inputs.debugmode = args.debugmode
    return handle.load_model(inputs)
def generate(genparams, is_quiet=False, stream_flag=False):
global maxctx, args, currentusergenkey, totalgens, pendingabortkey
prompt = genparams.get('prompt', "")
memory = genparams.get('memory', "")
images = genparams.get('images', [])
max_context_length = genparams.get('max_context_length', maxctx)
max_length = genparams.get('max_length', 200)
temperature = genparams.get('temperature', 0.7)
top_k = genparams.get('top_k', 100)
top_a = genparams.get('top_a', 0.0)
top_p = genparams.get('top_p', 0.92)
min_p = genparams.get('min_p', 0.0)
typical_p = genparams.get('typical', 1.0)
tfs = genparams.get('tfs', 1.0)
rep_pen = genparams.get('rep_pen', 1.0)
rep_pen_range = genparams.get('rep_pen_range', 320)
rep_pen_slope = genparams.get('rep_pen_slope', 1.0)
presence_penalty = genparams.get('presence_penalty', 0.0)
mirostat = genparams.get('mirostat', 0)
mirostat_tau = genparams.get('mirostat_tau', 5.0)
mirostat_eta = genparams.get('mirostat_eta', 0.1)
dry_multiplier = genparams.get('dry_multiplier', 0.0)
dry_base = genparams.get('dry_base', 1.75)
dry_allowed_length = genparams.get('dry_allowed_length', 2)
dry_penalty_last_n = genparams.get('dry_penalty_last_n', 320)
dry_sequence_breakers = genparams.get('dry_sequence_breakers', [])
xtc_threshold = genparams.get('xtc_threshold', 0.2)
xtc_probability = genparams.get('xtc_probability', 0)
sampler_order = genparams.get('sampler_order', [6, 0, 1, 3, 4, 2, 5])
seed = tryparseint(genparams.get('sampler_seed', -1))
stop_sequence = genparams.get('stop_sequence', [])
ban_eos_token = genparams.get('ban_eos_token', False)
stream_sse = stream_flag
grammar = genparams.get('grammar', '')
grammar_retain_state = genparams.get('grammar_retain_state', False)
genkey = genparams.get('genkey', '')
trimstop = genparams.get('trim_stop', False)
quiet = is_quiet
dynatemp_range = genparams.get('dynatemp_range', 0.0)
dynatemp_exponent = genparams.get('dynatemp_exponent', 1.0)
smoothing_factor = genparams.get('smoothing_factor', 0.0)
logit_biases = genparams.get('logit_bias', {})
render_special = genparams.get('render_special', False)
banned_strings = genparams.get('banned_strings', []) # SillyTavern uses that name
banned_tokens = genparams.get('banned_tokens', banned_strings)
bypass_eos_token = genparams.get('bypass_eos', False)
custom_token_bans = genparams.get('custom_token_bans', '')
for tok in custom_token_bans.split(','):
tok = tok.strip() # Remove leading/trailing whitespace
if tok.isdigit():
logit_biases[tok] = bias_min_value
inputs = generation_inputs()
inputs.prompt = prompt.encode("UTF-8")
inputs.memory = memory.encode("UTF-8")
for n in range(images_max):
if not images or n >= len(images):
inputs.images[n] = "".encode("UTF-8")
else:
inputs.images[n] = images[n].encode("UTF-8")
global showmaxctxwarning
if max_context_length > maxctx:
if showmaxctxwarning:
print(f"\n(Warning! Request max_context_length={max_context_length} exceeds allocated context size of {maxctx}. It will be reduced to fit. Consider launching with increased --contextsize to avoid errors. This message will only show once per session.)")
showmaxctxwarning = False
max_context_length = maxctx
min_remain = min(max_context_length-4, 16)
if max_length >= (max_context_length-min_remain):
max_length = max_context_length-min_remain
print("\nWarning: You are trying to generate with max_length near or exceeding max_context_length. Most of the context will be removed, and your outputs will not be very coherent.")
inputs.max_context_length = max_context_length # this will resize the context buffer if changed
inputs.max_length = max_length
inputs.temperature = temperature
inputs.top_k = top_k
inputs.top_a = top_a
inputs.top_p = top_p
inputs.min_p = min_p
inputs.typical_p = typical_p
inputs.tfs = tfs
inputs.rep_pen = rep_pen
inputs.rep_pen_range = rep_pen_range
inputs.rep_pen_slope = rep_pen_slope
inputs.presence_penalty = presence_penalty
inputs.stream_sse = stream_sse
inputs.quiet = quiet
inputs.dynatemp_range = dynatemp_range
inputs.dynatemp_exponent = dynatemp_exponent
inputs.smoothing_factor = smoothing_factor
inputs.grammar = grammar.encode("UTF-8")
inputs.grammar_retain_state = grammar_retain_state
inputs.allow_eos_token = not ban_eos_token
inputs.bypass_eos_token = bypass_eos_token
inputs.render_special = render_special
if mirostat in (1, 2):
inputs.mirostat = mirostat