#!/usr/bin/python3
"""
Web based ChatBot Example
Web chat client for OpenAI and OpenAI API compatible servers such as llama-cpp-python[server].
Python FastAPI / socket.io based web server that provides a simple web based chat session.
Features:
* Uses OpenAI API to talk to LLM
* Works with local hosted OpenAI API compatible LLMs, e.g. llama-cpp-python[server]
* Retains conversational context for LLM
* Uses response stream to render LLM chunks instead of waiting for full response
* Supports multiple concurrent client sessions
* Supports commands to reset context, get version, etc.
* Uses FastAPI and Uvicorn ASGI high speed web server implementation
* (Optional) Supports RAG prompts using Qdrant Vector Database
Requirements:
* pip install fastapi uvicorn python-socketio jinja2 openai bs4 pypdf requests lxml aiohttp
* pip install weaviate-client pdfreader pypandoc
* pip install pandas openpyxl
* pip install python-multipart
* pip install pillow-heif
Environmental variables:
* PORT - Port that Chatbot will listen on
* DEBUG - Set to True to enable debug mode
* OPENAI_API_KEY - Required only for OpenAI
* OPENAI_API_BASE - URL to OpenAI API Server or locally hosted version
* LLM_MODEL - LLM Model to Use
* USE_SYSTEM - Use system in chat prompt if True
* MAXCLIENTS - Maximum number of clients to allow
* MAXTOKENS - Maximum number of tokens to send to LLM
* TEMPERATURE - LLM temperature
* AGENT_NAME - Name for Bot
* ALPHA_KEY - Alpha Vantage API Key for Stocks (Optional) - https://www.alphavantage.co/support/#api-key
* WEAVIATE_HOST - Weaviate Host for RAG (Optional)
* WEAVIATE_LIBRARY - Weaviate Library for RAG (Optional)
* RESULTS - Number of results to return from RAG query
* ONESHOT - Set to True to enable one-shot mode
* RAG_ONLY - Set to True to enable RAG only mode
* TOKEN - TinyLLM token for admin functions
* PROMPT_FILE - File to store system prompts
* PROMPT_RO - Set to True to enable read-only prompts
* EXTRA_BODY - Extra body parameters for OpenAI API
* TOXIC_THRESHOLD - Toxicity threshold for responses (0-1, or 99 to disable)
* THINKING - Set to True to enable thinking mode by default
* MAX_IMAGES - Maximum number of images the chatbot will keep in context (default 1)
Running a llama-cpp-python server:
* CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
* pip install llama-cpp-python[server]
* python3 -m llama_cpp.server --model models/7B/ggml-model.bin
Web APIs:
* GET / - Chatbot HTML main page
* GET /upload - Chatbot Document Upload page
* GET /version - Get version
* POST /alert - Send alert to all clients
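Example - send an alert to all connected clients (illustrative only; assumes the
default PORT 5000 and the default TOKEN value "secret"):
    import requests
    requests.post("http://localhost:5000/alert",
                  json={"token": "secret", "message": "Server maintenance at 5pm"})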
Author: Jason A. Cox
23 Sept 2023
https://github.com/jasonacox/TinyLLM
"""
# pylint: disable=invalid-name
# pylint: disable=global-statement
# pylint: disable=global-variable-not-assigned
# Import Libraries
import asyncio
import datetime
import io
import json
import logging
import os
import time
import re
import base64
from documents import Documents
import openai
import requests
import socketio
import uvicorn
from bs4 import BeautifulSoup
from fastapi import FastAPI, Request, File, UploadFile, Form
from fastapi.responses import HTMLResponse
from fastapi.templating import Jinja2Templates
from fastapi.responses import FileResponse
from contextlib import asynccontextmanager
from pypdf import PdfReader
import aiohttp
# TinyLLM Version
from version import VERSION
from PIL import Image
import pillow_heif
# Enable tracemalloc for memory usage
import tracemalloc
tracemalloc.start()
# Ensure pillow_heif is properly registered with PIL
pillow_heif.register_heif_opener()
# Set up logging
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(levelname)s %(message)s',
datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.info("TinyLLM %s" % VERSION)
def log(text):
logger.info(text)
def debug(text):
logger.debug(text)
# Configuration Settings
API_KEY = os.environ.get("OPENAI_API_KEY", "Asimov-3-Laws") # Required, use bogus string for local LLMs
API_BASE = os.environ.get("OPENAI_API_BASE", "http://localhost:8000/v1") # Required, use https://api.openai.com/v1 for OpenAI
LITELLM_PROXY = os.environ.get("LITELLM_PROXY", None) # Optional - LITELLM Proxy URL
LITELLM_KEY = os.environ.get("LITELLM_KEY", "") # Optional - LITELLM Secret Key - Begins with sk-
AGENTNAME = os.environ.get("AGENT_NAME", "") # Set the name of your bot
MYMODEL = os.environ.get("LLM_MODEL", "models/7B/gguf-model.bin") # Pick model to use e.g. gpt-3.5-turbo for OpenAI
DEBUG = os.environ.get("DEBUG", "false").lower() == "true" # Set to True to enable debug mode
MAXCLIENTS = int(os.environ.get("MAXCLIENTS", 1000)) # Maximum number of concurrent clients
MAXTOKENS = int(os.environ.get("MAXTOKENS", 0)) # Maximum number of tokens to send to LLM for RAG
TEMPERATURE = float(os.environ.get("TEMPERATURE", 0.0)) # LLM temperature
PORT = int(os.environ.get("PORT", 5000)) # Port to listen on
PROMPT_FILE = os.environ.get("PROMPT_FILE", ".tinyllm/prompts.json") # File to store system prompts
PROMPT_RO = os.environ.get("PROMPT_RO", "false").lower() == "true" # Set to True to enable read-only prompts
USE_SYSTEM = os.environ.get("USE_SYSTEM", "false").lower() == "true" # Use system in chat prompt if True
TOKEN = os.environ.get("TOKEN", "secret") # Secret TinyLLM token for admin functions
ONESHOT = os.environ.get("ONESHOT", "false").lower() == "true" # Set to True to enable one-shot mode
RAG_ONLY = os.environ.get("RAG_ONLY", "false").lower() == "true" # Set to True to enable RAG only mode
EXTRA_BODY = os.environ.get("EXTRA_BODY", None) # Extra body parameters for OpenAI API
TOXIC_THRESHOLD = float(os.environ.get("TOXIC_THRESHOLD", 99)) # Toxicity threshold for responses (0-1, or 99 to disable)
THINKING = os.environ.get("THINKING", "false").lower() == "true" # Set to True to enable thinking mode by default
THINK_FILTER = os.environ.get("THINK_FILTER", "false").lower() == "true" # Set to True to enable thinking filter
MAX_IMAGES = int(os.environ.get("MAX_IMAGES", 1)) # Maximum number of images to keep in context
# Convert EXTRA_BODY to dictionary if it is proper JSON
if EXTRA_BODY:
try:
EXTRA_BODY = json.loads(EXTRA_BODY)
except:
log("EXTRA_BODY is not valid JSON")
EXTRA_BODY = {}
else:
if API_BASE.startswith("https://api.openai.com") or LITELLM_PROXY:
EXTRA_BODY = {}
else:
# Extra stop tokens are needed for some non-OpenAI LLMs
EXTRA_BODY = {"stop_token_ids":[128001, 128009]}
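# Note: 128001 and 128009 are the Llama-3 <|end_of_text|> and <|eot_id|> token IDs;
# set EXTRA_BODY explicitly if your local model uses different stop tokens.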
# LiteLLM Proxy
if LITELLM_PROXY:
log(f"Using LiteLLM Proxy at {LITELLM_PROXY}")
API_BASE = LITELLM_PROXY
API_KEY = LITELLM_KEY
# RAG Configuration Settings
WEAVIATE_HOST = os.environ.get("WEAVIATE_HOST", "") # Empty = no Weaviate support
WEAVIATE_GRPC_HOST = os.environ.get("WEAVIATE_GRPC_HOST", WEAVIATE_HOST) # Empty = no Weaviate gRPC support
WEAVIATE_PORT = os.getenv('WEAVIATE_PORT', '8080')
WEAVIATE_GRPC_PORT = os.getenv('WEAVIATE_GRPC_PORT', '50051')
WEAVIATE_LIBRARY = os.environ.get("WEAVIATE_LIBRARY", "tinyllm") # Weaviate library to use
WEAVIATE_AUTH_KEY = os.getenv('WEAVIATE_AUTH_KEY', None) # Weaviate Auth Key
RESULTS = int(os.environ.get("RESULTS", 1)) # Number of results to return from RAG query
ALPHA_KEY = os.environ.get("ALPHA_KEY", "alpha_key") # Optional - Alpha Vantage API Key
UPLOAD_FOLDER = os.environ.get("UPLOAD_FOLDER", "/tmp") # Folder to store uploaded documents
# Debug Mode
if DEBUG:
logger.setLevel(logging.DEBUG)
log("Debug mode enabled.")
# Display all default settings
debug("Configuration Settings:")
vars = globals()
for n in list(vars):
if n.isupper():
if vars[n] and n in ["API_KEY", "TOKEN", "WEAVIATE_AUTH_KEY", "ALPHA_KEY"]:
debug(f" {n}: {'*' * len(vars[n])}")
else:
debug(f" {n}: {vars[n]}")
# Document Management Settings
rag_documents = Documents(host=WEAVIATE_HOST, grpc_host=WEAVIATE_GRPC_HOST, port=WEAVIATE_PORT,
grpc_port=WEAVIATE_GRPC_PORT, retry=3, filepath=UPLOAD_FOLDER,
auth_key=WEAVIATE_AUTH_KEY)
# Prompt Defaults
default_prompts = {}
default_prompts["greeting"] = "Hi"
default_prompts["agentname"] = "Jarvis"
default_prompts["baseprompt"] = "You are {agentname}, a highly intelligent assistant. The current date is {date} and time is {time}. You should give concise responses to very simple questions, but provide thorough responses to more complex and open-ended questions. Don't mention any of the above unless asked and keep your greetings brief."
default_prompts["weather"] = "You are a weather forecaster. Keep your answers brief and accurate. Current date is {date} and weather conditions:\n[DATA]{context_str}[/DATA]\nProvide a weather update, current weather alerts, conditions, precipitation and forecast for {location} and answer this: {prompt}."
default_prompts["stock"] = "You are a stock analyst. Keep your answers brief and accurate. Current date is {date}."
default_prompts["news"] = "You are a newscaster who specializes in providing headline news. Use only the following context provided by Google News to summarize the top 10 headlines for today. Rank headlines by most important to least important. Always include the news organization and ID. Do not add any commentary.\nAlways use this format:\n#. [News Item] - [News Source] - LnkID:[ID]\nHere are some examples, but do not use them: \n1. The World is Round - Science - LnkID:91\n2. The Election is over and Children have won - US News - LnkID:22\n3. Storms Hit the Southern Coast - ABC - LnkID:55\nContext: {context_str}\nTop 10 Headlines with Source and LnkID:"
default_prompts["clarify"] = "You are a highly intelligent assistant. Keep your answers brief and accurate. {format}."
default_prompts["location"] = "What location is specified in this prompt, state None if there isn't one. Use a single word answer. [BEGIN] {prompt} [END]"
default_prompts["company"] = "What company is related to the stock price in this prompt? Please state none if there isn't one. Use a single word answer: [BEGIN] {prompt} [END]"
default_prompts["rag"] = "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Back up your answer using facts from the following context.\nContext: {context_str}\nQuestion: {prompt}\nAnswer:"
default_prompts["website"] = "Summarize the following text from URL {url}:\n[BEGIN] {website_text} [END]\nExplain what the link is about and provide a summary with the main points."
default_prompts["LLM_temperature"] = TEMPERATURE
default_prompts["LLM_max_tokens"] = MAXTOKENS
default_prompts["toxic_filter"] = "You are a highly intelligent assistant. Review the following text and filter out any toxic or inappropriate content. Please respond with a toxicity rating. Use a scale of 0 to 1, where 0 is not toxic and 1 is highly toxic. [BEGIN] {prompt} [END]"
default_prompts["chain_of_thought_check"] = """You are a language expert.
Consider this prompt:
<prompt>{prompt}</prompt>
Categorize the request using one of these:
a) A request for information
b) A request for code
c) A greeting or word of appreciation
d) Something else
Answer with a, b, c or d only:
"""
default_prompts["chain_of_thought"] = """First, outline how you will approach answering the problem.
Break down the solution into clear steps.
Continuously adjust your reasoning based on intermediate results and reflections, adapting your strategy as you progress.
Regularly evaluate progress.
Be critical and honest about your reasoning process.
Use thoughts as a scratchpad, writing out all calculations and reasoning explicitly.
Synthesize the final answer within <answer> tags, providing a clear informed and detailed conclusion.
Include relevant scientific and factual details to support your answer.
If providing an equation, make sure you define the variables and units.
Don't over analyze simple questions.
If asked to produce code, include the code block in the answer.
Answer the following in an accurate way that a young student would understand:
{prompt}"""
default_prompts["chain_of_thought_summary"] = """Examine the following context:\n{context_str}
Provide the best conclusion based on the context.
Do not provide an analysis of the context. Do not include <answer> tags.
Include relevant scientific and factual details to support the answer.
If there is an equation, make sure you define the variables and units. Do not include an equation section if not needed.
If source code provided, include the code block and describe what it does. Do not include a code section otherwise.
Make sure the answer addresses the original prompt: {prompt}
"""
# Log ONE_SHOT mode
if ONESHOT:
log("ONESHOT mode enabled.")
# Helper Functions
# Test OpenAI API
def test_model():
global API_KEY, API_BASE, MYMODEL, MAXTOKENS
log("Testing OpenAI API...")
try:
log(f"Using openai library version {openai.__version__}")
log(f"Connecting to OpenAI API at {API_BASE} using model {MYMODEL}")
llm = openai.OpenAI(api_key=API_KEY, base_url=API_BASE)
# Get models
try:
models = llm.models.list()
if len(models.data) == 0:
log("LLM: No models available - proceeding.")
except Exception as erro:
log(f"LLM: Unable to get models, using default: {str(erro)}")
models = MYMODEL
else:
# build list of models
model_list = [model.id for model in models.data]
log(f"LLM: Models available: {model_list}")
if model_list and MYMODEL not in model_list:
log(f"LLM: Model {MYMODEL} not found in models list.")
log("LLM: Switching to an available model: %s" % model_list[0])
MYMODEL = model_list[0]
# Test LLM
log(f"LLM: Using model: {MYMODEL}")
llm.chat.completions.create(
model=MYMODEL,
stream=False,
temperature=TEMPERATURE,
messages=[{"role": "user", "content": "Hello"}],
extra_body=EXTRA_BODY,
)
log("OpenAI API connection successful.")
# Close the openai client
llm.close()
return True
except Exception as erro:
log("OpenAI API Error: %s" % erro)
log(f"Unable to connect to OpenAI API at {API_BASE} using model {MYMODEL}.")
return False
# Fetch list of LLM models
def get_models():
global API_KEY, API_BASE
try:
llm = openai.OpenAI(api_key=API_KEY, base_url=API_BASE)
models = llm.models.list()
model_list = [model.id for model in models.data]
llm.close()
return model_list
except Exception as erro:
log(f"Unable to get models: {str(erro)}")
return []
while True:
if test_model():
break
else:
log("Sleeping 5 seconds...")
time.sleep(5)
# Test Weaviate Connection
if WEAVIATE_HOST != "":
try:
rag_documents.connect()
log(f"RAG: Connected to Weaviate at {WEAVIATE_HOST}")
except Exception as err:
log(f"RAG: Unable to connect to Weaviate at {WEAVIATE_HOST} - {str(err)}")
WEAVIATE_HOST = ""
log("RAG support disabled.")
# Find document closely related to query
def query_index(query, library, num_results=RESULTS):
references = "References:"
content = ""
try:
results = rag_documents.get_documents(library, query=query, num_results=num_results)
except Exception as erro:
log(f"Error querying Weaviate: {str(erro)}")
return None, None
previous_title = ""
previous_file = ""
previous_content = ""
for ans in results:
# Skip duplicate titles and files
if ans['title'] == previous_title and ans['file'] == previous_file:
continue
references = references + f"\n - {ans['title']} - {ans['file']}"
# Skip duplicates of content
if ans['content'] == previous_content:
continue
new_content = ans['content']
if MAXTOKENS and len(new_content) > MAXTOKENS:
debug("RAG: Content size exceeded maximum size using chunk.")
# Cut the middle and insert the chunk in the middle
new_content = ans['content'][:MAXTOKENS//4] + "..." + (ans.get('chunk') or " ") + "..." + ans['content'][-MAXTOKENS//4:]
content = content + f"Document: {ans['title']}\nDocument Source: {ans['file']}\nContent: {new_content}\n---\n"
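# Rough size check: assume ~4 characters per token and stop once the gathered
# content reaches about half of the MAXTOKENS budget.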
if (len(content)/4) > MAXTOKENS/2:
debug("RAG: Content size reached maximum.")
break
previous_title = ans['title']
previous_file = ans['file']
previous_content = ans['content']
debug(f"RAG: Retrieved ({len(content)} bytes)")
return content, references
# Globals
client = {}
prompts = {}
stats = {
"start_time": time.time(),
"errors": 0,
"ask": 0,
"ask_llm": 0,
"ask_context": 0,
}
llm_stream = None
#
# Configure FastAPI App and SocketIO
#
sio = socketio.AsyncServer(async_mode="asgi")
socket_app = socketio.ASGIApp(sio)
app = FastAPI()
@asynccontextmanager
async def lifespan(app: FastAPI):
log("Starting chatbot...")
yield
log("Shutting down chatbot...")
rag_documents.close()
app.router.lifespan_context = lifespan
# Load system prompts from PROMPT_FILE
def load_prompts():
global prompts
try:
with open(PROMPT_FILE, "r") as f:
prompts = json.load(f)
# Ensure prompts always include all keys from default_prompts
for k in default_prompts:
if k not in prompts:
prompts[k] = default_prompts[k]
except:
log(f"Unable to load system prompts file {PROMPT_FILE}, creating with defaults.")
reset_prompts()
save_prompts()
# Save prompts to PROMPT_FILE
def save_prompts():
global prompts
try:
os.makedirs(os.path.dirname(PROMPT_FILE), exist_ok=True) # Create path if it doesn't exist
with open(PROMPT_FILE, "w") as f:
json.dump(prompts, f)
log(f"Saved {len(prompts)} prompts.")
except:
log("Unable to save prompts.")
# Expand variables in prompt to values
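# Example (illustrative): expand_prompt("Hi {agentname}, today is {date}", {"agentname": "Jarvis"})
# substitutes each {key} and automatically fills in the current {date} and {time}.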
def expand_prompt(prompt, values):
# Always use current {date} and {time}
current_date = datetime.datetime.now()
values["date"] = current_date.strftime("%B %-d, %Y")
values["time"] = current_date.strftime("%-I:%M %p")
for k in values:
prompt = prompt.replace(f"{{{k}}}", values[k])
return prompt
# Reset prompts
def reset_prompts():
global prompts
prompts = {}
for k in default_prompts:
prompts[k] = default_prompts[k]
# Load prompts
load_prompts()
log(f"Loaded {len(prompts)} prompts.")
# Function to return base conversation prompt
def base_prompt(content=None):
global baseprompt, AGENTNAME, USE_SYSTEM, prompts
if AGENTNAME == "":
AGENTNAME = prompts["agentname"]
current_date = datetime.datetime.now()
formatted_date = current_date.strftime("%B %-d, %Y")
values = {"agentname": AGENTNAME, "date": formatted_date}
baseprompt = expand_prompt(prompts["baseprompt"], values)
if not content:
content = baseprompt
if USE_SYSTEM:
return [{"role": "system", "content": content}]
else:
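# No system-role support: seed the context with a user/assistant exchange instead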
return [{"role": "user", "content": content}, {"role": "assistant", "content": "Okay, let's get started."}]
# Function - Send user prompt to LLM for streaming response
async def ask(prompt, sid=None):
global client, stats, llm_stream
stats["ask"] += 1
response = False
debug(f"Context size = {len(client[sid]['context'])}")
while not response:
try:
# Conversation context: in one-shot mode, reset to the base prompt each turn
if ONESHOT:
client[sid]["context"] = base_prompt()
# Process image upload if present
if client[sid]["image_data"]:
# go through context and count images, remove if too many
image_count = 0
for turn in reversed(client[sid]["context"]):
if "content" in turn and isinstance(turn["content"], list):
for item in turn["content"]:
if "image_url" in item:
image_count += 1
if image_count >= MAX_IMAGES:
# remove image from context
debug("Too many images - Found image in context, removing 1...")
turn["content"] = ' '.join([x.get("text", "") for x in turn["content"]])
message = {
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{client[sid]['image_data']}"}}
]
}
client[sid]["context"].append(message)
else:
client[sid]["context"].append({"role": "user", "content": prompt})
debug(f"context -> LLM [{sid}] = {client[sid]['context']} - model = {MYMODEL}")
if not llm_stream:
llm_stream = openai.OpenAI(api_key=API_KEY, base_url=API_BASE)
response = llm_stream.chat.completions.create(
model=client[sid]["model"],
stream=True, # Send response chunks as LLM computes next tokens
temperature=TEMPERATURE,
messages=client[sid]["context"],
extra_body=EXTRA_BODY,
)
client[sid]["image_data"] = ""
except openai.OpenAIError as erro:
# If we get an error, try to recover
client[sid]["context"].pop()
if "does not exist" in str(erro):
await sio.emit('update', {'update': '[Model Unavailable... Retrying]', 'voice': 'user'},room=sid)
log("Model does not exist - retrying")
test_model()
# set client model to default
client[sid]["model"] = MYMODEL
# update footer
await sio.emit('update', {'update': f"TinyLLM Chatbot {VERSION} - {client[sid]['model']} ",
'voice': 'footer', 'model': client[sid]['model']},room=sid)
elif "maximum context length" in str(erro):
if len(prompt) > 1000:
# assume we have very large prompt - cut out the middle
prompt = prompt[:len(prompt)//4] + " ... " + prompt[-len(prompt)//4:]
log(f"Session {sid} - Reduce prompt size - Now: ~{len(prompt)/4} tokens") # tokens are ~4 bytes
elif len(client[sid]["context"]) > 4:
# our context has grown too large, truncate the top
client[sid]["context"] = client[sid]["context"][:1] + client[sid]["context"][3:]
log(f"Session {sid} - Truncate context array: Now: {len(client[sid]['context'])} blocks")
else:
# our context has grown too large, reset
client[sid]["context"] = base_prompt()
log(f"Session {sid} - Reset context to base prompt - Now: ~{len(client[sid]['context'])/4} tokens")
elif "At most" in str(erro) and "image" in str(erro):
# Remove oldest image from context
for turn in reversed(client[sid]["context"]):
# if turn["content"] is a list, remove image_url
if "content" in turn and isinstance(turn["content"], list):
debug("Too many images - Found last image in context, removing...")
turn["content"] = ' '.join([x.get("text", "") for x in turn["content"]])
break
continue
elif "Internal Server Error" in str(erro):
# Check to see if our context has images - if so, remove them
debug("Internal Server Error - Checking for images in context...")
removed_image_data = False
for turn in client[sid]["context"]:
# if turn["content"] is a list, remove image_url
if "content" in turn and isinstance(turn["content"], list):
log("Found image in context, removing...")
removed_image_data = True
turn["content"] = ' '.join([x.get("text", "") for x in turn["content"]])
if removed_image_data:
# remove last turn in context and retry
await sio.emit('update', {'update': '[Images do not seem to be supported by model... Removing]', 'voice': 'user'},room=sid)
client[sid]["context"].pop()
continue
log(f"ERROR: {str(erro)}")
stats["errors"] += 1
await sio.emit('update', {'update': str(erro), 'voice': 'user'},room=sid)
break
else:
# If all else fails, log the error and break
log(f"ERROR: {str(erro)}")
stats["errors"] += 1
await sio.emit('update', {'update': str(erro), 'voice': 'user'},room=sid)
break
if not client[sid]["remember"]:
client[sid]["remember"] = True
client[sid]["context"].pop()
client[sid]["context"].append({"role": "user", "content": "Help me remember."})
return response
async def ask_llm(query, format="", model=MYMODEL):
# Ask LLM a question
global stats
stats["ask_llm"] += 1
if format != "":
format = f"Respond in {format}."
content = base_prompt(expand_prompt(prompts["clarify"], {"format": format})) + [{"role": "user",
"content": query}]
debug(f"ask_llm: {content}")
llm = openai.AsyncOpenAI(api_key=API_KEY, base_url=API_BASE)
response = await llm.chat.completions.create(
model=model,
stream=False,
temperature=TEMPERATURE,
messages=content,
extra_body=EXTRA_BODY,
)
# close the openai client
await llm.close()
debug(f"ask_llm -> {response.choices[0].message.content.strip()}")
return response.choices[0].message.content.strip()
async def ask_context(messages, model=MYMODEL):
# Ask LLM a simple question
global stats
stats["ask_context"] += 1
debug(f"ask_context: {messages}")
llm = openai.AsyncOpenAI(api_key=API_KEY, base_url=API_BASE)
response = await llm.chat.completions.create(
model=model,
stream=False,
temperature=TEMPERATURE,
messages=messages,
extra_body=EXTRA_BODY,
)
# close the openai client
await llm.close()
debug(f"ask_context -> {response.choices[0].message.content.strip()}")
return response.choices[0].message.content.strip()
# Function - Get weather for location
async def get_weather(location):
# Look up weather for location
if location == "":
location = "Los Angeles"
location = location.replace(" ", "+")
url = "https://wttr.in/%s?format=j2" % location
debug(f"Fetching weather for {location} from {url}")
response = requests.get(url)
if response.status_code == 200:
return response.text
else:
return "Unable to fetch weather for %s" % location
# Function - Get stock price for company
async def get_stock(company, model=MYMODEL):
if ALPHA_KEY == "alpha_key":
return "Unable to fetch stock price for %s - No Alpha Vantage API Key" % company
# First try to get the ticker symbol
symbol = await ask_llm(f"What is the stock symbol for {company}? Respond with symbol.", model=model)
if "none" in symbol.lower():
return "Unable to fetch stock price for %s - No matching symbol" % company
# Check to see if response has multiple lines and if so, pick the first one
symbol = symbol.split("\n")[0].strip()
# Check to see if there are multiple words and if so, pick the last one
if len(symbol.split()) > 1:
symbol = symbol.split()[-1]
# Strip off any spaces or non-alpha characters
symbol = ''.join(e for e in symbol if e.isalnum())
# Now get the stock price
url = "https://www.alphavantage.co/query?function=GLOBAL_QUOTE&symbol=%s&apikey=%s" % (symbol.upper(), ALPHA_KEY)
debug(f"Fetching stock price for {company} from {url}")
response = requests.get(url)
if response.status_code == 200:
try:
price = response.json()["Global Quote"]["05. price"]
return f"The price of {company} (symbol {symbol}) is ${price}."
except:
return "Unable to fetch stock price for %s - No data available." % company
return "Unable to fetch stock price for %s - API request failed." % company
# Function - Get news for topic
async def get_top_articles(url, max=10):
async with aiohttp.ClientSession() as session:
async with session.get(url) as response:
soup = BeautifulSoup(await response.text(), 'xml')
items = soup.findAll('item')
articles = ""
links = {}
count = 1
for item in items:
title = item.find('title').string.strip()
#pubdate = item.find('pubDate').string.strip()
#description = item.find('description').string.strip()
link = item.find('link').string.strip()
links[f"LnkID:{count+100}"] = link
articles += f"Headline: {title} - LnkID:{count+100}\n"
count += 1
if count > max:
break
return articles, links
# Function - Fetch news for topic
async def get_news(topic, max=10):
if "none" in topic.lower() or "current" in topic.lower():
url = "https://news.google.com/rss/"
else:
topic = topic.replace(" ", "+")
url = "https://news.google.com/rss/search?q=%s" % topic
debug(f"Fetching news for {topic} from {url}")
response, links = await get_top_articles(url, max)
return response, links
# Function - Extract text from URL
async def extract_text_from_url(url):
try:
async with aiohttp.ClientSession() as session:
async with session.get(url, allow_redirects=True) as response:
if response.status == 200:
# Route extraction based on content type
if ";" in response.headers["Content-Type"]:
content_type = response.headers["Content-Type"].split(";")[0]
else:
content_type = response.headers["Content-Type"]
content_handlers = {
"application/pdf": extract_text_from_pdf,
"text/plain": extract_text_from_text,
"text/csv": extract_text_from_text,
"text/xml": extract_text_from_text,
"application/json": extract_text_from_text,
"text/html": extract_text_from_html,
"application/xml": extract_text_from_text,
}
if content_type in content_handlers:
return await content_handlers[content_type](response)
else:
return "Unsupported content type"
else:
m = f"Failed to fetch the webpage. Status code: {response.status}"
debug(m)
return m
except Exception as erro:
log(f"An error occurred: {str(erro)}")
return f"An error occurred: {str(erro)}"
# Function - Extract text from PDF
async def extract_text_from_pdf(response):
# Convert PDF to text
pdf_content = await response.read()
pdf2text = ""
f = io.BytesIO(pdf_content)
reader = PdfReader(f)
for page in reader.pages:
pdf2text = pdf2text + page.extract_text() + "\n"
return pdf2text
# Function - Extract text from text
async def extract_text_from_text(response):
return await response.text()
# Function - Extract text from HTML
async def extract_text_from_html(response):
html_content = await response.text()
# get title of page from html
source = "Document Source: " + str(response.url)
soup = BeautifulSoup(html_content, 'html.parser')
title = ("Document Title: " + soup.title.string + "\n") if soup.title else ""
paragraphs = soup.find_all(['p', 'code', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'pre', 'ol'])
website_text = f"{title}{source}\nDocument Content:\n" + '\n\n'.join([p.get_text() for p in paragraphs])
return website_text
#
# FastAPI Routes
#
templates = Jinja2Templates(directory="templates")
# Display the main chatbot page
@app.get("/", response_class=HTMLResponse)
async def index(request: Request):
return templates.TemplateResponse(request, "index.html")
# Serve static socket.io.js
@app.get("/socket.io.js")
def serve_socket_io_js():
return FileResponse("templates/socket.io.js", media_type="application/javascript")
# Display settings and stats
@app.get("/stats")
async def home(format: str = None):
global client, stats
# Create a simple status page
data = {
"TinyLLM Chatbot Version": VERSION,
"Start Time": datetime.datetime.fromtimestamp(stats["start_time"]).strftime("%Y-%m-%d %H:%M:%S"),
"Uptime": str(datetime.timedelta(seconds=int(time.time() - stats["start_time"]))),
"Errors": stats["errors"],
"LLM Main User Queries": stats["ask"],
"LLM Helper Queries": stats["ask_llm"],
"LLM CoT Context Queries": stats["ask_context"],
"OpenAI API URL (OPENAI_API_BASE)": API_BASE if not LITELLM_PROXY else "Disabled",
"OpenAI API Key (OPENAI_API_KEY)": "************" if API_KEY != "" else "Not Set",
"LiteLLM Proxy (LITELLM_PROXY)": LITELLM_PROXY or "Disabled",
"LiteLLM Secret Key (LITELLM_KEY)": "************" if LITELLM_KEY != "" else "Not Set",
"Agent Name (AGENT_NAME)": AGENTNAME,
"LLM Model (LLM_MODEL)": MYMODEL,
"Debug Mode (DEBUG)": DEBUG,
"Current Clients (MAXCLIENTS)": f"{len(client)} of {MAXCLIENTS}",
"LLM Max Tokens to Send (MAXTOKENS)": MAXTOKENS,
"LLM Temperature (TEMPERATURE)": TEMPERATURE,
"Server Port (PORT)": PORT,
"Saved Prompts (PROMPT_FILE)": PROMPT_FILE,
"Read-Only Prompts (PROMPT_RO)": PROMPT_RO,
"LLM System Tags in Prompts (USE_SYSTEM)": USE_SYSTEM,
"Run without conversation context (ONESHOT).": ONESHOT,
"RAG: Run in RAG Only Mode (RAG_ONLY)": RAG_ONLY,
"RAG: Weaviate (WEAVIATE_HOST)": WEAVIATE_HOST,
"RAG: Weaviate gRPC (WEAVIATE_GRPC_HOST)": WEAVIATE_GRPC_HOST,
"RAG: Weaviate Port (WEAVIATE_PORT)": WEAVIATE_PORT,
"RAG: Weaviate gRPC Port (WEAVIATE_GRPC_PORT)": WEAVIATE_GRPC_PORT,
"RAG: default Library (WEAVIATE_LIBRARY)": WEAVIATE_LIBRARY,
"RAG: Default Results Retrieved (RESULTS)": RESULTS,
"Alpha Vantage API Key (ALPHA_KEY)": "************" if ALPHA_KEY != "" else "Not Set",
"Toxicity Threshold (TOXIC_THRESHOLD)": TOXIC_THRESHOLD,
"Extra Body Parameters (EXTRA_BODY)": EXTRA_BODY,
"Thinking Mode (THINKING)": THINKING,
"Think Tag Filter (THINK_FILTER)": THINK_FILTER,
}
if format == "json":
return data
# Build a simple HTML page based on data facets
html = "<html><head><title>TinyLLM Chatbot Status</title>"
html += "<style>body { font-family: Helvetica, Arial, sans-serif; }</style>"
html += "</head><body>"
html += "<h1>TinyLLM Chatbot Status</h1>"
# Provide link to project
html += "<p>Settings and Current Status for <a href='https://github.com/jasonacox/TinyLLM/tree/main/chatbot'>TinyLLM Chatbot</a></p>"
html += "<table>"
for key in data:
html += f"<tr><td>{key}</td><td>{data[key]}</td></tr>"
html += "</table>"
# Add JS to refresh page every 5 seconds
html += "<script>setTimeout(function(){location.reload()},5000);</script>"
html += "</body></html>"
return HTMLResponse(content=html, status_code=200)
# Return the current prompts
@app.get('/prompts')
async def get_prompts():
global prompts
# Update TEMPERATURE and MAXTOKENS
prompts["LLM_temperature"] = TEMPERATURE
prompts["LLM_max_tokens"] = MAXTOKENS
if PROMPT_RO:
prompts["READONLY"] = True
return prompts
# POST requests to update prompts
@app.post('/saveprompts')
async def update_prompts(data: dict):
global prompts, baseprompt, sio, TEMPERATURE, MAXTOKENS, AGENTNAME
if PROMPT_RO:
return ({"Result": "Prompts are read-only"})
oldbaseprompt = prompts["baseprompt"]
oldagentname = prompts["agentname"]
debug(f"Received prompts: {data}")
# Update prompts
for key in data:
prompts[key] = data[key]
save_prompts()
if oldbaseprompt != prompts["baseprompt"] or oldagentname != prompts["agentname"]:
# Update baseprompt
AGENTNAME = prompts["agentname"]
current_date = datetime.datetime.now()
formatted_date = current_date.strftime("%B %-d, %Y")
values = {"agentname": AGENTNAME, "date": formatted_date}
baseprompt = expand_prompt(prompts["baseprompt"], values)
# Update TEMPERATURE and MAXTOKENS
if "LLM_temperature" in data:
TEMPERATURE = float(data["LLM_temperature"])
if "LLM_max_tokens" in data:
MAXTOKENS = int(data["LLM_max_tokens"])
# Notify all clients of update
debug("Base prompt updated - notifying users")
await sio.emit('update', {'update': '[Prompts Updated - Refresh to reload]', 'voice': 'user'})
return ({"Result": "Prompts updated"})
# Reset prompts to default
@app.get('/resetprompts')
async def reset_prompts_route():
# Send the user the default prompts
global default_prompts
return (default_prompts)
# Return the current version and LLM model
@app.get('/version')
async def show_version_api():
global VERSION, DEBUG
debug(f"Version requested - DEBUG={DEBUG}")
if DEBUG:
return {'version': "%s DEBUG MODE" % VERSION}
return {'version': VERSION, 'model': MYMODEL}
# Send an alert to all clients
@app.post('/alert')
async def alert(data: dict):
# Send an alert to all clients
# Make sure TOKEN is set and matches
if "token" in data and "message" in data and data["token"] == TOKEN:
debug(f"Received alert: {data}")
await sio.emit('update', {'update': data["message"], 'voice': 'user'})
return ({'status': 'Alert sent'})
else:
log(f"Alert: Invalid token or missing message: {data}")
return ({'status': 'Invalid Token or missing message'})
# Return list of available models
@app.get('/models')
async def list_models():
return get_models()
# Upload a file
@app.post('/upload')
async def upload_file(file: UploadFile = File(...), session_id: str = Form(...)):
global client
file_name = file.filename
session_id = session_id.strip()
content = await file.read() # Read file content
# Open the image, checking for HEIC format
try:
image = Image.open(io.BytesIO(content))
except Exception as e:
await sio.emit('update', {'update': f"Image error: {str(e)}", 'voice': 'user'}, room=session_id)
return {"error": f"Unable to open image: {str(e)}"}
# Resize image if height or width is greater than 1024
if image.height > 1024 or image.width > 1024:
image.thumbnail((1024, 1024))
# Convert image to RGB if it has an alpha channel
if image.mode == 'RGBA':
image = image.convert('RGB')
# Save image to memory as JPEG
img_byte_arr = io.BytesIO()
image.save(img_byte_arr, format='JPEG')
content = img_byte_arr.getvalue()
# Convert image to base64
image_data = base64.b64encode(content).decode('utf-8')
# Validate session
if session_id not in client:
log(f"Invalid session {session_id}")
return {"result": "Bad Session ID", "filename": file.filename, "size": len(content)}
debug(f"Received image upload from {session_id} - {file_name} [{len(image_data)} bytes]")
# Add to client session
client[session_id]["image_data"] = image_data
# Determine file size in a human-readable format
file_size = len(content)
if file_size < 1024:
file_size = f"{file_size} bytes"
elif file_size < 1024 * 1024:
file_size = f"{file_size / 1024:.1f} KB"
else:
file_size = f"{file_size / 1024 / 1024:.1f} MB"
update = f"Uploaded image: {file_name} [{file_size}]"
await sio.emit('update', {'update': update, 'voice': 'user'}, room=session_id)
return {"result": "Success", "filename": file.filename, "size": len(content), "image_data": image_data}
#
# SocketIO Events
#
app.mount("/", socket_app) # Here we mount socket app to main fastapi app
# Client connected - start thread to send updates
@sio.on('connect')
async def handle_connect(session_id, env):
debug(f"Client connected: {session_id}")
# Convert each character to its hex representation
def string_to_hex(input_string):
hex_values = [hex(ord(char)) for char in input_string]
return hex_values
# Continuous thread to send updates to connected clients
async def send_update(session_id):
global client
debug(f"Starting send_update thread for {session_id}")
# Verify session is valid
if session_id not in client:
debug(f"Invalid session {session_id}")
return
try:
while not client[session_id]["stop_thread_flag"]:
if client[session_id]["prompt"] == "":
await sio.sleep(0.1)
else:
# Check to see if CoT is enabled but not while processing a file/image
client_cot = client[session_id]["cot"]
client_image_data = client[session_id]["image_data"]
client_visible = client[session_id]["visible"]
if client_cot and not client_image_data and client_visible:
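# CoT flow: classify the prompt first; if it calls for reasoning, run a
# chain-of-thought pass on a copy of the context, then swap in a summary
# request so the normal streaming path below produces the final answer.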
try:
# Remember original prompt
client[session_id]["cot_prompt"] = client[session_id]["prompt"]
# Check to see if the prompt needs COT processing
cot_check = expand_prompt(prompts["chain_of_thought_check"], {"prompt": client[session_id]["prompt"]})
debug("Running CoT check")
# Ask LLM for answers
response = await ask_llm(cot_check, model=client[session_id]["model"])
if "a" in response.lower() or "d" in response.lower() or client[session_id]["cot_always"]:
debug("Running deep thinking CoT to answer")
# Build prompt for Chain of Thought and create copy of context
cot_prompt = expand_prompt(prompts["chain_of_thought"], {"prompt": client[session_id]["prompt"]})
temp_context = client[session_id]["context"].copy()
temp_context.append({"role": "user", "content": cot_prompt})
# Send thinking status to client and ask LLM for answer
await sio.emit('update', {'update': 'Thinking... ', 'voice': 'ai'},room=session_id)
answer = await ask_context(temp_context, model=client[session_id]["model"])
await sio.emit('update', {'update': '\n\n', 'voice': 'ai'},room=session_id)
# Load request for CoT conclusion into conversational thread
cot_prompt = expand_prompt(prompts["chain_of_thought_summary"], {"context_str": answer,
"prompt": client[session_id]["cot_prompt"]})
client[session_id]["prompt"] = cot_prompt
except Exception as erro:
log(f"CoT error - continuing with original prompt: {erro}")
await sio.emit('update', {'update': '\n\n', 'voice': 'ai'},room=session_id)
else:
client_cot = False
try:
# Ask LLM for answers
response = await ask(client[session_id]["prompt"], session_id)
completion_text = ''
tokens = 0
in_thinking = False
# Iterate through the stream of tokens and send to client
stime = time.time()
for event in response:
event_text = event.choices[0].delta.content
# Skip leading chunks that are empty or contain only newlines
if tokens == 0 and (not event_text or event_text.strip() == ""):
continue
if event_text:
if client[session_id]["think"]:
if "<think>" in event_text: