From de671021de765cf6df819b5625ae6a8258895026 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Mon, 22 Apr 2024 22:48:49 -0400 Subject: [PATCH 01/11] Attempt adding outlines --- llm_exl2_client_multi.py | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/llm_exl2_client_multi.py b/llm_exl2_client_multi.py index 1717527..af98faa 100644 --- a/llm_exl2_client_multi.py +++ b/llm_exl2_client_multi.py @@ -26,6 +26,7 @@ import numpy as np import sys, os +import outlines sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from exllamav2 import( @@ -87,6 +88,7 @@ class ChatCompletionRequest(BaseModel): # Add argument for port with default type as integer parser.add_argument('--port', type=int, help='Port to run the server on.') +parser.add_argument('--use_outlines', action='store_true', help='Use outlines.') # Parse the arguments args = parser.parse_args() @@ -117,11 +119,29 @@ class ChatCompletionRequest(BaseModel): config.max_seq_len = max_context base_model_native_max = 4096 -model = ExLlamaV2(config) +if args.use_outlines: + model = outlines.models.exl2( + config.model_dir, + "cuda", + max_seq_len = config.max_seq_len, + scale_pos_emb = config.scale_pos_emb, + scale_alpha_value = config.scale_alpha_value, + no_flash_attn = config.no_flash_attn, + num_experts_per_token = config.num_experts_per_token, + cache_8bit = True, + cache_q4 = False, + tokenizer_kwargs = {}, + gpu_split = "17,19,19,19", # we might be able to make this auto + low_mem = None, + verbose = None + ) +else: + model = ExLlamaV2(config) print("Loading model: " + repo_id) #cache = ExLlamaV2Cache(model, lazy=True, max_seq_len = 20480) #model.load_autosplit(cache) -model.load([17,19,19,19]) +if not args.use_outlines: + model.load([17,19,19,19]) tokenizer = ExLlamaV2Tokenizer(config) @@ -235,19 +255,19 @@ def process_prompts(): tensors.sin = ropesin[g] tensors.cos = ropecos[g] - - if cache_8bit: - ncache = ExLlamaV2Cache_8bit(model, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) - else: - ncache = ExLlamaV2Cache(model, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) + if not args.use_outlines: + if cache_8bit: + ncache = ExLlamaV2Cache_8bit(model, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) + else: + ncache = ExLlamaV2Cache(model, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) #print("Setting up Cache: " + str(prompt_id)) - + if use_dynamic_rope_scaling: sin_arr.append(ropesin) cos_arr.append(ropecos) - - model.forward(ids[:, :-1], ncache, preprocess_only = True) + if not args.use_outlines: + model.forward(ids[:, :-1], ncache, preprocess_only = True) print("Cache setup: " + str(np.shape(ids[:1, :-1]))) input_ids.append(ids) prompt_ids.append(prompt_id) From 18f96ba5c57486bd3d0c7e2ada03fb193233b5a0 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Wed, 24 Apr 2024 16:22:55 -0400 Subject: [PATCH 02/11] Setup basic outlines logic --- llm_exl2_client_multi.py | 213 ++++++++++++++++++++++++++++++++++----- 1 file changed, 189 insertions(+), 24 deletions(-) diff --git a/llm_exl2_client_multi.py b/llm_exl2_client_multi.py index af98faa..7210885 100644 --- a/llm_exl2_client_multi.py +++ b/llm_exl2_client_multi.py @@ -15,7 +15,7 @@ import subprocess import re -from fastapi import FastAPI +from fastapi import FastAPI, HTTPException from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import StreamingResponse from pydantic 
import BaseModel @@ -27,6 +27,8 @@ import sys, os import outlines +from outlines.samplers import multinomial + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from exllamav2 import( @@ -34,6 +36,7 @@ ExLlamaV2Config, ExLlamaV2Cache, ExLlamaV2Cache_8bit, + ExLlamaV2Cache_Q4, ExLlamaV2Tokenizer, ) @@ -81,6 +84,21 @@ class ChatCompletionRequest(BaseModel): top_p: Optional[float] = 0.0 # default value of 0.0 user: Optional[str] = None +class OutlinesRequest(ChatCompletionRequest): + ends_at: str | None = None + +class OutlinesChoiceRequest(OutlinesRequest): + choices: list[str] + +class OutlinesRegexRequest(OutlinesRequest): + regex: str + +class OutlinesJsonRequest(OutlinesRequest): + json: str + +class OutlinesTextRequest(OutlinesRequest): + pass + repo_str = 'commandr-exl2' #repo_str = 'theprofessor-exl2-speculative' @@ -118,6 +136,8 @@ class ChatCompletionRequest(BaseModel): config.scale_alpha_value = ropescale config.max_seq_len = max_context base_model_native_max = 4096 +cache_8bit = True +cache_q4 = False if args.use_outlines: model = outlines.models.exl2( @@ -128,8 +148,8 @@ class ChatCompletionRequest(BaseModel): scale_alpha_value = config.scale_alpha_value, no_flash_attn = config.no_flash_attn, num_experts_per_token = config.num_experts_per_token, - cache_8bit = True, - cache_q4 = False, + cache_8bit = cache_8bit, + cache_q4 = cache_q4, tokenizer_kwargs = {}, gpu_split = "17,19,19,19", # we might be able to make this auto low_mem = None, @@ -147,7 +167,7 @@ class ChatCompletionRequest(BaseModel): # Cache mode -cache_8bit = True + settings_proto = ExLlamaV2Sampler.Settings() settings_proto.temperature = 0 @@ -166,6 +186,7 @@ class ChatCompletionRequest(BaseModel): prompt_ids = [] streamer = [] caches = [] +past_seq_length = [] settings = [] future_tokens = [] future_logits = [] @@ -200,15 +221,144 @@ async def stream_response(prompt_id, timeout=180): yield f'data: {{"id":"chatcmpl-{prompt_id}","object":"chat.completion.chunk","created":{int(time.time())},"model":"{repo_str}","choices":[{{"index":0,"delta":{{}},"finish_reason":"stop"}}]}}\n\n' break +# Worker thread function +def process_outline_prompts(): + # TODO: Somehow make this streams + global partial_responses + assert args.use_outlines + assert not use_dynamic_rope_scaling, "Currently ROPE scaling is not supported with outlines" + base_model = model.model + while True: + while not prompts.empty() or len(input_ids): + while len(input_ids) < max_parallel_seqs and not prompts.empty(): + prompt_id, prompt, max_tokens, stream, temperature, outlines_dict = prompts.get() + sampler = multinomial(top_k=50., top_p=1.0, temperature=temperature) + ids = tokenizer.encode(prompt) + prompt_tokens = ids.shape[-1] + new_tokens = prompt_tokens + max_tokens + print("Processing prompt: " + str(prompt_id) + " Req tokens: " + str(new_tokens)) + # Truncate if new_tokens exceed max_context + if new_tokens > max_context: + # Calculate how many tokens to truncate + ids = tokenizer.encode("Say, 'Prompt exceeds allowed length. 
Please try again.'") + # Update new_tokens after truncation + prompt_tokens = ids.shape[-1] + new_tokens = prompt_tokens + max_tokens + print("Truncating prompt: " + str(prompt_id) + " Req tokens: " + str(new_tokens)) + prompt_length.append(prompt_tokens) + if use_dynamic_rope_scaling: + # Dynamic Rope Scaling + head_dim = base_model.config.head_dim + model_base = base_model.config.rotary_embedding_base + max_seq_len = base_model.config.max_seq_len + ratio = new_tokens / base_model_native_max + alpha = 1.0 + ropesin = [None] * num_of_gpus + ropecos = [None] * num_of_gpus + if ratio > 1.0: + alpha = ((0.2500*ratio**2) + (0.3500*ratio) + 0.4000)*dynamic_rope_mult + dynamic_rope_offset + print("DYNAMIC ROPE SCALE Alpha: " + str(alpha) + " Ratio: " + str(ratio)) + + for g in range(num_of_gpus): + base = model_base + try: + tensors = base_model.get_device_tensors(g) + except IndexError: + tensors = None + + if tensors is not None: + if alpha != 1.0: base *= alpha ** (head_dim / (head_dim - 2)) + + inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device = "cuda:"+str(g)).float() / head_dim)) + t = torch.arange(max_seq_len, device = "cuda:"+str(g), dtype = torch.float32) + + freqs = torch.einsum("i,j->ij", t, inv_freq) + emb = torch.cat((freqs, freqs), dim=-1) + + ropesin[g] = emb.sin()[None, None, :, :].half() + ropecos[g] = emb.cos()[None, None, :, :].half() + + tensors.sin = ropesin[g] + tensors.cos = ropecos[g] + if cache_8bit: + ncache = ExLlamaV2Cache_8bit(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) + elif cache_q4: + ncache = ExLlamaV2Cache_Q4(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) + else: + ncache = ExLlamaV2Cache(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) + model.cache = ncache + model.past_seq = None + stop_at = outlines["stop_at"] + if outlines_dict["type"] == "choices": + generator = outlines.generate.choice(model, outlines_dict["choices"], sampler=sampler, max_tokens=max_tokens, stop_at=stop_at) + elif outlines_dict["type"] == "json": + generator = outlines.generate.json(model, outlines_dict["json"], sampler=sampler, max_tokens=max_tokens, stop_at=stop_at) + elif outlines_dict["type"] == "regex": + generator = outlines.generate.regex(model, outlines_dict["regex"], sampler=sampler, max_tokens=max_tokens, stop_at=stop_at) + else: + generator = outlines.generate.text(model, sampler=sampler, max_tokens=max_tokens, stop_at=stop_at) + output = generator(prompt) + completion_tokens = (tokenizer.encode(output)).shape[-1] + prompt_tokens = (tokenizer.encode(prompt)).shape[-1] + full_tokens = completion_tokens + prompt_tokens + + + if(streamer[i]): + ## Generator, yield here.. 
+ partial_response_data = { + "id": f"chatcmpl-{prompt_id}", + "object": "chat.completion.chunk", + "created": int(time.time()), + "model": repo_str, + "choices": [ + { + "index": 0, + "delta": { + "content": output + }, + "finish_reason": "stop" + } + ] + } + + # Initialize a list for new prompt_id or append to existing one + if prompt_id not in partial_responses: + partial_responses[prompt_id] = [] + partial_responses[prompt_id].append(partial_response_data) + else:# Construct the response based on the format + response_data = { + "id": f"chatcmpl-{prompt_id}", + "object": "chat.completion", + "created": int(time.time()), + "model": repo_str, + "choices": [{ + "index": 0, + "message": { + "role": "assistant", + "content": output, + }, + "finish_reason": "stop" + }], + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": full_tokens + } + } + responses[prompt_id] = response_data + else: + # Sleep for a short duration when there's no work + time.sleep(0.1) # Sleep for 100 milliseconds + # Worker thread function def process_prompts(): - global partial_responses - + global partial_responses + base_model = model while True: while not prompts.empty() or len(input_ids): while len(input_ids) < max_parallel_seqs and not prompts.empty(): - prompt_id, prompt, max_tokens, stream, temperature = prompts.get() + prompt_id, prompt, max_tokens, stream, temperature, outlines_dict = prompts.get() ids = tokenizer.encode(prompt) prompt_tokens = ids.shape[-1] new_tokens = prompt_tokens + max_tokens @@ -224,8 +374,9 @@ def process_prompts(): prompt_length.append(prompt_tokens) if use_dynamic_rope_scaling: # Dynamic Rope Scaling - head_dim = model.config.head_dim - model_base = model.config.rotary_embedding_base + head_dim = base_model.config.head_dim + model_base = base_model.config.rotary_embedding_base + max_seq_len = base_model.config.max_seq_len ratio = new_tokens / base_model_native_max alpha = 1.0 ropesin = [None] * num_of_gpus @@ -237,15 +388,15 @@ def process_prompts(): for g in range(num_of_gpus): base = model_base try: - tensors = model.get_device_tensors(g) + tensors = base_model.get_device_tensors(g) except IndexError: tensors = None if tensors is not None: - if alpha != 1.0: base *= alpha ** (model.config.head_dim / (model.config.head_dim - 2)) + if alpha != 1.0: base *= alpha ** (head_dim / (head_dim - 2)) inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device = "cuda:"+str(g)).float() / head_dim)) - t = torch.arange(model.config.max_seq_len, device = "cuda:"+str(g), dtype = torch.float32) + t = torch.arange(max_seq_len, device = "cuda:"+str(g), dtype = torch.float32) freqs = torch.einsum("i,j->ij", t, inv_freq) emb = torch.cat((freqs, freqs), dim=-1) @@ -255,19 +406,18 @@ def process_prompts(): tensors.sin = ropesin[g] tensors.cos = ropecos[g] - if not args.use_outlines: - if cache_8bit: - ncache = ExLlamaV2Cache_8bit(model, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) - else: - ncache = ExLlamaV2Cache(model, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) - + if cache_8bit: + ncache = ExLlamaV2Cache_8bit(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) + elif cache_q4: + ncache = ExLlamaV2Cache_Q4(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) + else: + ncache = ExLlamaV2Cache(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) 
#print("Setting up Cache: " + str(prompt_id)) if use_dynamic_rope_scaling: sin_arr.append(ropesin) cos_arr.append(ropecos) - if not args.use_outlines: - model.forward(ids[:, :-1], ncache, preprocess_only = True) + model.forward(ids[:, :-1], ncache, preprocess_only = True) print("Cache setup: " + str(np.shape(ids[:1, :-1]))) input_ids.append(ids) prompt_ids.append(prompt_id) @@ -342,7 +492,7 @@ def process_prompts(): if token.item() == tokenizer.eos_token_id or diff == """<|im_end|>""" or caches[i].current_seq_len == caches[i].max_seq_len: eos.insert(0, i) - + # Generate and store response for i in eos: generated_part = input_ids[i][:, prompt_length[i]:] @@ -383,7 +533,6 @@ def process_prompts(): "total_tokens": full_tokens } } - responses[eos_prompt_id] = response_data # Clean up @@ -403,7 +552,7 @@ def process_prompts(): # Start worker thread -worker = Thread(target=process_prompts) +worker = Thread(target=process_prompts if not args.use_outlines else process_outline_prompts) worker.start() @@ -588,7 +737,23 @@ async def mainchat(request: ChatCompletionRequest): timeout = 180 # seconds start_time = time.time() prompt_id = generate_unique_id() # Replace with a function to generate unique IDs - prompts.put((prompt_id, prompt, request.max_tokens, request.stream, request.temperature)) + outlines_dict = {} + if isinstance(request, OutlinesRequest): + outlines_dict["ends_at"] = request.ends_at + if isinstance(request, OutlinesChoiceRequest): + outlines_dict["type"] = "choices" + outlines_dict["choices"] = request.choices + elif isinstance(request, OutlinesJsonRequest): + outlines_dict["type"] = "json" + outlines_dict["json"] = request.json + elif isinstance(request, OutlinesRegexRequest): + outlines_dict["type"] = "regex" + outlines_dict["regex"] = request.regex + elif isinstance(request, OutlinesTextRequest): + outlines_dict["type"] = "text" + elif args.use_outliens: + raise NotImplementedError("If outlines is used, the request must be an OutlinesRequest") + prompts.put((prompt_id, prompt, request.max_tokens, request.stream, request.temperature, outlines_dict)) if request.stream: #response = StreamingResponse(streaming_request(prompt, request.max_tokens, tempmodel=repo_str, response_format='chat_completion'), media_type="text/event-stream") From 7667151d2886ac88a02bc92a881c0b2d224a0f14 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Wed, 24 Apr 2024 16:29:17 -0400 Subject: [PATCH 03/11] Remove diff --- llm_exl2_client_multi.py | 38 +------------------------------------- 1 file changed, 1 insertion(+), 37 deletions(-) diff --git a/llm_exl2_client_multi.py b/llm_exl2_client_multi.py index 7210885..d8d653a 100644 --- a/llm_exl2_client_multi.py +++ b/llm_exl2_client_multi.py @@ -186,7 +186,6 @@ class OutlinesTextRequest(OutlinesRequest): prompt_ids = [] streamer = [] caches = [] -past_seq_length = [] settings = [] future_tokens = [] future_logits = [] @@ -245,41 +244,6 @@ def process_outline_prompts(): prompt_tokens = ids.shape[-1] new_tokens = prompt_tokens + max_tokens print("Truncating prompt: " + str(prompt_id) + " Req tokens: " + str(new_tokens)) - prompt_length.append(prompt_tokens) - if use_dynamic_rope_scaling: - # Dynamic Rope Scaling - head_dim = base_model.config.head_dim - model_base = base_model.config.rotary_embedding_base - max_seq_len = base_model.config.max_seq_len - ratio = new_tokens / base_model_native_max - alpha = 1.0 - ropesin = [None] * num_of_gpus - ropecos = [None] * num_of_gpus - if ratio > 1.0: - alpha = ((0.2500*ratio**2) + (0.3500*ratio) + 0.4000)*dynamic_rope_mult 
+ dynamic_rope_offset - print("DYNAMIC ROPE SCALE Alpha: " + str(alpha) + " Ratio: " + str(ratio)) - - for g in range(num_of_gpus): - base = model_base - try: - tensors = base_model.get_device_tensors(g) - except IndexError: - tensors = None - - if tensors is not None: - if alpha != 1.0: base *= alpha ** (head_dim / (head_dim - 2)) - - inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device = "cuda:"+str(g)).float() / head_dim)) - t = torch.arange(max_seq_len, device = "cuda:"+str(g), dtype = torch.float32) - - freqs = torch.einsum("i,j->ij", t, inv_freq) - emb = torch.cat((freqs, freqs), dim=-1) - - ropesin[g] = emb.sin()[None, None, :, :].half() - ropecos[g] = emb.cos()[None, None, :, :].half() - - tensors.sin = ropesin[g] - tensors.cos = ropecos[g] if cache_8bit: ncache = ExLlamaV2Cache_8bit(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) elif cache_q4: @@ -303,7 +267,7 @@ def process_outline_prompts(): full_tokens = completion_tokens + prompt_tokens - if(streamer[i]): + if(stream): ## Generator, yield here.. partial_response_data = { "id": f"chatcmpl-{prompt_id}", From 74694709a2d106c6c2241e605c6d8b20bcb3dd01 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Wed, 24 Apr 2024 16:35:45 -0400 Subject: [PATCH 04/11] Remove diff --- llm_exl2_client_multi.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llm_exl2_client_multi.py b/llm_exl2_client_multi.py index d8d653a..aa8c21b 100644 --- a/llm_exl2_client_multi.py +++ b/llm_exl2_client_multi.py @@ -168,7 +168,6 @@ class OutlinesTextRequest(OutlinesRequest): # Cache mode - settings_proto = ExLlamaV2Sampler.Settings() settings_proto.temperature = 0 settings_proto.top_k = 50 From 9161a6f9b100d49c3d962b32e6f4adf6b439fdf9 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Sat, 27 Apr 2024 00:51:56 -0400 Subject: [PATCH 05/11] Remove diffs --- llm_exl2_client_multi.py | 44 ++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/llm_exl2_client_multi.py b/llm_exl2_client_multi.py index aa8c21b..7d6d041 100644 --- a/llm_exl2_client_multi.py +++ b/llm_exl2_client_multi.py @@ -28,6 +28,7 @@ import sys, os import outlines from outlines.samplers import multinomial +from outlines.generate import continuous sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -253,14 +254,15 @@ def process_outline_prompts(): model.past_seq = None stop_at = outlines["stop_at"] if outlines_dict["type"] == "choices": - generator = outlines.generate.choice(model, outlines_dict["choices"], sampler=sampler, max_tokens=max_tokens, stop_at=stop_at) + generator = outlines.generate.choice(model, outlines_dict["choices"], sampler=sampler) elif outlines_dict["type"] == "json": - generator = outlines.generate.json(model, outlines_dict["json"], sampler=sampler, max_tokens=max_tokens, stop_at=stop_at) + generator = outlines.generate.json(model, outlines_dict["json"], sampler=sampler) elif outlines_dict["type"] == "regex": - generator = outlines.generate.regex(model, outlines_dict["regex"], sampler=sampler, max_tokens=max_tokens, stop_at=stop_at) + generator = outlines.generate.regex(model, outlines_dict["regex"], sampler=sampler) else: - generator = outlines.generate.text(model, sampler=sampler, max_tokens=max_tokens, stop_at=stop_at) - output = generator(prompt) + generator = outlines.generate.text(model, sampler=sampler) + # generator_c = continuous(generator) + output = generator(prompt, max_tokens=max_tokens, stop_at=stop_at) 
completion_tokens = (tokenizer.encode(output)).shape[-1] prompt_tokens = (tokenizer.encode(prompt)).shape[-1] full_tokens = completion_tokens + prompt_tokens @@ -316,12 +318,12 @@ def process_outline_prompts(): # Worker thread function def process_prompts(): - global partial_responses - base_model = model + global partial_responses + while True: while not prompts.empty() or len(input_ids): while len(input_ids) < max_parallel_seqs and not prompts.empty(): - prompt_id, prompt, max_tokens, stream, temperature, outlines_dict = prompts.get() + prompt_id, prompt, max_tokens, stream, temperature = prompts.get() ids = tokenizer.encode(prompt) prompt_tokens = ids.shape[-1] new_tokens = prompt_tokens + max_tokens @@ -337,9 +339,8 @@ def process_prompts(): prompt_length.append(prompt_tokens) if use_dynamic_rope_scaling: # Dynamic Rope Scaling - head_dim = base_model.config.head_dim - model_base = base_model.config.rotary_embedding_base - max_seq_len = base_model.config.max_seq_len + head_dim = model.config.head_dim + model_base = model.config.rotary_embedding_base ratio = new_tokens / base_model_native_max alpha = 1.0 ropesin = [None] * num_of_gpus @@ -351,15 +352,15 @@ def process_prompts(): for g in range(num_of_gpus): base = model_base try: - tensors = base_model.get_device_tensors(g) + tensors = model.get_device_tensors(g) except IndexError: tensors = None if tensors is not None: - if alpha != 1.0: base *= alpha ** (head_dim / (head_dim - 2)) + if alpha != 1.0: base *= alpha ** (model.config.head_dim / (model.config.head_dim - 2)) inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device = "cuda:"+str(g)).float() / head_dim)) - t = torch.arange(max_seq_len, device = "cuda:"+str(g), dtype = torch.float32) + t = torch.arange(model.config.max_seq_len, device = "cuda:"+str(g), dtype = torch.float32) freqs = torch.einsum("i,j->ij", t, inv_freq) emb = torch.cat((freqs, freqs), dim=-1) @@ -369,17 +370,18 @@ def process_prompts(): tensors.sin = ropesin[g] tensors.cos = ropecos[g] + if cache_8bit: - ncache = ExLlamaV2Cache_8bit(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) - elif cache_q4: - ncache = ExLlamaV2Cache_Q4(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) + ncache = ExLlamaV2Cache_8bit(model, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) else: - ncache = ExLlamaV2Cache(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) - #print("Setting up Cache: " + str(prompt_id)) + ncache = ExLlamaV2Cache(model, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) + #print("Setting up Cache: " + str(prompt_id)) + if use_dynamic_rope_scaling: sin_arr.append(ropesin) cos_arr.append(ropecos) + model.forward(ids[:, :-1], ncache, preprocess_only = True) print("Cache setup: " + str(np.shape(ids[:1, :-1]))) input_ids.append(ids) @@ -455,7 +457,7 @@ def process_prompts(): if token.item() == tokenizer.eos_token_id or diff == """<|im_end|>""" or caches[i].current_seq_len == caches[i].max_seq_len: eos.insert(0, i) - + # Generate and store response for i in eos: generated_part = input_ids[i][:, prompt_length[i]:] @@ -514,6 +516,8 @@ def process_prompts(): + + # Start worker thread worker = Thread(target=process_prompts if not args.use_outlines else process_outline_prompts) worker.start() From edae5dc25f865f2ea373fe2b9794bd1a85d7de74 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Sat, 27 Apr 2024 00:55:29 -0400 
Subject: [PATCH 06/11] Don't modify previous funtion --- llm_exl2_client_multi.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llm_exl2_client_multi.py b/llm_exl2_client_multi.py index 7d6d041..e0bdd8a 100644 --- a/llm_exl2_client_multi.py +++ b/llm_exl2_client_multi.py @@ -720,7 +720,11 @@ async def mainchat(request: ChatCompletionRequest): outlines_dict["type"] = "text" elif args.use_outliens: raise NotImplementedError("If outlines is used, the request must be an OutlinesRequest") - prompts.put((prompt_id, prompt, request.max_tokens, request.stream, request.temperature, outlines_dict)) + if not args.use_outlines: + prompts.put((prompt_id, prompt, request.max_tokens, request.stream, request.temperature)) + else: + prompts.put((prompt_id, prompt, request.max_tokens, request.stream, request.temperature, outlines_dict)) + if request.stream: #response = StreamingResponse(streaming_request(prompt, request.max_tokens, tempmodel=repo_str, response_format='chat_completion'), media_type="text/event-stream") From df92442886fdaa68a710415d50db0e95bf902530 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Sat, 27 Apr 2024 12:28:36 -0400 Subject: [PATCH 07/11] Streaming logic done --- llm_exl2_client_multi.py | 158 +++++++++++++++++++++++++-------------- 1 file changed, 102 insertions(+), 56 deletions(-) diff --git a/llm_exl2_client_multi.py b/llm_exl2_client_multi.py index e0bdd8a..4db57df 100644 --- a/llm_exl2_client_multi.py +++ b/llm_exl2_client_multi.py @@ -28,7 +28,6 @@ import sys, os import outlines from outlines.samplers import multinomial -from outlines.generate import continuous sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -186,6 +185,9 @@ class OutlinesTextRequest(OutlinesRequest): prompt_ids = [] streamer = [] caches = [] +input_prompts = [] +generators = [] +generations = [] settings = [] future_tokens = [] future_logits = [] @@ -234,22 +236,22 @@ def process_outline_prompts(): sampler = multinomial(top_k=50., top_p=1.0, temperature=temperature) ids = tokenizer.encode(prompt) prompt_tokens = ids.shape[-1] - new_tokens = prompt_tokens + max_tokens - print("Processing prompt: " + str(prompt_id) + " Req tokens: " + str(new_tokens)) + full_tokens = prompt_tokens + max_tokens + print("Processing prompt: " + str(prompt_id) + " Req tokens: " + str(full_tokens)) # Truncate if new_tokens exceed max_context - if new_tokens > max_context: + if full_tokens > max_context: # Calculate how many tokens to truncate ids = tokenizer.encode("Say, 'Prompt exceeds allowed length. 
Please try again.'") # Update new_tokens after truncation prompt_tokens = ids.shape[-1] - new_tokens = prompt_tokens + max_tokens - print("Truncating prompt: " + str(prompt_id) + " Req tokens: " + str(new_tokens)) + full_tokens = prompt_tokens + max_tokens + print("Truncating prompt: " + str(prompt_id) + " Req tokens: " + str(full_tokens)) if cache_8bit: - ncache = ExLlamaV2Cache_8bit(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) + ncache = ExLlamaV2Cache_8bit(base_model, lazy=not base_model.loaded, max_seq_len = full_tokens) # (max_seq_len could be different for each cache) elif cache_q4: - ncache = ExLlamaV2Cache_Q4(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) + ncache = ExLlamaV2Cache_Q4(base_model, lazy=not base_model.loaded, max_seq_len = full_tokens) else: - ncache = ExLlamaV2Cache(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) + ncache = ExLlamaV2Cache(base_model, lazy=not base_model.loaded, max_seq_len = full_tokens) # (max_seq_len could be different for each cache) model.cache = ncache model.past_seq = None stop_at = outlines["stop_at"] @@ -261,56 +263,102 @@ def process_outline_prompts(): generator = outlines.generate.regex(model, outlines_dict["regex"], sampler=sampler) else: generator = outlines.generate.text(model, sampler=sampler) - # generator_c = continuous(generator) - output = generator(prompt, max_tokens=max_tokens, stop_at=stop_at) - completion_tokens = (tokenizer.encode(output)).shape[-1] - prompt_tokens = (tokenizer.encode(prompt)).shape[-1] - full_tokens = completion_tokens + prompt_tokens - - - if(stream): - ## Generator, yield here.. - partial_response_data = { - "id": f"chatcmpl-{prompt_id}", - "object": "chat.completion.chunk", - "created": int(time.time()), - "model": repo_str, - "choices": [ - { + generators.append(generator.stream(prompt, stop_at=stop_at, max_tokens=max_tokens)) + input_ids.append(prompt_id) + input_prompts.append(prompt) + generations.append("") + caches.append(ncache) + streamer.append(stream) + #print("Prompt added to queue: " + str(prompt_id)) + if(len(input_ids)): + eos = [] + for i in range(len(input_ids)): + model.cache = caches[i] + is_finished = False + try: + decoded_response_token = next(generators[i]) + generations[i] += decoded_response_token + except StopIteration: + is_finished = True + reason = None + if(streamer[i]): + outcontent = decoded_response_token + if is_finished: + outcontent = outcontent + reason = "stop" + partial_response_data = { + "id": f"chatcmpl-{prompt_ids[i]}", + "object": "chat.completion.chunk", + "created": int(time.time()), + "model": repo_str, + "choices": [ + { + "index": 0, + "delta": { + "content": outcontent + }, + "finish_reason": reason + } + ] + } + + # Initialize a list for new prompt_id or append to existing one + if prompt_ids[i] not in partial_responses: + partial_responses[prompt_ids[i]] = [] + partial_responses[prompt_ids[i]].append(partial_response_data) + + if is_finished: + eos.insert(0, i) + + # Generate and store response + for i in eos: + output = generations[i].strip() + prompt = input_prompts[i] + #output = tokenizer.decode(input_ids[i])[0] + print("-----") + print(output) + generated_text = output + # Calculate token counts + completion_tokens = (tokenizer.encode(generated_text)).shape[-1] + prompt_tokens = (tokenizer.encode(prompt)).shape[-1] + full_tokens = completion_tokens + prompt_tokens + eos_prompt_id = 
prompt_ids.pop(i) + if(streamer[i]): + ## Generator, yield here.. + partial_response_data = { + "finish_reason": "stop" + } + + responses[eos_prompt_id] = partial_response_data + else:# Construct the response based on the format + response_data = { + "id": f"chatcmpl-{prompt_id}", + "object": "chat.completion", + "created": int(time.time()), + "model": repo_str, + "choices": [{ "index": 0, - "delta": { - "content": output + "message": { + "role": "assistant", + "content": generated_text, }, "finish_reason": "stop" + }], + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": full_tokens } - ] - } - - # Initialize a list for new prompt_id or append to existing one - if prompt_id not in partial_responses: - partial_responses[prompt_id] = [] - partial_responses[prompt_id].append(partial_response_data) - else:# Construct the response based on the format - response_data = { - "id": f"chatcmpl-{prompt_id}", - "object": "chat.completion", - "created": int(time.time()), - "model": repo_str, - "choices": [{ - "index": 0, - "message": { - "role": "assistant", - "content": output, - }, - "finish_reason": "stop" - }], - "usage": { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": full_tokens } - } - responses[prompt_id] = response_data + responses[eos_prompt_id] = response_data + + # Clean up + input_ids.pop(i) + input_prompts.pop(i) + generations.pop(i) + caches.pop(i) + streamer.pop(i) + else: # Sleep for a short duration when there's no work time.sleep(0.1) # Sleep for 100 milliseconds @@ -388,8 +436,6 @@ def process_prompts(): prompt_ids.append(prompt_id) caches.append(ncache) streamer.append(stream) - settings_proto.temperature = temperature - settings.append(settings_proto.clone()) # Need individual settings per prompt to support Mirostat future_tokens.append(None) future_logits.append(None) #print("Prompt added to queue: " + str(prompt_id)) From 1c996851d794d9473b7679eb8e7b731e4c733d76 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Mon, 29 Apr 2024 00:35:12 -0400 Subject: [PATCH 08/11] Main logic done --- README.md | 4 + .../llm_exl2_client_multi.cpython-310.pyc | Bin 0 -> 17443 bytes llm_exl2_client_multi.py | 91 ++++++++---------- 3 files changed, 45 insertions(+), 50 deletions(-) create mode 100644 __pycache__/llm_exl2_client_multi.cpython-310.pyc diff --git a/README.md b/README.md index 482f6ea..81380bd 100644 --- a/README.md +++ b/README.md @@ -64,3 +64,7 @@ pip install flash-attn --no-build-isolation ``` Ubuntu 23 does not require the adding of any repos for the toolkit. 
+ +``` +python llm_exl2_client_multi.py --port=5000 --use_outlines --gpu_split="5" --max_context=512 --repo_str=phi3b +``` \ No newline at end of file diff --git a/__pycache__/llm_exl2_client_multi.cpython-310.pyc b/__pycache__/llm_exl2_client_multi.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..711d0725e3b47b67e90039425d70e44121ba982d GIT binary patch literal 17443 [base85-encoded .pyc contents omitted]
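The README command above starts the server with structured-output support enabled. As a usage illustration, a client request exercising the new constrained-decoding fields might look like the sketch below; it is hypothetical client code, assuming the server started by that command is reachable on localhost:5000 and that the `model` string is accepted as-is by the endpoint.

```
# Hypothetical client sketch for the outlines-enabled /v1/chat/completions endpoint.
# Assumes the server from the README command above is listening on port 5000.
import requests

payload = {
    "model": "phi3b",  # assumed label; the server itself loads the model named by --repo_str/config.ini
    "messages": [{"role": "user", "content": "Is the sky blue? Answer yes or no."}],
    "max_tokens": 8,
    "temperature": 0.1,
    "stream": False,
    # Fields added by this patch series: constrain decoding to one of the given choices.
    "outlines_type": "choices",
    "choices": ["yes", "no"],
}

resp = requests.post("http://localhost:5000/v1/chat/completions", json=payload, timeout=180)
print(resp.json()["choices"][0]["message"]["content"])  # constrained to "yes" or "no"
```

The same payload shape covers the other modes the worker dispatches on: `outlines_type` of "regex" with a `regex` field, "json" with a `json` schema string, or "text" with an optional `stop_at` string.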
Date: Mon, 29 Apr 2024 00:36:31 -0400 Subject: [PATCH 09/11] Remove unnecessary diffs --- README.md | 4 ---- .../llm_exl2_client_multi.cpython-310.pyc | Bin 17443 -> 0 bytes 2 files changed, 4 deletions(-) delete mode 100644 __pycache__/llm_exl2_client_multi.cpython-310.pyc diff --git a/README.md b/README.md index 81380bd..482f6ea 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,3 @@ pip install flash-attn --no-build-isolation ``` Ubuntu 23 does not require the adding of any repos for the toolkit. - - -``` -python llm_exl2_client_multi.py --port=5000 --use_outlines --gpu_split="5" --max_context=512 --repo_str=phi3b -``` \ No newline at end of file diff --git a/__pycache__/llm_exl2_client_multi.cpython-310.pyc b/__pycache__/llm_exl2_client_multi.cpython-310.pyc deleted file mode 100644 index 711d0725e3b47b67e90039425d70e44121ba982d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 17443 [base85-encoded .pyc contents omitted]
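Between the streaming rework in patch 07 and the rename in patch 11, the worker's core logic stays the same, so it may help to see it in one place. The following is a condensed sketch of that dispatch, not a drop-in excerpt; it assumes `model` is the `outlines.models.exl2` instance loaded earlier in the series and that `outlines_dict` carries the request's `outlines_type`, `choices`, `regex`, `json`, and `stop_at` values.

```
# Condensed sketch of the generator dispatch in process_outline_prompts.
import outlines
from outlines.samplers import multinomial

def build_generator(model, outlines_dict, temperature):
    # One sampler per request, mirroring the worker's settings.
    sampler = multinomial(top_k=50, top_p=1.0, temperature=temperature)
    kind = outlines_dict.get("type", "text")
    if kind == "choices":
        return outlines.generate.choice(model, outlines_dict["choices"], sampler=sampler)
    if kind == "json":
        return outlines.generate.json(model, outlines_dict["json"], sampler=sampler)
    if kind == "regex":
        return outlines.generate.regex(model, outlines_dict["regex"], sampler=sampler)
    return outlines.generate.text(model, sampler=sampler)

# The worker consumes the generator token by token so it can emit streaming chunks:
# stream = build_generator(model, outlines_dict, temperature).stream(prompt, stop_at=stop_at, max_tokens=max_tokens)
# generation = "".join(stream)
```

Each request also gets its own ExLlamaV2 cache sized to prompt tokens plus `max_tokens`, which is swapped into `model.cache` before its generator is advanced.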
Date: Mon, 29 Apr 2024 00:37:59 -0400 Subject: [PATCH 10/11] Remove diffs --- llm_exl2_client_multi.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llm_exl2_client_multi.py b/llm_exl2_client_multi.py index 4e93872..fd734ca 100644 --- a/llm_exl2_client_multi.py +++ b/llm_exl2_client_multi.py @@ -736,6 +736,7 @@ async def mainchat(request: ChatCompletionRequest): prompt = await format_prompt_commandr(request.messages) else: prompt = await format_prompt(request.messages) + print(prompt) timeout = 180 # seconds start_time = time.time() From 08380200c9f4a4ba0839720b9af74974d5e331ca Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Mon, 29 Apr 2024 18:07:40 -0400 Subject: [PATCH 11/11] Changed file name --- llm_exl2_client_multi.py | 226 +-------- llm_exl2_client_multi_outlines.py | 819 ++++++++++++++++++++++++++++++ 2 files changed, 835 insertions(+), 210 deletions(-) create mode 100644 llm_exl2_client_multi_outlines.py diff --git a/llm_exl2_client_multi.py b/llm_exl2_client_multi.py index fd734ca..1717527 100644 --- a/llm_exl2_client_multi.py +++ b/llm_exl2_client_multi.py @@ -9,26 +9,23 @@ import torch import random from typing import AsyncIterable, List, Generator, Union, Optional -import traceback -from typing import Mapping + import requests import sseclient import subprocess import re -from fastapi import FastAPI, HTTPException +from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import StreamingResponse from pydantic import BaseModel from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextStreamer, TextIteratorStreamer from threading import Thread +from auto_gptq import exllama_set_max_input_length import queue import numpy as np import sys, os -import outlines -from outlines.samplers import multinomial - sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from exllamav2 import( ExLlamaV2, ExLlamaV2Config, ExLlamaV2Cache, ExLlamaV2Cache_8bit, - ExLlamaV2Cache_Q4, ExLlamaV2Tokenizer, ) from exllamav2.generator import ( ExLlamaV2StreamingGenerator, ExLlamaV2Sampler ) import uuid def generate_unique_id(): return uuid.uuid4() class CompletionRequest(BaseModel): model: str prompt: Union[str, List[str]] stop: Optional[Union[str, List[str]]] = None max_tokens: Optional[int] = 100 # default value of 100 temperature: Optional[float] = 0.0 # default value of 0.0 stream: Optional[bool] = False # default value of False best_of: Optional[int] = 1 echo: Optional[bool] = False frequency_penalty: Optional[float] = 0.0 # default value of 0.0 presence_penalty: Optional[float] = 0.0 # default value of 0.0 log_probs: Optional[int] = 0 # default value of 0.0 n: Optional[int] = 1 # default value of 1, batch size suffix: Optional[str] = None top_p: Optional[float] = 0.0 # default value of 0.0 user: Optional[str] = None class Message(BaseModel): role: str content: str class ChatCompletionRequest(BaseModel): model: str messages: List[Message] stop: Optional[Union[str, List[str]]] = None max_tokens: Optional[int] = 100 # default value of 100 temperature: Optional[float] = 0.0 # default value of 0.0 stream: Optional[bool] = False # default value of False frequency_penalty: Optional[float] = 0.0 # default value of 0.0 presence_penalty: Optional[float] = 0.0 # default value of 0.0 log_probs: Optional[int] = 0 # default value of 0.0 n: Optional[int] = 1 # default value of 1, batch size top_p: Optional[float] = 0.0 # default value of 0.0 user: Optional[str] = None - stop_at: Optional[str] = None - outlines_type: Optional[str] = None - choices: Optional[list[str]] = None - regex: Optional[str] = None - json: Optional[str] = None +repo_str = 'commandr-exl2' #repo_str = 'theprofessor-exl2-speculative' parser = argparse.ArgumentParser(description='Run server with specified port.') # Add argument for port with default type as integer parser.add_argument('--port', type=int, help='Port to run the server on.') -parser.add_argument('--use_outlines', action='store_true', help='Use outlines.') -parser.add_argument('--gpu_split', type=str, default="17,19,19,19", help='GPU splits.') -parser.add_argument('--max_context', type=int, default=12288,
help='Context length.') -parser.add_argument('--cache_8bit', action='store_true', help='Use 8 bit cache.') -parser.add_argument('--cache_q4', action='store_true', help='Use 4 bit cache.') -parser.add_argument('--repo_str', type=str, default='commandr-exl2', help='The model repository name') # Parse the arguments args = parser.parse_args() -repo_str = args.repo_str config = configparser.ConfigParser() config.read('config.ini') @@ -127,41 +112,22 @@ class ChatCompletionRequest(BaseModel): dynamic_rope_offset = 0.0 ropescale = 1.0 -max_context = args.max_context +max_context = 12288 config.scale_alpha_value = ropescale config.max_seq_len = max_context base_model_native_max = 4096 -cache_8bit = args.cache_8bit -cache_q4 = args.cache_q4 - -if args.use_outlines: - model = outlines.models.exl2( - config.model_dir, - "cuda", - max_seq_len = config.max_seq_len, - scale_pos_emb = config.scale_pos_emb, - scale_alpha_value = config.scale_alpha_value, - no_flash_attn = config.no_flash_attn, - num_experts_per_token = config.num_experts_per_token, - cache_8bit = cache_8bit, - cache_q4 = cache_q4, - tokenizer_kwargs = {}, - gpu_split = args.gpu_split, # we might be able to make this auto - low_mem = None, - verbose = None - ) -else: - model = ExLlamaV2(config) + +model = ExLlamaV2(config) print("Loading model: " + repo_id) #cache = ExLlamaV2Cache(model, lazy=True, max_seq_len = 20480) #model.load_autosplit(cache) -if not args.use_outlines: - model.load([int(gpu_memory) for gpu_memory in args.gpu_split.split(",")]) +model.load([17,19,19,19]) tokenizer = ExLlamaV2Tokenizer(config) # Cache mode +cache_8bit = True settings_proto = ExLlamaV2Sampler.Settings() settings_proto.temperature = 0 @@ -180,9 +146,6 @@ class ChatCompletionRequest(BaseModel): prompt_ids = [] streamer = [] caches = [] -input_prompts = [] -generators = [] -generations = [] settings = [] future_tokens = [] future_logits = [] @@ -193,7 +156,7 @@ class ChatCompletionRequest(BaseModel): partial_responses = {} max_parallel_seqs = 3 -num_of_gpus = len(args.gpu_split.split(",")) +num_of_gpus = 4 print("*** Loaded.. now Inference...:") @@ -217,144 +180,6 @@ async def stream_response(prompt_id, timeout=180): yield f'data: {{"id":"chatcmpl-{prompt_id}","object":"chat.completion.chunk","created":{int(time.time())},"model":"{repo_str}","choices":[{{"index":0,"delta":{{}},"finish_reason":"stop"}}]}}\n\n' break -# Worker thread function -def process_outline_prompts(): - global partial_responses - assert args.use_outlines - assert not use_dynamic_rope_scaling, "Currently ROPE scaling is not supported with outlines" - base_model = model.model - while True: - while not prompts.empty() or len(prompt_ids): - while len(prompt_ids) < max_parallel_seqs and not prompts.empty(): - prompt_id, prompt, max_tokens, stream, temperature, outlines_dict = prompts.get() - print(f"got prompt with outlines dict {outlines_dict}") - sampler = multinomial(top_k=50, top_p=1.0, temperature=temperature) - ids = tokenizer.encode(prompt) - prompt_tokens = ids.shape[-1] - full_tokens = prompt_tokens + max_tokens - print("Processing prompt: " + str(prompt_id) + " Req tokens: " + str(full_tokens)) - # Truncate if new_tokens exceed max_context - if full_tokens > max_context: - # Calculate how many tokens to truncate - ids = tokenizer.encode("Say, 'Prompt exceeds allowed length. 
Please try again.'") - # Update new_tokens after truncation - prompt_tokens = ids.shape[-1] - full_tokens = prompt_tokens + max_tokens - print("Truncating prompt: " + str(prompt_id) + " Req tokens: " + str(full_tokens)) - if cache_8bit: - ncache = ExLlamaV2Cache_8bit(base_model, lazy=not base_model.loaded, max_seq_len = full_tokens) # (max_seq_len could be different for each cache) - elif cache_q4: - ncache = ExLlamaV2Cache_Q4(base_model, lazy=not base_model.loaded, max_seq_len = full_tokens) - else: - ncache = ExLlamaV2Cache(base_model, lazy=not base_model.loaded, max_seq_len = full_tokens) # (max_seq_len could be different for each cache) - model.cache = ncache - model.past_seq = None - stop_at = outlines_dict["stop_at"] - if outlines_dict["type"] == "choices": - generator = outlines.generate.choice(model, outlines_dict["choices"], sampler=sampler) - elif outlines_dict["type"] == "json": - generator = outlines.generate.json(model, outlines_dict["json"], sampler=sampler) - elif outlines_dict["type"] == "regex": - generator = outlines.generate.regex(model, outlines_dict["regex"], sampler=sampler) - else: - generator = outlines.generate.text(model, sampler=sampler) - generators.append(generator.stream(prompt, stop_at=stop_at, max_tokens=max_tokens)) - prompt_ids.append(prompt_id) - input_prompts.append(prompt) - generations.append("") - caches.append(ncache) - streamer.append(stream) - if(len(prompt_ids)): - eos = [] - for i in range(len(prompt_ids)): - model.cache = caches[i] - is_finished = False - try: - decoded_response_token = next(generators[i]) - generations[i] += decoded_response_token - except StopIteration: - is_finished = True - reason = None - if(streamer[i]): - outcontent = decoded_response_token - if is_finished: - outcontent = "" - reason = "stop" - partial_response_data = { - "id": f"chatcmpl-{prompt_ids[i]}", - "object": "chat.completion.chunk", - "created": int(time.time()), - "model": repo_str, - "choices": [ - { - "index": 0, - "delta": { - "content": outcontent - }, - "finish_reason": reason - } - ] - } - - # Initialize a list for new prompt_id or append to existing one - if prompt_ids[i] not in partial_responses: - partial_responses[prompt_ids[i]] = [] - partial_responses[prompt_ids[i]].append(partial_response_data) - - if is_finished: - eos.insert(0, i) - - # Generate and store response - for i in eos: - output = generations[i].strip() - prompt = input_prompts[i] - #output = tokenizer.decode(input_ids[i])[0] - print("-----") - print(output) - generated_text = output - # Calculate token counts - completion_tokens = (tokenizer.encode(generated_text)).shape[-1] - prompt_tokens = (tokenizer.encode(prompt)).shape[-1] - full_tokens = completion_tokens + prompt_tokens - eos_prompt_id = prompt_ids.pop(i) - if(streamer[i]): - ## Generator, yield here.. 
- partial_response_data = { - "finish_reason": "stop" - } - - responses[eos_prompt_id] = partial_response_data - else:# Construct the response based on the format - response_data = { - "id": f"chatcmpl-{prompt_id}", - "object": "chat.completion", - "created": int(time.time()), - "model": repo_str, - "choices": [{ - "index": 0, - "message": { - "role": "assistant", - "content": generated_text, - }, - "finish_reason": "stop" - }], - "usage": { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": full_tokens - } - } - responses[eos_prompt_id] = response_data - # Clean up - input_prompts.pop(i) - generations.pop(i) - caches.pop(i) - streamer.pop(i) - - else: - # Sleep for a short duration when there's no work - time.sleep(0.1) # Sleep for 100 milliseconds - # Worker thread function def process_prompts(): @@ -428,6 +253,8 @@ def process_prompts(): prompt_ids.append(prompt_id) caches.append(ncache) streamer.append(stream) + settings_proto.temperature = temperature + settings.append(settings_proto.clone()) # Need individual settings per prompt to support Mirostat future_tokens.append(None) future_logits.append(None) #print("Prompt added to queue: " + str(prompt_id)) @@ -536,6 +363,7 @@ def process_prompts(): "total_tokens": full_tokens } } + responses[eos_prompt_id] = response_data # Clean up @@ -554,10 +382,8 @@ def process_prompts(): - - # Start worker thread -worker = Thread(target=process_prompts if not args.use_outlines else process_outline_prompts) +worker = Thread(target=process_prompts) worker.start() @@ -716,6 +542,7 @@ async def format_prompt_commandr(messages): @app.post('/v1/chat/completions') async def mainchat(request: ChatCompletionRequest): + try: prompt = '' if repo_str == 'Phind-CodeLlama-34B-v2': @@ -741,27 +568,7 @@ async def mainchat(request: ChatCompletionRequest): timeout = 180 # seconds start_time = time.time() prompt_id = generate_unique_id() # Replace with a function to generate unique IDs - outlines_dict = {} - if request.stop_at is not None: - outlines_dict["stop_at"] = request.stop_at - if request.outlines_type is not None: - outlines_dict["type"] = request.outlines_type - if outlines_dict["type"] == "choices": - assert request.choices is not None - outlines_dict["choices"] = request.choices - elif outlines_dict["type"] == "json": - assert request.json is not None - outlines_dict["json"] = request.json - elif outlines_dict["type"] == "regex": - assert request.regex is not None - outlines_dict["regex"] = request.regex - else: - assert (outlines_dict["type"] == "text") or not args.outlines - if not args.use_outlines: - prompts.put((prompt_id, prompt, request.max_tokens, request.stream, request.temperature)) - else: - prompts.put((prompt_id, prompt, request.max_tokens, request.stream, request.temperature, outlines_dict)) - + prompts.put((prompt_id, prompt, request.max_tokens, request.stream, request.temperature)) if request.stream: #response = StreamingResponse(streaming_request(prompt, request.max_tokens, tempmodel=repo_str, response_format='chat_completion'), media_type="text/event-stream") @@ -777,7 +584,6 @@ async def mainchat(request: ChatCompletionRequest): return responses.pop(prompt_id) except Exception as e: - print(traceback.format_exc()) raise HTTPException(status_code=500, detail=str(e)) return response diff --git a/llm_exl2_client_multi_outlines.py b/llm_exl2_client_multi_outlines.py new file mode 100644 index 0000000..fd734ca --- /dev/null +++ b/llm_exl2_client_multi_outlines.py @@ -0,0 +1,819 @@ +import asyncio 
+import json +import os +import logging +import time +import configparser +import argparse +import tiktoken +import torch +import random +from typing import AsyncIterable, List, Generator, Union, Optional +import traceback +from typing import Mapping +import requests +import sseclient +import subprocess +import re + +from fastapi import FastAPI, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import StreamingResponse +from pydantic import BaseModel +from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextStreamer, TextIteratorStreamer +from threading import Thread +import queue +import numpy as np + +import sys, os +import outlines +from outlines.samplers import multinomial + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from exllamav2 import( + ExLlamaV2, + ExLlamaV2Config, + ExLlamaV2Cache, + ExLlamaV2Cache_8bit, + ExLlamaV2Cache_Q4, + ExLlamaV2Tokenizer, +) + +from exllamav2.generator import ( + ExLlamaV2StreamingGenerator, + ExLlamaV2Sampler +) +import uuid + +def generate_unique_id(): + return uuid.uuid4() + +class CompletionRequest(BaseModel): + model: str + prompt: Union[str, List[str]] + stop: Optional[Union[str, List[str]]] = None + max_tokens: Optional[int] = 100 # default value of 100 + temperature: Optional[float] = 0.0 # default value of 0.0 + stream: Optional[bool] = False # default value of False + best_of: Optional[int] = 1 + echo: Optional[bool] = False + frequency_penalty: Optional[float] = 0.0 # default value of 0.0 + presence_penalty: Optional[float] = 0.0 # default value of 0.0 + log_probs: Optional[int] = 0 # default value of 0.0 + n: Optional[int] = 1 # default value of 1, batch size + suffix: Optional[str] = None + top_p: Optional[float] = 0.0 # default value of 0.0 + user: Optional[str] = None + +class Message(BaseModel): + role: str + content: str + +class ChatCompletionRequest(BaseModel): + model: str + messages: List[Message] + stop: Optional[Union[str, List[str]]] = None + max_tokens: Optional[int] = 100 # default value of 100 + temperature: Optional[float] = 0.0 # default value of 0.0 + stream: Optional[bool] = False # default value of False + frequency_penalty: Optional[float] = 0.0 # default value of 0.0 + presence_penalty: Optional[float] = 0.0 # default value of 0.0 + log_probs: Optional[int] = 0 # default value of 0.0 + n: Optional[int] = 1 # default value of 1, batch size + top_p: Optional[float] = 0.0 # default value of 0.0 + user: Optional[str] = None + stop_at: Optional[str] = None + outlines_type: Optional[str] = None + choices: Optional[list[str]] = None + regex: Optional[str] = None + json: Optional[str] = None + +#repo_str = 'theprofessor-exl2-speculative' + +parser = argparse.ArgumentParser(description='Run server with specified port.') + +# Add argument for port with default type as integer +parser.add_argument('--port', type=int, help='Port to run the server on.') +parser.add_argument('--use_outlines', action='store_true', help='Use outlines.') +parser.add_argument('--gpu_split', type=str, default="17,19,19,19", help='GPU splits.') +parser.add_argument('--max_context', type=int, default=12288, help='Context length.') +parser.add_argument('--cache_8bit', action='store_true', help='Use 8 bit cache.') +parser.add_argument('--cache_q4', action='store_true', help='Use 4 bit cache.') +parser.add_argument('--repo_str', type=str, default='commandr-exl2', help='The model repository name') + +# Parse the arguments +args = parser.parse_args() +repo_str = 
args.repo_str + +config = configparser.ConfigParser() +config.read('config.ini') + +repo_id = config.get(repo_str, 'repo') +host = config.get('settings', 'host') + +port = args.port if args.port is not None else config.getint('settings', 'port') + +# only allow one client at a time +busy = False +condition = asyncio.Condition() + +config = ExLlamaV2Config() +config.model_dir = repo_id +config.prepare() + +use_dynamic_rope_scaling = False +dynamic_rope_mult = 1.5 +dynamic_rope_offset = 0.0 + +ropescale = 1.0 +max_context = args.max_context +config.scale_alpha_value = ropescale +config.max_seq_len = max_context +base_model_native_max = 4096 +cache_8bit = args.cache_8bit +cache_q4 = args.cache_q4 + +if args.use_outlines: + model = outlines.models.exl2( + config.model_dir, + "cuda", + max_seq_len = config.max_seq_len, + scale_pos_emb = config.scale_pos_emb, + scale_alpha_value = config.scale_alpha_value, + no_flash_attn = config.no_flash_attn, + num_experts_per_token = config.num_experts_per_token, + cache_8bit = cache_8bit, + cache_q4 = cache_q4, + tokenizer_kwargs = {}, + gpu_split = args.gpu_split, # we might be able to make this auto + low_mem = None, + verbose = None + ) +else: + model = ExLlamaV2(config) +print("Loading model: " + repo_id) +#cache = ExLlamaV2Cache(model, lazy=True, max_seq_len = 20480) +#model.load_autosplit(cache) +if not args.use_outlines: + model.load([int(gpu_memory) for gpu_memory in args.gpu_split.split(",")]) + +tokenizer = ExLlamaV2Tokenizer(config) + +# Cache mode + + +settings_proto = ExLlamaV2Sampler.Settings() +settings_proto.temperature = 0 +settings_proto.top_k = 50 +settings_proto.top_p = 0.8 +settings_proto.top_a = 0.0 +settings_proto.token_repetition_penalty = 1.1 +#settings.disallow_tokens(tokenizer, [tokenizer.eos_token_id]) + +# Active sequences and corresponding caches and settings +prompts = queue.Queue() +responses = {} + +input_ids = [] +prompt_length = [] +prompt_ids = [] +streamer = [] +caches = [] +input_prompts = [] +generators = [] +generations = [] +settings = [] +future_tokens = [] +future_logits = [] +sin_arr = [] +cos_arr = [] + +# Global variable for storing partial responses +partial_responses = {} + +max_parallel_seqs = 3 +num_of_gpus = len(args.gpu_split.split(",")) + +print("*** Loaded.. 
now Inference...:") + +app = FastAPI(title="EXL2") + +async def stream_response(prompt_id, timeout=180): + global partial_responses + while True: + await asyncio.sleep(0.05) # Sleep to yield control to the event loop + + # Check if prompt_id exists in partial_responses + if prompt_id in partial_responses: + # Stream partial responses + while partial_responses[prompt_id]: + response_chunk = partial_responses[prompt_id].pop(0) + yield f"data: {json.dumps(response_chunk)}\n\n" + + # Check for final response or timeout + if prompt_id in responses: + final_response = responses.pop(prompt_id) + yield f'data: {{"id":"chatcmpl-{prompt_id}","object":"chat.completion.chunk","created":{int(time.time())},"model":"{repo_str}","choices":[{{"index":0,"delta":{{}},"finish_reason":"stop"}}]}}\n\n' + break + +# Worker thread function +def process_outline_prompts(): + global partial_responses + assert args.use_outlines + assert not use_dynamic_rope_scaling, "Currently ROPE scaling is not supported with outlines" + base_model = model.model + while True: + while not prompts.empty() or len(prompt_ids): + while len(prompt_ids) < max_parallel_seqs and not prompts.empty(): + prompt_id, prompt, max_tokens, stream, temperature, outlines_dict = prompts.get() + print(f"got prompt with outlines dict {outlines_dict}") + sampler = multinomial(top_k=50, top_p=1.0, temperature=temperature) + ids = tokenizer.encode(prompt) + prompt_tokens = ids.shape[-1] + full_tokens = prompt_tokens + max_tokens + print("Processing prompt: " + str(prompt_id) + " Req tokens: " + str(full_tokens)) + # Truncate if new_tokens exceed max_context + if full_tokens > max_context: + # Calculate how many tokens to truncate + ids = tokenizer.encode("Say, 'Prompt exceeds allowed length. Please try again.'") + # Update new_tokens after truncation + prompt_tokens = ids.shape[-1] + full_tokens = prompt_tokens + max_tokens + print("Truncating prompt: " + str(prompt_id) + " Req tokens: " + str(full_tokens)) + if cache_8bit: + ncache = ExLlamaV2Cache_8bit(base_model, lazy=not base_model.loaded, max_seq_len = full_tokens) # (max_seq_len could be different for each cache) + elif cache_q4: + ncache = ExLlamaV2Cache_Q4(base_model, lazy=not base_model.loaded, max_seq_len = full_tokens) + else: + ncache = ExLlamaV2Cache(base_model, lazy=not base_model.loaded, max_seq_len = full_tokens) # (max_seq_len could be different for each cache) + model.cache = ncache + model.past_seq = None + stop_at = outlines_dict["stop_at"] + if outlines_dict["type"] == "choices": + generator = outlines.generate.choice(model, outlines_dict["choices"], sampler=sampler) + elif outlines_dict["type"] == "json": + generator = outlines.generate.json(model, outlines_dict["json"], sampler=sampler) + elif outlines_dict["type"] == "regex": + generator = outlines.generate.regex(model, outlines_dict["regex"], sampler=sampler) + else: + generator = outlines.generate.text(model, sampler=sampler) + generators.append(generator.stream(prompt, stop_at=stop_at, max_tokens=max_tokens)) + prompt_ids.append(prompt_id) + input_prompts.append(prompt) + generations.append("") + caches.append(ncache) + streamer.append(stream) + if(len(prompt_ids)): + eos = [] + for i in range(len(prompt_ids)): + model.cache = caches[i] + is_finished = False + try: + decoded_response_token = next(generators[i]) + generations[i] += decoded_response_token + except StopIteration: + is_finished = True + reason = None + if(streamer[i]): + outcontent = decoded_response_token + if is_finished: + outcontent = "" + reason = "stop" + 
partial_response_data = { + "id": f"chatcmpl-{prompt_ids[i]}", + "object": "chat.completion.chunk", + "created": int(time.time()), + "model": repo_str, + "choices": [ + { + "index": 0, + "delta": { + "content": outcontent + }, + "finish_reason": reason + } + ] + } + + # Initialize a list for new prompt_id or append to existing one + if prompt_ids[i] not in partial_responses: + partial_responses[prompt_ids[i]] = [] + partial_responses[prompt_ids[i]].append(partial_response_data) + + if is_finished: + eos.insert(0, i) + + # Generate and store response + for i in eos: + output = generations[i].strip() + prompt = input_prompts[i] + #output = tokenizer.decode(input_ids[i])[0] + print("-----") + print(output) + generated_text = output + # Calculate token counts + completion_tokens = (tokenizer.encode(generated_text)).shape[-1] + prompt_tokens = (tokenizer.encode(prompt)).shape[-1] + full_tokens = completion_tokens + prompt_tokens + eos_prompt_id = prompt_ids.pop(i) + if(streamer[i]): + ## Generator, yield here.. + partial_response_data = { + "finish_reason": "stop" + } + + responses[eos_prompt_id] = partial_response_data + else:# Construct the response based on the format + response_data = { + "id": f"chatcmpl-{prompt_id}", + "object": "chat.completion", + "created": int(time.time()), + "model": repo_str, + "choices": [{ + "index": 0, + "message": { + "role": "assistant", + "content": generated_text, + }, + "finish_reason": "stop" + }], + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": full_tokens + } + } + responses[eos_prompt_id] = response_data + # Clean up + input_prompts.pop(i) + generations.pop(i) + caches.pop(i) + streamer.pop(i) + + else: + # Sleep for a short duration when there's no work + time.sleep(0.1) # Sleep for 100 milliseconds + + +# Worker thread function +def process_prompts(): + global partial_responses + + while True: + while not prompts.empty() or len(input_ids): + while len(input_ids) < max_parallel_seqs and not prompts.empty(): + prompt_id, prompt, max_tokens, stream, temperature = prompts.get() + ids = tokenizer.encode(prompt) + prompt_tokens = ids.shape[-1] + new_tokens = prompt_tokens + max_tokens + print("Processing prompt: " + str(prompt_id) + " Req tokens: " + str(new_tokens)) + # Truncate if new_tokens exceed max_context + if new_tokens > max_context: + # Calculate how many tokens to truncate + ids = tokenizer.encode("Say, 'Prompt exceeds allowed length. 
Please try again.'") + # Update new_tokens after truncation + prompt_tokens = ids.shape[-1] + new_tokens = prompt_tokens + max_tokens + print("Truncating prompt: " + str(prompt_id) + " Req tokens: " + str(new_tokens)) + prompt_length.append(prompt_tokens) + if use_dynamic_rope_scaling: + # Dynamic Rope Scaling + head_dim = model.config.head_dim + model_base = model.config.rotary_embedding_base + ratio = new_tokens / base_model_native_max + alpha = 1.0 + ropesin = [None] * num_of_gpus + ropecos = [None] * num_of_gpus + if ratio > 1.0: + alpha = ((0.2500*ratio**2) + (0.3500*ratio) + 0.4000)*dynamic_rope_mult + dynamic_rope_offset + print("DYNAMIC ROPE SCALE Alpha: " + str(alpha) + " Ratio: " + str(ratio)) + + for g in range(num_of_gpus): + base = model_base + try: + tensors = model.get_device_tensors(g) + except IndexError: + tensors = None + + if tensors is not None: + if alpha != 1.0: base *= alpha ** (model.config.head_dim / (model.config.head_dim - 2)) + + inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device = "cuda:"+str(g)).float() / head_dim)) + t = torch.arange(model.config.max_seq_len, device = "cuda:"+str(g), dtype = torch.float32) + + freqs = torch.einsum("i,j->ij", t, inv_freq) + emb = torch.cat((freqs, freqs), dim=-1) + + ropesin[g] = emb.sin()[None, None, :, :].half() + ropecos[g] = emb.cos()[None, None, :, :].half() + + tensors.sin = ropesin[g] + tensors.cos = ropecos[g] + + if cache_8bit: + ncache = ExLlamaV2Cache_8bit(model, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) + else: + ncache = ExLlamaV2Cache(model, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) + + #print("Setting up Cache: " + str(prompt_id)) + + if use_dynamic_rope_scaling: + sin_arr.append(ropesin) + cos_arr.append(ropecos) + + model.forward(ids[:, :-1], ncache, preprocess_only = True) + print("Cache setup: " + str(np.shape(ids[:1, :-1]))) + input_ids.append(ids) + prompt_ids.append(prompt_id) + caches.append(ncache) + streamer.append(stream) + future_tokens.append(None) + future_logits.append(None) + #print("Prompt added to queue: " + str(prompt_id)) + + # Create a batch tensor of the last token in each active sequence, forward through the model using the list of + # active caches rather than a single, batched cache. Then sample for each token indidividually with some + # arbitrary stop condition + if(len(input_ids)): + #inputs = torch.cat([x[:, -1:] for x in input_ids], dim = 0) + #logits = model.forward(inputs, caches, input_mask = None).float().cpu() + eos = [] + r = random.random() + for i in range(len(input_ids)): + # if using dynamic rope + if use_dynamic_rope_scaling: + for g in range(num_of_gpus): + if sin_arr[i][g] is not None and cos_arr[i][g] is not None: + tensors = model.get_device_tensors(g) + tensors.sin = sin_arr[i][g] + tensors.cos = cos_arr[i][g] + + logits = model.forward(input_ids[i][:, -1:], caches[i], input_mask = None ).float().cpu() + token, _, _, _, _ = ExLlamaV2Sampler.sample(logits[:, :1, :], settings[i], input_ids[i], r, tokenizer) + + input_ids[i] = torch.cat([input_ids[i], token], dim = 1) + + new_text = tokenizer.decode(input_ids[i][:, -2:-1], decode_special_tokens=False)[0] + new_text2 = tokenizer.decode(input_ids[i][:, -2:], decode_special_tokens=False)[0] + if '�' in new_text: + diff = new_text2 + else: + diff = new_text2[len(new_text):] + + if '�' in diff: + diff = "" + + #print(diff) + reason = None + if(streamer[i]): + ## Generator, yield here.. 
+ outcontent = diff + if diff == """<|im_end|>""": + outcontent = "" + reason = "stop" + partial_response_data = { + "id": f"chatcmpl-{prompt_ids[i]}", + "object": "chat.completion.chunk", + "created": int(time.time()), + "model": repo_str, + "choices": [ + { + "index": 0, + "delta": { + "content": outcontent + }, + "finish_reason": reason + } + ] + } + + # Initialize a list for new prompt_id or append to existing one + if prompt_ids[i] not in partial_responses: + partial_responses[prompt_ids[i]] = [] + partial_responses[prompt_ids[i]].append(partial_response_data) + + if token.item() == tokenizer.eos_token_id or diff == """<|im_end|>""" or caches[i].current_seq_len == caches[i].max_seq_len: + eos.insert(0, i) + + # Generate and store response + for i in eos: + generated_part = input_ids[i][:, prompt_length[i]:] + output = tokenizer.decode(generated_part[0]).strip() + #output = tokenizer.decode(input_ids[i])[0] + print("-----") + print(output) + generated_text = output + # Calculate token counts + completion_tokens = (tokenizer.encode(generated_text)).shape[-1] + prompt_tokens = (tokenizer.encode(prompt)).shape[-1] + full_tokens = completion_tokens + prompt_tokens + eos_prompt_id = prompt_ids.pop(i) + if(streamer[i]): + ## Generator, yield here.. + partial_response_data = { + "finish_reason": "stop" + } + + responses[eos_prompt_id] = partial_response_data + else:# Construct the response based on the format + response_data = { + "id": f"chatcmpl-{prompt_id}", + "object": "chat.completion", + "created": int(time.time()), + "model": repo_str, + "choices": [{ + "index": 0, + "message": { + "role": "assistant", + "content": generated_text, + }, + "finish_reason": "stop" + }], + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": full_tokens + } + } + responses[eos_prompt_id] = response_data + + # Clean up + input_ids.pop(i) + caches.pop(i) + settings.pop(i) + prompt_length.pop(i) + streamer.pop(i) + if use_dynamic_rope_scaling: + cos_arr.pop(i) + sin_arr.pop(i) + + else: + # Sleep for a short duration when there's no work + time.sleep(0.1) # Sleep for 100 milliseconds + + + + + +# Start worker thread +worker = Thread(target=process_prompts if not args.use_outlines else process_outline_prompts) +worker.start() + + +async def format_prompt(messages): + formatted_prompt = "" + for message in messages: + if message.role == "system": + formatted_prompt += f"{message.content}\n\n" + elif message.role == "user": + formatted_prompt += f"### User:\n{message.content}\n\n" + elif message.role == "assistant": + formatted_prompt += f"### Assistant:\n{message.content}\n\n" + # Add the final "### Assistant:\n" to prompt for the next response + formatted_prompt += "### Assistant:\n" + return formatted_prompt + +async def format_prompt_yi(messages): + formatted_prompt = "" + system_message_found = False + + # Check for a system message first + for message in messages: + if message.role == "system": + system_message_found = True + break + + # If no system message was found, prepend a default one + if not system_message_found: + formatted_prompt = "<|im_start|>system\nYou are a helpful AI assistant.<|im_end|>\n" + for message in messages: + if message.role == "system": + formatted_prompt += f"<|im_start|>system\n{message.content}<|im_end|>\n" + elif message.role == "user": + formatted_prompt += f"<|im_start|>user\n{message.content}<|im_end|>\n" + elif message.role == "assistant": + formatted_prompt += f"<|im_start|>assistant\n{message.content}<|im_end|>\n" + # 
Add the final "### Assistant:\n" to prompt for the next response + formatted_prompt += "<|im_start|>assistant\n" + return formatted_prompt + +async def format_prompt_nous(messages): + formatted_prompt = "" + for message in messages: + if message.role == "system": + formatted_prompt += f"{message.content}\n" + elif message.role == "user": + formatted_prompt += f"USER: {message.content}\n" + elif message.role == "assistant": + formatted_prompt += f"ASSISTANT: {message.content}\n" + # Add the final "### Assistant:\n" to prompt for the next response + formatted_prompt += "ASSISTANT: " + return formatted_prompt + +async def format_prompt_tess(messages): + formatted_prompt = "" + for message in messages: + if message.role == "system": + formatted_prompt += f"SYSTEM: {message.content}\n" + elif message.role == "user": + formatted_prompt += f"USER: {message.content}\n" + elif message.role == "assistant": + formatted_prompt += f"ASSISTANT: {message.content}\n" + # Add the final "### Assistant:\n" to prompt for the next response + formatted_prompt += "ASSISTANT: " + return formatted_prompt + +async def format_prompt_code(messages): + formatted_prompt = "" + for message in messages: + if message.role == "system": + formatted_prompt += f"### System Prompt\nYou are an intelligent programming assistant.\n\n" + elif message.role == "user": + formatted_prompt += f"### User Message\n{message.content}\n\n" + elif message.role == "assistant": + formatted_prompt += f"### Assistant\n{message.content}\n\n" + # Add the final "### Assistant" with ellipsis to prompt for the next response + formatted_prompt += "### Assistant\n..." + return formatted_prompt + +async def format_prompt_zephyr(messages): + formatted_prompt = "" + for message in messages: + if message.role == "system": + formatted_prompt += f"<|system|>\n{message.content}\n" + elif message.role == "user": + formatted_prompt += f"<|user|>\n{message.content}\n" + elif message.role == "assistant": + formatted_prompt += f"<|assistant|>\n{message.content}\n" + # Add the final "### Assistant:\n" to prompt for the next response + formatted_prompt += "<|assistant|>\n" + return formatted_prompt + +async def format_prompt_starling(messages): + formatted_prompt = "" + system_message = "" + for message in messages: + if message.role == "system": + # Save system message to prepend to the first user message + system_message += f"{message.content}\n\n" + elif message.role == "user": + # Prepend system message if it exists + if system_message: + formatted_prompt += f"GPT4 Correct User: {system_message}{message.content}<|end_of_turn|>" + system_message = "" # Clear system message after prepending + else: + formatted_prompt += f"GPT4 Correct User: {message.content}<|end_of_turn|>" + elif message.role == "assistant": + formatted_prompt += f"GPT4 Correct Assistant: {message.content}<|end_of_turn|>" # Prep for user follow-up + formatted_prompt += "GPT4 Correct Assistant: \n\n" + return formatted_prompt + +async def format_prompt_mixtral(messages): + formatted_prompt = " " + system_message = "" + for message in messages: + if message.role == "system": + # Save system message to prepend to the first user message + system_message += f"{message.content}\n\n" + elif message.role == "user": + # Prepend system message if it exists + if system_message: + formatted_prompt += f"[INST] {system_message}{message.content} [/INST] " + system_message = "" # Clear system message after prepending + else: + formatted_prompt += f"[INST] {message.content} [/INST] " + elif message.role == 
"assistant": + formatted_prompt += f" {message.content} " # Prep for user follow-up + return formatted_prompt + + +async def format_prompt_commandr(messages): + formatted_prompt = "" + system_message_found = False + + # Check for a system message first + for message in messages: + if message.role == "system": + system_message_found = True + break + + # If no system message was found, prepend a default one + if not system_message_found: + formatted_prompt += f"<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{message.content}<|END_OF_TURN_TOKEN|>" + + for message in messages: + if message.role == "system": + formatted_prompt += f"<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{message.content}<|END_OF_TURN_TOKEN|>" + elif message.role == "user": + formatted_prompt += f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{message.content}<|END_OF_TURN_TOKEN|>" + elif message.role == "assistant": + formatted_prompt += f"<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{message.content}<|END_OF_TURN_TOKEN|>" + # Add the final "### Assistant:\n" to prompt for the next response + formatted_prompt += "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" + return formatted_prompt + + +@app.post('/v1/chat/completions') +async def mainchat(request: ChatCompletionRequest): + try: + prompt = '' + if repo_str == 'Phind-CodeLlama-34B-v2': + prompt = await format_prompt_code(request.messages) + elif repo_str == 'zephyr-7b-beta': + prompt = await format_prompt_zephyr(request.messages) + elif repo_str == 'Starling-LM-7B-alpha': + prompt = await format_prompt_starling(request.messages) + elif repo_str == 'Mixtral-8x7B-Instruct-v0.1-GPTQ': + prompt = await format_prompt_mixtral(request.messages) + elif repo_str == 'Yi-34B-Chat-GPTQ' or repo_str == 'Nous-Hermes-2-Yi-34B-GPTQ' or repo_str == 'theprofessor-exl2-speculative' or repo_str == 'dbrx-instruct-exl2': + prompt = await format_prompt_yi(request.messages) + elif repo_str == 'Nous-Capybara-34B-GPTQ' or repo_str == 'goliath-120b-GPTQ' or repo_str == 'goliath-120b-exl2' or repo_str == 'goliath-120b-exl2-rpcal': + prompt = await format_prompt_nous(request.messages) + elif repo_str == 'tess-xl-exl2' or repo_str == 'tess-xl-exl2-speculative': + prompt = await format_prompt_tess(request.messages) + elif repo_str == 'commandr-exl2' or repo_str == 'commandr-exl2-speculative': + prompt = await format_prompt_commandr(request.messages) + else: + prompt = await format_prompt(request.messages) + print(prompt) + + timeout = 180 # seconds + start_time = time.time() + prompt_id = generate_unique_id() # Replace with a function to generate unique IDs + outlines_dict = {} + if request.stop_at is not None: + outlines_dict["stop_at"] = request.stop_at + if request.outlines_type is not None: + outlines_dict["type"] = request.outlines_type + if outlines_dict["type"] == "choices": + assert request.choices is not None + outlines_dict["choices"] = request.choices + elif outlines_dict["type"] == "json": + assert request.json is not None + outlines_dict["json"] = request.json + elif outlines_dict["type"] == "regex": + assert request.regex is not None + outlines_dict["regex"] = request.regex + else: + assert (outlines_dict["type"] == "text") or not args.outlines + if not args.use_outlines: + prompts.put((prompt_id, prompt, request.max_tokens, request.stream, request.temperature)) + else: + prompts.put((prompt_id, prompt, request.max_tokens, request.stream, request.temperature, outlines_dict)) + + + if request.stream: + #response = StreamingResponse(streaming_request(prompt, request.max_tokens, tempmodel=repo_str, 
response_format='chat_completion'), media_type="text/event-stream") + return StreamingResponse(stream_response(prompt_id), media_type="text/event-stream") + else: + #response_data = non_streaming_request(prompt, request.max_tokens, tempmodel=repo_str, response_format='chat_completion') + #response = response_data # This will return a JSON response + while prompt_id not in responses: + await asyncio.sleep(0.1) # Sleep to yield control to the event loop + if time.time() - start_time > timeout: + return {"error": "Response timeout"} + + return responses.pop(prompt_id) + + except Exception as e: + print(traceback.format_exc()) + raise HTTPException(status_code=500, detail=str(e)) + + return response + + + + +@app.get('/ping') +async def get_status(): + return {"ping": sum(prompt_length)} + +@app.get("/nvidia-smi") +async def get_nvidia_smi(): + # Execute the nvidia-smi command + result = subprocess.run( + ["nvidia-smi", "--query-gpu=utilization.gpu,memory.used,memory.total", "--format=csv,noheader"], + capture_output=True, text=True + ) + nvidia_smi_output = result.stdout.strip() # Remove any extra whitespace + # Split the output by lines and then by commas + gpu_data = [] + for line in nvidia_smi_output.split("\n"): + utilization, memory_used, memory_total = line.split(", ") + # Strip the '%' and 'MiB' and convert to appropriate types + utilization = float(utilization.strip(' %')) + memory_used = int(memory_used.strip(' MiB')) + memory_total = int(memory_total.strip(' MiB')) + gpu_data.append({ + "utilization": utilization, + "memory_used": memory_used, + "memory_total": memory_total + }) + return gpu_data + + +if __name__ == "__main__": + import uvicorn + + uvicorn.run(app, host=host, port=port, log_level="debug")
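
Example usage (not part of the patches above): the request fields that llm_exl2_client_multi_outlines.py adds to ChatCompletionRequest (outlines_type, choices, regex, json, stop_at) are easiest to see from the client side. Below is a minimal sketch of two calls against the new server, assuming it was started with --use_outlines and is reachable at localhost:8000; the host, port, model string, prompts, and constraint values are illustrative placeholders, not values taken from the patches. Both payloads always supply outlines_type and stop_at, since mainchat only copies those fields into outlines_dict when they are not None while process_outline_prompts reads outlines_dict["type"] and outlines_dict["stop_at"] without defaults.

# sketch_outlines_client.py -- illustrative only; assumes the server from
# llm_exl2_client_multi_outlines.py is running with --use_outlines.
import requests

BASE_URL = "http://localhost:8000"  # assumed host/port (see config.ini / --port)
MODEL = "commandr-exl2"             # matches the default --repo_str; used only as a label

# 1) Constrain the completion to a fixed set of choices.
choice_payload = {
    "model": MODEL,
    "messages": [
        {"role": "system", "content": "You are a sentiment classifier."},
        {"role": "user", "content": "I loved this movie!"},
    ],
    "max_tokens": 10,
    "temperature": 0.7,
    "outlines_type": "choices",
    "choices": ["positive", "negative", "neutral"],
    "stop_at": "\n",
}
print(requests.post(f"{BASE_URL}/v1/chat/completions", json=choice_payload).json())

# 2) Constrain the completion with a regular expression (a five-digit number here).
regex_payload = {
    "model": MODEL,
    "messages": [{"role": "user", "content": "Give me a ZIP code."}],
    "max_tokens": 10,
    "temperature": 0.7,
    "outlines_type": "regex",
    "regex": r"\d{5}",
    "stop_at": "\n",
}
print(requests.post(f"{BASE_URL}/v1/chat/completions", json=regex_payload).json())

With "stream": true in either payload, the same request would instead be served through stream_response as server-sent events rather than a single JSON body.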