diff --git a/llm_exl2_client_multi_outlines.py b/llm_exl2_client_multi_outlines.py
new file mode 100644
index 0000000..fd734ca
--- /dev/null
+++ b/llm_exl2_client_multi_outlines.py
@@ -0,0 +1,819 @@
+import asyncio
+import json
+import os
+import logging
+import time
+import configparser
+import argparse
+import tiktoken
+import torch
+import random
+from typing import AsyncIterable, List, Generator, Union, Optional
+import traceback
+from typing import Mapping
+import requests
+import sseclient
+import subprocess
+import re
+
+from fastapi import FastAPI, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import StreamingResponse
+from pydantic import BaseModel
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextStreamer, TextIteratorStreamer
+from threading import Thread
+import queue
+import numpy as np
+
+import sys, os
+import outlines
+from outlines.samplers import multinomial
+
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from exllamav2 import(
+    ExLlamaV2,
+    ExLlamaV2Config,
+    ExLlamaV2Cache,
+    ExLlamaV2Cache_8bit,
+    ExLlamaV2Cache_Q4,
+    ExLlamaV2Tokenizer,
+)
+
+from exllamav2.generator import (
+    ExLlamaV2StreamingGenerator,
+    ExLlamaV2Sampler
+)
+import uuid
+
+def generate_unique_id():
+    return uuid.uuid4()
+
+class CompletionRequest(BaseModel):
+    model: str
+    prompt: Union[str, List[str]]
+    stop: Optional[Union[str, List[str]]] = None
+    max_tokens: Optional[int] = 100  # default value of 100
+    temperature: Optional[float] = 0.0  # default value of 0.0
+    stream: Optional[bool] = False  # default value of False
+    best_of: Optional[int] = 1
+    echo: Optional[bool] = False
+    frequency_penalty: Optional[float] = 0.0  # default value of 0.0
+    presence_penalty: Optional[float] = 0.0  # default value of 0.0
+    log_probs: Optional[int] = 0  # default value of 0
+    n: Optional[int] = 1  # default value of 1, batch size
+    suffix: Optional[str] = None
+    top_p: Optional[float] = 0.0  # default value of 0.0
+    user: Optional[str] = None
+
+class Message(BaseModel):
+    role: str
+    content: str
+
+class ChatCompletionRequest(BaseModel):
+    model: str
+    messages: List[Message]
+    stop: Optional[Union[str, List[str]]] = None
+    max_tokens: Optional[int] = 100  # default value of 100
+    temperature: Optional[float] = 0.0  # default value of 0.0
+    stream: Optional[bool] = False  # default value of False
+    frequency_penalty: Optional[float] = 0.0  # default value of 0.0
+    presence_penalty: Optional[float] = 0.0  # default value of 0.0
+    log_probs: Optional[int] = 0  # default value of 0
+    n: Optional[int] = 1  # default value of 1, batch size
+    top_p: Optional[float] = 0.0  # default value of 0.0
+    user: Optional[str] = None
+    stop_at: Optional[str] = None
+    outlines_type: Optional[str] = None
+    choices: Optional[list[str]] = None
+    regex: Optional[str] = None
+    json: Optional[str] = None
+
+#repo_str = 'theprofessor-exl2-speculative'
+
+parser = argparse.ArgumentParser(description='Run server with specified port.')
+
+# Add argument for port with default type as integer
+parser.add_argument('--port', type=int, help='Port to run the server on.')
+parser.add_argument('--use_outlines', action='store_true', help='Use outlines.')
+parser.add_argument('--gpu_split', type=str, default="17,19,19,19", help='GPU splits.')
+parser.add_argument('--max_context', type=int, default=12288, help='Context length.')
+parser.add_argument('--cache_8bit', action='store_true', help='Use 8 bit cache.')
+parser.add_argument('--cache_q4', action='store_true', help='Use 4 bit cache.')
+parser.add_argument('--repo_str', type=str, default='commandr-exl2', help='The model repository name')
+
+# Parse the arguments
+args = parser.parse_args()
+repo_str = args.repo_str
+
+config = configparser.ConfigParser()
+config.read('config.ini')
+
+repo_id = config.get(repo_str, 'repo')
+host = config.get('settings', 'host')
+
+port = args.port if args.port is not None else config.getint('settings', 'port')
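+
+# Illustrative config.ini layout (the code above only reads '<repo_str>/repo',
+# 'settings/host' and 'settings/port'; the values shown here are assumptions):
+#
+#   [settings]
+#   host = 0.0.0.0
+#   port = 8000
+#
+#   [commandr-exl2]
+#   repo = /path/to/commandr-exl2/model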
+
+# only allow one client at a time
+busy = False
+condition = asyncio.Condition()
+
+config = ExLlamaV2Config()
+config.model_dir = repo_id
+config.prepare()
+
+use_dynamic_rope_scaling = False
+dynamic_rope_mult = 1.5
+dynamic_rope_offset = 0.0
+
+ropescale = 1.0
+max_context = args.max_context
+config.scale_alpha_value = ropescale
+config.max_seq_len = max_context
+base_model_native_max = 4096
+cache_8bit = args.cache_8bit
+cache_q4 = args.cache_q4
+
+if args.use_outlines:
+    model = outlines.models.exl2(
+        config.model_dir,
+        "cuda",
+        max_seq_len = config.max_seq_len,
+        scale_pos_emb = config.scale_pos_emb,
+        scale_alpha_value = config.scale_alpha_value,
+        no_flash_attn = config.no_flash_attn,
+        num_experts_per_token = config.num_experts_per_token,
+        cache_8bit = cache_8bit,
+        cache_q4 = cache_q4,
+        tokenizer_kwargs = {},
+        gpu_split = args.gpu_split,  # we might be able to make this auto
+        low_mem = None,
+        verbose = None
+    )
+else:
+    model = ExLlamaV2(config)
+print("Loading model: " + repo_id)
+#cache = ExLlamaV2Cache(model, lazy=True, max_seq_len = 20480)
+#model.load_autosplit(cache)
+if not args.use_outlines:
+    model.load([int(gpu_memory) for gpu_memory in args.gpu_split.split(",")])
+
+tokenizer = ExLlamaV2Tokenizer(config)
+
+# Cache mode
+
+settings_proto = ExLlamaV2Sampler.Settings()
+settings_proto.temperature = 0
+settings_proto.top_k = 50
+settings_proto.top_p = 0.8
+settings_proto.top_a = 0.0
+settings_proto.token_repetition_penalty = 1.1
+#settings.disallow_tokens(tokenizer, [tokenizer.eos_token_id])
+
+# Active sequences and corresponding caches and settings
+prompts = queue.Queue()
+responses = {}
+
+input_ids = []
+prompt_length = []
+prompt_ids = []
+streamer = []
+caches = []
+input_prompts = []
+generators = []
+generations = []
+settings = []
+future_tokens = []
+future_logits = []
+sin_arr = []
+cos_arr = []
+
+# Global variable for storing partial responses
+partial_responses = {}
+
+max_parallel_seqs = 3
+num_of_gpus = len(args.gpu_split.split(","))
+
+print("*** Loaded.. now Inference...:")
+
+app = FastAPI(title="EXL2")
+
+async def stream_response(prompt_id, timeout=180):
+    global partial_responses
+    while True:
+        await asyncio.sleep(0.05)  # Sleep to yield control to the event loop
+
+        # Check if prompt_id exists in partial_responses
+        if prompt_id in partial_responses:
+            # Stream partial responses
+            while partial_responses[prompt_id]:
+                response_chunk = partial_responses[prompt_id].pop(0)
+                yield f"data: {json.dumps(response_chunk)}\n\n"
+
+        # Check for final response or timeout
+        if prompt_id in responses:
+            final_response = responses.pop(prompt_id)
+            yield f'data: {{"id":"chatcmpl-{prompt_id}","object":"chat.completion.chunk","created":{int(time.time())},"model":"{repo_str}","choices":[{{"index":0,"delta":{{}},"finish_reason":"stop"}}]}}\n\n'
+            break
+
+# Worker thread function
+def process_outline_prompts():
+    global partial_responses
+    assert args.use_outlines
+    assert not use_dynamic_rope_scaling, "Currently ROPE scaling is not supported with outlines"
+    base_model = model.model
+    while True:
+        while not prompts.empty() or len(prompt_ids):
+            while len(prompt_ids) < max_parallel_seqs and not prompts.empty():
+                prompt_id, prompt, max_tokens, stream, temperature, outlines_dict = prompts.get()
+                print(f"got prompt with outlines dict {outlines_dict}")
+                sampler = multinomial(top_k=50, top_p=1.0, temperature=temperature)
+                ids = tokenizer.encode(prompt)
+                prompt_tokens = ids.shape[-1]
+                full_tokens = prompt_tokens + max_tokens
+                print("Processing prompt: " + str(prompt_id) + " Req tokens: " + str(full_tokens))
+                # Truncate if the requested tokens exceed max_context
+                if full_tokens > max_context:
+                    # Replace the prompt with a short error message instead
+                    ids = tokenizer.encode("Say, 'Prompt exceeds allowed length. Please try again.'")
+                    # Update token counts after truncation
+                    prompt_tokens = ids.shape[-1]
+                    full_tokens = prompt_tokens + max_tokens
+                    print("Truncating prompt: " + str(prompt_id) + " Req tokens: " + str(full_tokens))
+                if cache_8bit:
+                    ncache = ExLlamaV2Cache_8bit(base_model, lazy=not base_model.loaded, max_seq_len = full_tokens)  # (max_seq_len could be different for each cache)
+                elif cache_q4:
+                    ncache = ExLlamaV2Cache_Q4(base_model, lazy=not base_model.loaded, max_seq_len = full_tokens)
+                else:
+                    ncache = ExLlamaV2Cache(base_model, lazy=not base_model.loaded, max_seq_len = full_tokens)  # (max_seq_len could be different for each cache)
+                model.cache = ncache
+                model.past_seq = None
+                stop_at = outlines_dict.get("stop_at")  # may be absent; None means no extra stop string
+                if outlines_dict.get("type") == "choices":
+                    generator = outlines.generate.choice(model, outlines_dict["choices"], sampler=sampler)
+                elif outlines_dict.get("type") == "json":
+                    generator = outlines.generate.json(model, outlines_dict["json"], sampler=sampler)
+                elif outlines_dict.get("type") == "regex":
+                    generator = outlines.generate.regex(model, outlines_dict["regex"], sampler=sampler)
+                else:
+                    generator = outlines.generate.text(model, sampler=sampler)
+                generators.append(generator.stream(prompt, stop_at=stop_at, max_tokens=max_tokens))
+                prompt_ids.append(prompt_id)
+                input_prompts.append(prompt)
+                generations.append("")
+                caches.append(ncache)
+                streamer.append(stream)
+            if(len(prompt_ids)):
+                eos = []
+                for i in range(len(prompt_ids)):
+                    model.cache = caches[i]
+                    is_finished = False
+                    decoded_response_token = ""  # default in case the generator is already exhausted
+                    try:
+                        decoded_response_token = next(generators[i])
+                        generations[i] += decoded_response_token
+                    except StopIteration:
+                        is_finished = True
+                    reason = None
+                    if(streamer[i]):
+                        outcontent = decoded_response_token
+                        if is_finished:
+                            outcontent = ""
+                            reason = "stop"
+                        partial_response_data = {
+                            "id": f"chatcmpl-{prompt_ids[i]}",
+                            "object": "chat.completion.chunk",
+                            "created": int(time.time()),
+                            "model": repo_str,
+                            "choices": [
+                                {
+                                    "index": 0,
+                                    "delta": {
+                                        "content": outcontent
+                                    },
+                                    "finish_reason": reason
+                                }
+                            ]
+                        }
+
+                        # Initialize a list for new prompt_id or append to existing one
+                        if prompt_ids[i] not in partial_responses:
+                            partial_responses[prompt_ids[i]] = []
+                        partial_responses[prompt_ids[i]].append(partial_response_data)
+
+                    if is_finished:
+                        eos.insert(0, i)
+
+                # Generate and store response
+                for i in eos:
+                    output = generations[i].strip()
+                    prompt = input_prompts[i]
+                    #output = tokenizer.decode(input_ids[i])[0]
+                    print("-----")
+                    print(output)
+                    generated_text = output
+                    # Calculate token counts
+                    completion_tokens = (tokenizer.encode(generated_text)).shape[-1]
+                    prompt_tokens = (tokenizer.encode(prompt)).shape[-1]
+                    full_tokens = completion_tokens + prompt_tokens
+                    eos_prompt_id = prompt_ids.pop(i)
+                    if(streamer[i]):
+                        ## Generator, yield here..
+                        partial_response_data = {
+                            "finish_reason": "stop"
+                        }
+
+                        responses[eos_prompt_id] = partial_response_data
+                    else:  # Construct the response based on the format
+                        response_data = {
+                            "id": f"chatcmpl-{eos_prompt_id}",
+                            "object": "chat.completion",
+                            "created": int(time.time()),
+                            "model": repo_str,
+                            "choices": [{
+                                "index": 0,
+                                "message": {
+                                    "role": "assistant",
+                                    "content": generated_text,
+                                },
+                                "finish_reason": "stop"
+                            }],
+                            "usage": {
+                                "prompt_tokens": prompt_tokens,
+                                "completion_tokens": completion_tokens,
+                                "total_tokens": full_tokens
+                            }
+                        }
+                        responses[eos_prompt_id] = response_data
+                    # Clean up
+                    input_prompts.pop(i)
+                    generations.pop(i)
+                    caches.pop(i)
+                    streamer.pop(i)
+
+        else:
+            # Sleep for a short duration when there's no work
+            time.sleep(0.1)  # Sleep for 100 milliseconds
+
+
+# Worker thread function
+def process_prompts():
+    global partial_responses
+
+    while True:
+        while not prompts.empty() or len(input_ids):
+            while len(input_ids) < max_parallel_seqs and not prompts.empty():
+                prompt_id, prompt, max_tokens, stream, temperature = prompts.get()
+                ids = tokenizer.encode(prompt)
+                prompt_tokens = ids.shape[-1]
+                new_tokens = prompt_tokens + max_tokens
+                print("Processing prompt: " + str(prompt_id) + " Req tokens: " + str(new_tokens))
+                # Truncate if new_tokens exceed max_context
+                if new_tokens > max_context:
+                    # Replace the prompt with a short error message instead
+                    ids = tokenizer.encode("Say, 'Prompt exceeds allowed length. Please try again.'")
+                    # Update new_tokens after truncation
+                    prompt_tokens = ids.shape[-1]
+                    new_tokens = prompt_tokens + max_tokens
+                    print("Truncating prompt: " + str(prompt_id) + " Req tokens: " + str(new_tokens))
+                prompt_length.append(prompt_tokens)
+                if use_dynamic_rope_scaling:
+                    # Dynamic Rope Scaling
+                    head_dim = model.config.head_dim
+                    model_base = model.config.rotary_embedding_base
+                    ratio = new_tokens / base_model_native_max
+                    alpha = 1.0
+                    ropesin = [None] * num_of_gpus
+                    ropecos = [None] * num_of_gpus
+                    if ratio > 1.0:
+                        alpha = ((0.2500*ratio**2) + (0.3500*ratio) + 0.4000)*dynamic_rope_mult + dynamic_rope_offset
+                        print("DYNAMIC ROPE SCALE Alpha: " + str(alpha) + " Ratio: " + str(ratio))
+
+                    for g in range(num_of_gpus):
+                        base = model_base
+                        try:
+                            tensors = model.get_device_tensors(g)
+                        except IndexError:
+                            tensors = None
+
+                        if tensors is not None:
+                            if alpha != 1.0: base *= alpha ** (model.config.head_dim / (model.config.head_dim - 2))
+
+                            inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device = "cuda:"+str(g)).float() / head_dim))
+                            t = torch.arange(model.config.max_seq_len, device = "cuda:"+str(g), dtype = torch.float32)
+
+                            freqs = torch.einsum("i,j->ij", t, inv_freq)
+                            emb = torch.cat((freqs, freqs), dim=-1)
+
+                            ropesin[g] = emb.sin()[None, None, :, :].half()
+                            ropecos[g] = emb.cos()[None, None, :, :].half()
+
+                            tensors.sin = ropesin[g]
+                            tensors.cos = ropecos[g]
+
+                if cache_8bit:
+                    ncache = ExLlamaV2Cache_8bit(model, max_seq_len = new_tokens)  # (max_seq_len could be different for each cache)
+                else:
+                    ncache = ExLlamaV2Cache(model, max_seq_len = new_tokens)  # (max_seq_len could be different for each cache)
+
+                #print("Setting up Cache: " + str(prompt_id))
+
+                if use_dynamic_rope_scaling:
+                    sin_arr.append(ropesin)
+                    cos_arr.append(ropecos)
+
+                model.forward(ids[:, :-1], ncache, preprocess_only = True)
+                print("Cache setup: " + str(np.shape(ids[:1, :-1])))
+                input_ids.append(ids)
+                prompt_ids.append(prompt_id)
+                caches.append(ncache)
+                streamer.append(stream)
+                settings.append(settings_proto.clone())  # per-sequence sampler settings, popped again on completion
+                future_tokens.append(None)
+                future_logits.append(None)
+                #print("Prompt added to queue: " + str(prompt_id))
+
+            # Create a batch tensor of the last token in each active sequence, forward through the model using the list of
+            # active caches rather than a single, batched cache. Then sample for each token individually with some
+            # arbitrary stop condition
+            if(len(input_ids)):
+                #inputs = torch.cat([x[:, -1:] for x in input_ids], dim = 0)
+                #logits = model.forward(inputs, caches, input_mask = None).float().cpu()
+                eos = []
+                r = random.random()
+                for i in range(len(input_ids)):
+                    # if using dynamic rope
+                    if use_dynamic_rope_scaling:
+                        for g in range(num_of_gpus):
+                            if sin_arr[i][g] is not None and cos_arr[i][g] is not None:
+                                tensors = model.get_device_tensors(g)
+                                tensors.sin = sin_arr[i][g]
+                                tensors.cos = cos_arr[i][g]
+
+                    logits = model.forward(input_ids[i][:, -1:], caches[i], input_mask = None).float().cpu()
+                    token, _, _, _, _ = ExLlamaV2Sampler.sample(logits[:, :1, :], settings[i], input_ids[i], r, tokenizer)
+
+                    input_ids[i] = torch.cat([input_ids[i], token], dim = 1)
+
+                    new_text = tokenizer.decode(input_ids[i][:, -2:-1], decode_special_tokens=False)[0]
+                    new_text2 = tokenizer.decode(input_ids[i][:, -2:], decode_special_tokens=False)[0]
+                    if '�' in new_text:
+                        diff = new_text2
+                    else:
+                        diff = new_text2[len(new_text):]
+
+                    if '�' in diff:
+                        diff = ""
+
+                    #print(diff)
+                    reason = None
+                    if(streamer[i]):
+                        ## Generator, yield here..
+                        outcontent = diff
+                        if diff == """<|im_end|>""":
+                            outcontent = ""
+                            reason = "stop"
+                        partial_response_data = {
+                            "id": f"chatcmpl-{prompt_ids[i]}",
+                            "object": "chat.completion.chunk",
+                            "created": int(time.time()),
+                            "model": repo_str,
+                            "choices": [
+                                {
+                                    "index": 0,
+                                    "delta": {
+                                        "content": outcontent
+                                    },
+                                    "finish_reason": reason
+                                }
+                            ]
+                        }
+
+                        # Initialize a list for new prompt_id or append to existing one
+                        if prompt_ids[i] not in partial_responses:
+                            partial_responses[prompt_ids[i]] = []
+                        partial_responses[prompt_ids[i]].append(partial_response_data)
+
+                    if token.item() == tokenizer.eos_token_id or diff == """<|im_end|>""" or caches[i].current_seq_len == caches[i].max_seq_len:
+                        eos.insert(0, i)
+
+                # Generate and store response
+                for i in eos:
+                    generated_part = input_ids[i][:, prompt_length[i]:]
+                    output = tokenizer.decode(generated_part[0]).strip()
+                    #output = tokenizer.decode(input_ids[i])[0]
+                    print("-----")
+                    print(output)
+                    generated_text = output
+                    # Calculate token counts
+                    completion_tokens = (tokenizer.encode(generated_text)).shape[-1]
+                    prompt_tokens = prompt_length[i]
+                    full_tokens = completion_tokens + prompt_tokens
+                    eos_prompt_id = prompt_ids.pop(i)
+                    if(streamer[i]):
+                        ## Generator, yield here..
+                        partial_response_data = {
+                            "finish_reason": "stop"
+                        }
+
+                        responses[eos_prompt_id] = partial_response_data
+                    else:  # Construct the response based on the format
+                        response_data = {
+                            "id": f"chatcmpl-{eos_prompt_id}",
+                            "object": "chat.completion",
+                            "created": int(time.time()),
+                            "model": repo_str,
+                            "choices": [{
+                                "index": 0,
+                                "message": {
+                                    "role": "assistant",
+                                    "content": generated_text,
+                                },
+                                "finish_reason": "stop"
+                            }],
+                            "usage": {
+                                "prompt_tokens": prompt_tokens,
+                                "completion_tokens": completion_tokens,
+                                "total_tokens": full_tokens
+                            }
+                        }
+                        responses[eos_prompt_id] = response_data
+
+                    # Clean up
+                    input_ids.pop(i)
+                    caches.pop(i)
+                    settings.pop(i)
+                    prompt_length.pop(i)
+                    streamer.pop(i)
+                    if use_dynamic_rope_scaling:
+                        cos_arr.pop(i)
+                        sin_arr.pop(i)
+
+        else:
+            # Sleep for a short duration when there's no work
+            time.sleep(0.1)  # Sleep for 100 milliseconds
+
+
+# Start worker thread
+worker = Thread(target=process_prompts if not args.use_outlines else process_outline_prompts)
+worker.start()
+
+
+async def format_prompt(messages):
+    formatted_prompt = ""
+    for message in messages:
+        if message.role == "system":
+            formatted_prompt += f"{message.content}\n\n"
+        elif message.role == "user":
+            formatted_prompt += f"### User:\n{message.content}\n\n"
+        elif message.role == "assistant":
+            formatted_prompt += f"### Assistant:\n{message.content}\n\n"
+    # Add the final "### Assistant:\n" to prompt for the next response
+    formatted_prompt += "### Assistant:\n"
+    return formatted_prompt
+
+async def format_prompt_yi(messages):
+    formatted_prompt = ""
+    system_message_found = False
+
+    # Check for a system message first
+    for message in messages:
+        if message.role == "system":
+            system_message_found = True
+            break
+
+    # If no system message was found, prepend a default one
+    if not system_message_found:
+        formatted_prompt = "<|im_start|>system\nYou are a helpful AI assistant.<|im_end|>\n"
+    for message in messages:
+        if message.role == "system":
+            formatted_prompt += f"<|im_start|>system\n{message.content}<|im_end|>\n"
+        elif message.role == "user":
+            formatted_prompt += f"<|im_start|>user\n{message.content}<|im_end|>\n"
+        elif message.role == "assistant":
+            formatted_prompt += f"<|im_start|>assistant\n{message.content}<|im_end|>\n"
+    # Add the final assistant header to prompt for the next response
+    formatted_prompt += "<|im_start|>assistant\n"
+    return formatted_prompt
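+
+# For reference, format_prompt_yi above renders a single user turn (with no system
+# message supplied) roughly as follows before generation starts:
+#
+#   <|im_start|>system
+#   You are a helpful AI assistant.<|im_end|>
+#   <|im_start|>user
+#   Hello!<|im_end|>
+#   <|im_start|>assistant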
+
+async def format_prompt_nous(messages):
+    formatted_prompt = ""
+    for message in messages:
+        if message.role == "system":
+            formatted_prompt += f"{message.content}\n"
+        elif message.role == "user":
+            formatted_prompt += f"USER: {message.content}\n"
+        elif message.role == "assistant":
+            formatted_prompt += f"ASSISTANT: {message.content}\n"
+    # Add the final "ASSISTANT: " to prompt for the next response
+    formatted_prompt += "ASSISTANT: "
+    return formatted_prompt
+
+async def format_prompt_tess(messages):
+    formatted_prompt = ""
+    for message in messages:
+        if message.role == "system":
+            formatted_prompt += f"SYSTEM: {message.content}\n"
+        elif message.role == "user":
+            formatted_prompt += f"USER: {message.content}\n"
+        elif message.role == "assistant":
+            formatted_prompt += f"ASSISTANT: {message.content}\n"
+    # Add the final "ASSISTANT: " to prompt for the next response
+    formatted_prompt += "ASSISTANT: "
+    return formatted_prompt
+
+async def format_prompt_code(messages):
+    formatted_prompt = ""
+    for message in messages:
+        if message.role == "system":
+            formatted_prompt += f"### System Prompt\nYou are an intelligent programming assistant.\n\n"
+        elif message.role == "user":
+            formatted_prompt += f"### User Message\n{message.content}\n\n"
+        elif message.role == "assistant":
+            formatted_prompt += f"### Assistant\n{message.content}\n\n"
+    # Add the final "### Assistant" with ellipsis to prompt for the next response
+    formatted_prompt += "### Assistant\n..."
+    return formatted_prompt
+
+async def format_prompt_zephyr(messages):
+    formatted_prompt = ""
+    for message in messages:
+        if message.role == "system":
+            formatted_prompt += f"<|system|>\n{message.content}\n"
+        elif message.role == "user":
+            formatted_prompt += f"<|user|>\n{message.content}\n"
+        elif message.role == "assistant":
+            formatted_prompt += f"<|assistant|>\n{message.content}\n"
+    # Add the final "<|assistant|>" header to prompt for the next response
+    formatted_prompt += "<|assistant|>\n"
+    return formatted_prompt
+
+async def format_prompt_starling(messages):
+    formatted_prompt = ""
+    system_message = ""
+    for message in messages:
+        if message.role == "system":
+            # Save system message to prepend to the first user message
+            system_message += f"{message.content}\n\n"
+        elif message.role == "user":
+            # Prepend system message if it exists
+            if system_message:
+                formatted_prompt += f"GPT4 Correct User: {system_message}{message.content}<|end_of_turn|>"
+                system_message = ""  # Clear system message after prepending
+            else:
+                formatted_prompt += f"GPT4 Correct User: {message.content}<|end_of_turn|>"
+        elif message.role == "assistant":
+            formatted_prompt += f"GPT4 Correct Assistant: {message.content}<|end_of_turn|>"  # Prep for user follow-up
+    formatted_prompt += "GPT4 Correct Assistant: \n\n"
+    return formatted_prompt
+
+async def format_prompt_mixtral(messages):
+    formatted_prompt = " "
+    system_message = ""
+    for message in messages:
+        if message.role == "system":
+            # Save system message to prepend to the first user message
+            system_message += f"{message.content}\n\n"
+        elif message.role == "user":
+            # Prepend system message if it exists
+            if system_message:
+                formatted_prompt += f"[INST] {system_message}{message.content} [/INST] "
+                system_message = ""  # Clear system message after prepending
+            else:
+                formatted_prompt += f"[INST] {message.content} [/INST] "
+        elif message.role == "assistant":
+            formatted_prompt += f" {message.content} "  # Prep for user follow-up
+    return formatted_prompt
+
+
+async def format_prompt_commandr(messages):
+    formatted_prompt = ""
+    system_message_found = False
+
+    # Check for a system message first
+    for message in messages:
+        if message.role == "system":
+            system_message_found = True
+            break
+
+    # If no system message was found, prepend a default one
+    if not system_message_found:
+        formatted_prompt += "<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful AI assistant.<|END_OF_TURN_TOKEN|>"
+
+    for message in messages:
+        if message.role == "system":
+            formatted_prompt += f"<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{message.content}<|END_OF_TURN_TOKEN|>"
+        elif message.role == "user":
+            formatted_prompt += f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{message.content}<|END_OF_TURN_TOKEN|>"
+        elif message.role == "assistant":
+            formatted_prompt += f"<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{message.content}<|END_OF_TURN_TOKEN|>"
+    # Add the final chatbot turn token to prompt for the next response
+    formatted_prompt += "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
+    return formatted_prompt
+
+
+@app.post('/v1/chat/completions')
+async def mainchat(request: ChatCompletionRequest):
+    try:
+        prompt = ''
+        if repo_str == 'Phind-CodeLlama-34B-v2':
+            prompt = await format_prompt_code(request.messages)
+        elif repo_str == 'zephyr-7b-beta':
+            prompt = await format_prompt_zephyr(request.messages)
+        elif repo_str == 'Starling-LM-7B-alpha':
+            prompt = await format_prompt_starling(request.messages)
+        elif repo_str == 'Mixtral-8x7B-Instruct-v0.1-GPTQ':
+            prompt = await format_prompt_mixtral(request.messages)
+        elif repo_str == 'Yi-34B-Chat-GPTQ' or repo_str == 'Nous-Hermes-2-Yi-34B-GPTQ' or repo_str == 'theprofessor-exl2-speculative' or repo_str == 'dbrx-instruct-exl2':
+            prompt = await format_prompt_yi(request.messages)
+        elif repo_str == 'Nous-Capybara-34B-GPTQ' or repo_str == 'goliath-120b-GPTQ' or repo_str == 'goliath-120b-exl2' or repo_str == 'goliath-120b-exl2-rpcal':
+            prompt = await format_prompt_nous(request.messages)
+        elif repo_str == 'tess-xl-exl2' or repo_str == 'tess-xl-exl2-speculative':
+            prompt = await format_prompt_tess(request.messages)
+        elif repo_str == 'commandr-exl2' or repo_str == 'commandr-exl2-speculative':
+            prompt = await format_prompt_commandr(request.messages)
+        else:
+            prompt = await format_prompt(request.messages)
+        print(prompt)
+
+        timeout = 180  # seconds
+        start_time = time.time()
+        prompt_id = generate_unique_id()  # Replace with a function to generate unique IDs
+        outlines_dict = {}
+        if request.stop_at is not None:
+            outlines_dict["stop_at"] = request.stop_at
+        if request.outlines_type is not None:
+            outlines_dict["type"] = request.outlines_type
+            if outlines_dict["type"] == "choices":
+                assert request.choices is not None
+                outlines_dict["choices"] = request.choices
+            elif outlines_dict["type"] == "json":
+                assert request.json is not None
+                outlines_dict["json"] = request.json
+            elif outlines_dict["type"] == "regex":
+                assert request.regex is not None
+                outlines_dict["regex"] = request.regex
+            else:
+                assert (outlines_dict["type"] == "text") or not args.use_outlines
+        if not args.use_outlines:
+            prompts.put((prompt_id, prompt, request.max_tokens, request.stream, request.temperature))
+        else:
+            prompts.put((prompt_id, prompt, request.max_tokens, request.stream, request.temperature, outlines_dict))
+
+        if request.stream:
+            #response = StreamingResponse(streaming_request(prompt, request.max_tokens, tempmodel=repo_str, response_format='chat_completion'), media_type="text/event-stream")
+            return StreamingResponse(stream_response(prompt_id), media_type="text/event-stream")
+        else:
+            #response_data = non_streaming_request(prompt, request.max_tokens, tempmodel=repo_str, response_format='chat_completion')
+            #response = response_data  # This will return a JSON response
+            while prompt_id not in responses:
+                await asyncio.sleep(0.1)  # Sleep to yield control to the event loop
+                if time.time() - start_time > timeout:
+                    return {"error": "Response timeout"}
+
+            return responses.pop(prompt_id)
+
+    except Exception as e:
+        print(traceback.format_exc())
+        raise HTTPException(status_code=500, detail=str(e))
+
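+
+# Minimal client sketch (illustrative only, never called by the server): shows how a
+# caller might request a constrained "choices" completion when this server is started
+# with --use_outlines. The URL, model name, and payload values are assumptions.
+def _example_outlines_request(base_url="http://localhost:8000"):
+    payload = {
+        "model": "commandr-exl2",
+        "messages": [{"role": "user", "content": "Is the sky blue? Answer yes or no."}],
+        "max_tokens": 5,
+        "temperature": 1.0,
+        "outlines_type": "choices",
+        "choices": ["yes", "no"],
+    }
+    # 'requests' is already imported at the top of this file
+    return requests.post(f"{base_url}/v1/chat/completions", json=payload, timeout=300).json()
+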
+
+@app.get('/ping')
+async def get_status():
+    return {"ping": sum(prompt_length)}
+
+@app.get("/nvidia-smi")
+async def get_nvidia_smi():
+    # Execute the nvidia-smi command
+    result = subprocess.run(
+        ["nvidia-smi", "--query-gpu=utilization.gpu,memory.used,memory.total", "--format=csv,noheader"],
+        capture_output=True, text=True
+    )
+    nvidia_smi_output = result.stdout.strip()  # Remove any extra whitespace
+    # Split the output by lines and then by commas
+    gpu_data = []
+    for line in nvidia_smi_output.split("\n"):
+        utilization, memory_used, memory_total = line.split(", ")
+        # Strip the '%' and 'MiB' and convert to appropriate types
+        utilization = float(utilization.strip(' %'))
+        memory_used = int(memory_used.strip(' MiB'))
+        memory_total = int(memory_total.strip(' MiB'))
+        gpu_data.append({
+            "utilization": utilization,
+            "memory_used": memory_used,
+            "memory_total": memory_total
+        })
+    return gpu_data
+
+
+if __name__ == "__main__":
+    import uvicorn
+
+    uvicorn.run(app, host=host, port=port, log_level="debug")
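+
+# Example launch command (values are illustrative; the flags are defined in the argparse
+# section near the top of this file):
+#   python llm_exl2_client_multi_outlines.py --port 8000 --use_outlines \
+#       --repo_str commandr-exl2 --gpu_split 17,19,19,19 --cache_q4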