From de671021de765cf6df819b5625ae6a8258895026 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Mon, 22 Apr 2024 22:48:49 -0400 Subject: [PATCH 01/11] Attempt adding outlines --- llm_exl2_client_multi.py | 40 ++++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/llm_exl2_client_multi.py b/llm_exl2_client_multi.py index 1717527..af98faa 100644 --- a/llm_exl2_client_multi.py +++ b/llm_exl2_client_multi.py @@ -26,6 +26,7 @@ import numpy as np import sys, os +import outlines sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from exllamav2 import( @@ -87,6 +88,7 @@ class ChatCompletionRequest(BaseModel): # Add argument for port with default type as integer parser.add_argument('--port', type=int, help='Port to run the server on.') +parser.add_argument('--use_outlines', action='store_true', help='Use outlines.') # Parse the arguments args = parser.parse_args() @@ -117,11 +119,29 @@ class ChatCompletionRequest(BaseModel): config.max_seq_len = max_context base_model_native_max = 4096 -model = ExLlamaV2(config) +if args.use_outlines: + model = outlines.models.exl2( + config.model_dir, + "cuda", + max_seq_len = config.max_seq_len, + scale_pos_emb = config.scale_pos_emb, + scale_alpha_value = config.scale_alpha_value, + no_flash_attn = config.no_flash_attn, + num_experts_per_token = config.num_experts_per_token, + cache_8bit = True, + cache_q4 = False, + tokenizer_kwargs = {}, + gpu_split = "17,19,19,19", # we might be able to make this auto + low_mem = None, + verbose = None + ) +else: + model = ExLlamaV2(config) print("Loading model: " + repo_id) #cache = ExLlamaV2Cache(model, lazy=True, max_seq_len = 20480) #model.load_autosplit(cache) -model.load([17,19,19,19]) +if not args.use_outlines: + model.load([17,19,19,19]) tokenizer = ExLlamaV2Tokenizer(config) @@ -235,19 +255,19 @@ def process_prompts(): tensors.sin = ropesin[g] tensors.cos = ropecos[g] - - if cache_8bit: - ncache = ExLlamaV2Cache_8bit(model, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) - else: - ncache = ExLlamaV2Cache(model, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) + if not args.use_outlines: + if cache_8bit: + ncache = ExLlamaV2Cache_8bit(model, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) + else: + ncache = ExLlamaV2Cache(model, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) #print("Setting up Cache: " + str(prompt_id)) - + if use_dynamic_rope_scaling: sin_arr.append(ropesin) cos_arr.append(ropecos) - - model.forward(ids[:, :-1], ncache, preprocess_only = True) + if not args.use_outlines: + model.forward(ids[:, :-1], ncache, preprocess_only = True) print("Cache setup: " + str(np.shape(ids[:1, :-1]))) input_ids.append(ids) prompt_ids.append(prompt_id) From 18f96ba5c57486bd3d0c7e2ada03fb193233b5a0 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Wed, 24 Apr 2024 16:22:55 -0400 Subject: [PATCH 02/11] Setup basic outlines logic --- llm_exl2_client_multi.py | 213 ++++++++++++++++++++++++++++++++++----- 1 file changed, 189 insertions(+), 24 deletions(-) diff --git a/llm_exl2_client_multi.py b/llm_exl2_client_multi.py index af98faa..7210885 100644 --- a/llm_exl2_client_multi.py +++ b/llm_exl2_client_multi.py @@ -15,7 +15,7 @@ import subprocess import re -from fastapi import FastAPI +from fastapi import FastAPI, HTTPException from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import StreamingResponse from pydantic 
import BaseModel @@ -27,6 +27,8 @@ import sys, os import outlines +from outlines.samplers import multinomial + sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from exllamav2 import( @@ -34,6 +36,7 @@ ExLlamaV2Config, ExLlamaV2Cache, ExLlamaV2Cache_8bit, + ExLlamaV2Cache_Q4, ExLlamaV2Tokenizer, ) @@ -81,6 +84,21 @@ class ChatCompletionRequest(BaseModel): top_p: Optional[float] = 0.0 # default value of 0.0 user: Optional[str] = None +class OutlinesRequest(ChatCompletionRequest): + ends_at: str | None = None + +class OutlinesChoiceRequest(OutlinesRequest): + choices: list[str] + +class OutlinesRegexRequest(OutlinesRequest): + regex: str + +class OutlinesJsonRequest(OutlinesRequest): + json: str + +class OutlinesTextRequest(OutlinesRequest): + pass + repo_str = 'commandr-exl2' #repo_str = 'theprofessor-exl2-speculative' @@ -118,6 +136,8 @@ class ChatCompletionRequest(BaseModel): config.scale_alpha_value = ropescale config.max_seq_len = max_context base_model_native_max = 4096 +cache_8bit = True +cache_q4 = False if args.use_outlines: model = outlines.models.exl2( @@ -128,8 +148,8 @@ class ChatCompletionRequest(BaseModel): scale_alpha_value = config.scale_alpha_value, no_flash_attn = config.no_flash_attn, num_experts_per_token = config.num_experts_per_token, - cache_8bit = True, - cache_q4 = False, + cache_8bit = cache_8bit, + cache_q4 = cache_q4, tokenizer_kwargs = {}, gpu_split = "17,19,19,19", # we might be able to make this auto low_mem = None, @@ -147,7 +167,7 @@ class ChatCompletionRequest(BaseModel): # Cache mode -cache_8bit = True + settings_proto = ExLlamaV2Sampler.Settings() settings_proto.temperature = 0 @@ -166,6 +186,7 @@ class ChatCompletionRequest(BaseModel): prompt_ids = [] streamer = [] caches = [] +past_seq_length = [] settings = [] future_tokens = [] future_logits = [] @@ -200,15 +221,144 @@ async def stream_response(prompt_id, timeout=180): yield f'data: {{"id":"chatcmpl-{prompt_id}","object":"chat.completion.chunk","created":{int(time.time())},"model":"{repo_str}","choices":[{{"index":0,"delta":{{}},"finish_reason":"stop"}}]}}\n\n' break +# Worker thread function +def process_outline_prompts(): + # TODO: Somehow make this streams + global partial_responses + assert args.use_outlines + assert not use_dynamic_rope_scaling, "Currently ROPE scaling is not supported with outlines" + base_model = model.model + while True: + while not prompts.empty() or len(input_ids): + while len(input_ids) < max_parallel_seqs and not prompts.empty(): + prompt_id, prompt, max_tokens, stream, temperature, outlines_dict = prompts.get() + sampler = multinomial(top_k=50., top_p=1.0, temperature=temperature) + ids = tokenizer.encode(prompt) + prompt_tokens = ids.shape[-1] + new_tokens = prompt_tokens + max_tokens + print("Processing prompt: " + str(prompt_id) + " Req tokens: " + str(new_tokens)) + # Truncate if new_tokens exceed max_context + if new_tokens > max_context: + # Calculate how many tokens to truncate + ids = tokenizer.encode("Say, 'Prompt exceeds allowed length. 
Please try again.'") + # Update new_tokens after truncation + prompt_tokens = ids.shape[-1] + new_tokens = prompt_tokens + max_tokens + print("Truncating prompt: " + str(prompt_id) + " Req tokens: " + str(new_tokens)) + prompt_length.append(prompt_tokens) + if use_dynamic_rope_scaling: + # Dynamic Rope Scaling + head_dim = base_model.config.head_dim + model_base = base_model.config.rotary_embedding_base + max_seq_len = base_model.config.max_seq_len + ratio = new_tokens / base_model_native_max + alpha = 1.0 + ropesin = [None] * num_of_gpus + ropecos = [None] * num_of_gpus + if ratio > 1.0: + alpha = ((0.2500*ratio**2) + (0.3500*ratio) + 0.4000)*dynamic_rope_mult + dynamic_rope_offset + print("DYNAMIC ROPE SCALE Alpha: " + str(alpha) + " Ratio: " + str(ratio)) + + for g in range(num_of_gpus): + base = model_base + try: + tensors = base_model.get_device_tensors(g) + except IndexError: + tensors = None + + if tensors is not None: + if alpha != 1.0: base *= alpha ** (head_dim / (head_dim - 2)) + + inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device = "cuda:"+str(g)).float() / head_dim)) + t = torch.arange(max_seq_len, device = "cuda:"+str(g), dtype = torch.float32) + + freqs = torch.einsum("i,j->ij", t, inv_freq) + emb = torch.cat((freqs, freqs), dim=-1) + + ropesin[g] = emb.sin()[None, None, :, :].half() + ropecos[g] = emb.cos()[None, None, :, :].half() + + tensors.sin = ropesin[g] + tensors.cos = ropecos[g] + if cache_8bit: + ncache = ExLlamaV2Cache_8bit(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) + elif cache_q4: + ncache = ExLlamaV2Cache_Q4(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) + else: + ncache = ExLlamaV2Cache(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) + model.cache = ncache + model.past_seq = None + stop_at = outlines["stop_at"] + if outlines_dict["type"] == "choices": + generator = outlines.generate.choice(model, outlines_dict["choices"], sampler=sampler, max_tokens=max_tokens, stop_at=stop_at) + elif outlines_dict["type"] == "json": + generator = outlines.generate.json(model, outlines_dict["json"], sampler=sampler, max_tokens=max_tokens, stop_at=stop_at) + elif outlines_dict["type"] == "regex": + generator = outlines.generate.regex(model, outlines_dict["regex"], sampler=sampler, max_tokens=max_tokens, stop_at=stop_at) + else: + generator = outlines.generate.text(model, sampler=sampler, max_tokens=max_tokens, stop_at=stop_at) + output = generator(prompt) + completion_tokens = (tokenizer.encode(output)).shape[-1] + prompt_tokens = (tokenizer.encode(prompt)).shape[-1] + full_tokens = completion_tokens + prompt_tokens + + + if(streamer[i]): + ## Generator, yield here.. 
+ partial_response_data = { + "id": f"chatcmpl-{prompt_id}", + "object": "chat.completion.chunk", + "created": int(time.time()), + "model": repo_str, + "choices": [ + { + "index": 0, + "delta": { + "content": output + }, + "finish_reason": "stop" + } + ] + } + + # Initialize a list for new prompt_id or append to existing one + if prompt_id not in partial_responses: + partial_responses[prompt_id] = [] + partial_responses[prompt_id].append(partial_response_data) + else:# Construct the response based on the format + response_data = { + "id": f"chatcmpl-{prompt_id}", + "object": "chat.completion", + "created": int(time.time()), + "model": repo_str, + "choices": [{ + "index": 0, + "message": { + "role": "assistant", + "content": output, + }, + "finish_reason": "stop" + }], + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": full_tokens + } + } + responses[prompt_id] = response_data + else: + # Sleep for a short duration when there's no work + time.sleep(0.1) # Sleep for 100 milliseconds + # Worker thread function def process_prompts(): - global partial_responses - + global partial_responses + base_model = model while True: while not prompts.empty() or len(input_ids): while len(input_ids) < max_parallel_seqs and not prompts.empty(): - prompt_id, prompt, max_tokens, stream, temperature = prompts.get() + prompt_id, prompt, max_tokens, stream, temperature, outlines_dict = prompts.get() ids = tokenizer.encode(prompt) prompt_tokens = ids.shape[-1] new_tokens = prompt_tokens + max_tokens @@ -224,8 +374,9 @@ def process_prompts(): prompt_length.append(prompt_tokens) if use_dynamic_rope_scaling: # Dynamic Rope Scaling - head_dim = model.config.head_dim - model_base = model.config.rotary_embedding_base + head_dim = base_model.config.head_dim + model_base = base_model.config.rotary_embedding_base + max_seq_len = base_model.config.max_seq_len ratio = new_tokens / base_model_native_max alpha = 1.0 ropesin = [None] * num_of_gpus @@ -237,15 +388,15 @@ def process_prompts(): for g in range(num_of_gpus): base = model_base try: - tensors = model.get_device_tensors(g) + tensors = base_model.get_device_tensors(g) except IndexError: tensors = None if tensors is not None: - if alpha != 1.0: base *= alpha ** (model.config.head_dim / (model.config.head_dim - 2)) + if alpha != 1.0: base *= alpha ** (head_dim / (head_dim - 2)) inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device = "cuda:"+str(g)).float() / head_dim)) - t = torch.arange(model.config.max_seq_len, device = "cuda:"+str(g), dtype = torch.float32) + t = torch.arange(max_seq_len, device = "cuda:"+str(g), dtype = torch.float32) freqs = torch.einsum("i,j->ij", t, inv_freq) emb = torch.cat((freqs, freqs), dim=-1) @@ -255,19 +406,18 @@ def process_prompts(): tensors.sin = ropesin[g] tensors.cos = ropecos[g] - if not args.use_outlines: - if cache_8bit: - ncache = ExLlamaV2Cache_8bit(model, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) - else: - ncache = ExLlamaV2Cache(model, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) - + if cache_8bit: + ncache = ExLlamaV2Cache_8bit(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) + elif cache_q4: + ncache = ExLlamaV2Cache_Q4(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) + else: + ncache = ExLlamaV2Cache(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) 
#print("Setting up Cache: " + str(prompt_id)) if use_dynamic_rope_scaling: sin_arr.append(ropesin) cos_arr.append(ropecos) - if not args.use_outlines: - model.forward(ids[:, :-1], ncache, preprocess_only = True) + model.forward(ids[:, :-1], ncache, preprocess_only = True) print("Cache setup: " + str(np.shape(ids[:1, :-1]))) input_ids.append(ids) prompt_ids.append(prompt_id) @@ -342,7 +492,7 @@ def process_prompts(): if token.item() == tokenizer.eos_token_id or diff == """<|im_end|>""" or caches[i].current_seq_len == caches[i].max_seq_len: eos.insert(0, i) - + # Generate and store response for i in eos: generated_part = input_ids[i][:, prompt_length[i]:] @@ -383,7 +533,6 @@ def process_prompts(): "total_tokens": full_tokens } } - responses[eos_prompt_id] = response_data # Clean up @@ -403,7 +552,7 @@ def process_prompts(): # Start worker thread -worker = Thread(target=process_prompts) +worker = Thread(target=process_prompts if not args.use_outlines else process_outline_prompts) worker.start() @@ -588,7 +737,23 @@ async def mainchat(request: ChatCompletionRequest): timeout = 180 # seconds start_time = time.time() prompt_id = generate_unique_id() # Replace with a function to generate unique IDs - prompts.put((prompt_id, prompt, request.max_tokens, request.stream, request.temperature)) + outlines_dict = {} + if isinstance(request, OutlinesRequest): + outlines_dict["ends_at"] = request.ends_at + if isinstance(request, OutlinesChoiceRequest): + outlines_dict["type"] = "choices" + outlines_dict["choices"] = request.choices + elif isinstance(request, OutlinesJsonRequest): + outlines_dict["type"] = "json" + outlines_dict["json"] = request.json + elif isinstance(request, OutlinesRegexRequest): + outlines_dict["type"] = "regex" + outlines_dict["regex"] = request.regex + elif isinstance(request, OutlinesTextRequest): + outlines_dict["type"] = "text" + elif args.use_outliens: + raise NotImplementedError("If outlines is used, the request must be an OutlinesRequest") + prompts.put((prompt_id, prompt, request.max_tokens, request.stream, request.temperature, outlines_dict)) if request.stream: #response = StreamingResponse(streaming_request(prompt, request.max_tokens, tempmodel=repo_str, response_format='chat_completion'), media_type="text/event-stream") From 7667151d2886ac88a02bc92a881c0b2d224a0f14 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Wed, 24 Apr 2024 16:29:17 -0400 Subject: [PATCH 03/11] Remove diff --- llm_exl2_client_multi.py | 38 +------------------------------------- 1 file changed, 1 insertion(+), 37 deletions(-) diff --git a/llm_exl2_client_multi.py b/llm_exl2_client_multi.py index 7210885..d8d653a 100644 --- a/llm_exl2_client_multi.py +++ b/llm_exl2_client_multi.py @@ -186,7 +186,6 @@ class OutlinesTextRequest(OutlinesRequest): prompt_ids = [] streamer = [] caches = [] -past_seq_length = [] settings = [] future_tokens = [] future_logits = [] @@ -245,41 +244,6 @@ def process_outline_prompts(): prompt_tokens = ids.shape[-1] new_tokens = prompt_tokens + max_tokens print("Truncating prompt: " + str(prompt_id) + " Req tokens: " + str(new_tokens)) - prompt_length.append(prompt_tokens) - if use_dynamic_rope_scaling: - # Dynamic Rope Scaling - head_dim = base_model.config.head_dim - model_base = base_model.config.rotary_embedding_base - max_seq_len = base_model.config.max_seq_len - ratio = new_tokens / base_model_native_max - alpha = 1.0 - ropesin = [None] * num_of_gpus - ropecos = [None] * num_of_gpus - if ratio > 1.0: - alpha = ((0.2500*ratio**2) + (0.3500*ratio) + 0.4000)*dynamic_rope_mult 
+ dynamic_rope_offset - print("DYNAMIC ROPE SCALE Alpha: " + str(alpha) + " Ratio: " + str(ratio)) - - for g in range(num_of_gpus): - base = model_base - try: - tensors = base_model.get_device_tensors(g) - except IndexError: - tensors = None - - if tensors is not None: - if alpha != 1.0: base *= alpha ** (head_dim / (head_dim - 2)) - - inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device = "cuda:"+str(g)).float() / head_dim)) - t = torch.arange(max_seq_len, device = "cuda:"+str(g), dtype = torch.float32) - - freqs = torch.einsum("i,j->ij", t, inv_freq) - emb = torch.cat((freqs, freqs), dim=-1) - - ropesin[g] = emb.sin()[None, None, :, :].half() - ropecos[g] = emb.cos()[None, None, :, :].half() - - tensors.sin = ropesin[g] - tensors.cos = ropecos[g] if cache_8bit: ncache = ExLlamaV2Cache_8bit(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) elif cache_q4: @@ -303,7 +267,7 @@ def process_outline_prompts(): full_tokens = completion_tokens + prompt_tokens - if(streamer[i]): + if(stream): ## Generator, yield here.. partial_response_data = { "id": f"chatcmpl-{prompt_id}", From 74694709a2d106c6c2241e605c6d8b20bcb3dd01 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Wed, 24 Apr 2024 16:35:45 -0400 Subject: [PATCH 04/11] Remove diff --- llm_exl2_client_multi.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llm_exl2_client_multi.py b/llm_exl2_client_multi.py index d8d653a..aa8c21b 100644 --- a/llm_exl2_client_multi.py +++ b/llm_exl2_client_multi.py @@ -168,7 +168,6 @@ class OutlinesTextRequest(OutlinesRequest): # Cache mode - settings_proto = ExLlamaV2Sampler.Settings() settings_proto.temperature = 0 settings_proto.top_k = 50 From 9161a6f9b100d49c3d962b32e6f4adf6b439fdf9 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Sat, 27 Apr 2024 00:51:56 -0400 Subject: [PATCH 05/11] Remove diffs --- llm_exl2_client_multi.py | 44 ++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/llm_exl2_client_multi.py b/llm_exl2_client_multi.py index aa8c21b..7d6d041 100644 --- a/llm_exl2_client_multi.py +++ b/llm_exl2_client_multi.py @@ -28,6 +28,7 @@ import sys, os import outlines from outlines.samplers import multinomial +from outlines.generate import continuous sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -253,14 +254,15 @@ def process_outline_prompts(): model.past_seq = None stop_at = outlines["stop_at"] if outlines_dict["type"] == "choices": - generator = outlines.generate.choice(model, outlines_dict["choices"], sampler=sampler, max_tokens=max_tokens, stop_at=stop_at) + generator = outlines.generate.choice(model, outlines_dict["choices"], sampler=sampler) elif outlines_dict["type"] == "json": - generator = outlines.generate.json(model, outlines_dict["json"], sampler=sampler, max_tokens=max_tokens, stop_at=stop_at) + generator = outlines.generate.json(model, outlines_dict["json"], sampler=sampler) elif outlines_dict["type"] == "regex": - generator = outlines.generate.regex(model, outlines_dict["regex"], sampler=sampler, max_tokens=max_tokens, stop_at=stop_at) + generator = outlines.generate.regex(model, outlines_dict["regex"], sampler=sampler) else: - generator = outlines.generate.text(model, sampler=sampler, max_tokens=max_tokens, stop_at=stop_at) - output = generator(prompt) + generator = outlines.generate.text(model, sampler=sampler) + # generator_c = continuous(generator) + output = generator(prompt, max_tokens=max_tokens, stop_at=stop_at) 
completion_tokens = (tokenizer.encode(output)).shape[-1] prompt_tokens = (tokenizer.encode(prompt)).shape[-1] full_tokens = completion_tokens + prompt_tokens @@ -316,12 +318,12 @@ def process_outline_prompts(): # Worker thread function def process_prompts(): - global partial_responses - base_model = model + global partial_responses + while True: while not prompts.empty() or len(input_ids): while len(input_ids) < max_parallel_seqs and not prompts.empty(): - prompt_id, prompt, max_tokens, stream, temperature, outlines_dict = prompts.get() + prompt_id, prompt, max_tokens, stream, temperature = prompts.get() ids = tokenizer.encode(prompt) prompt_tokens = ids.shape[-1] new_tokens = prompt_tokens + max_tokens @@ -337,9 +339,8 @@ def process_prompts(): prompt_length.append(prompt_tokens) if use_dynamic_rope_scaling: # Dynamic Rope Scaling - head_dim = base_model.config.head_dim - model_base = base_model.config.rotary_embedding_base - max_seq_len = base_model.config.max_seq_len + head_dim = model.config.head_dim + model_base = model.config.rotary_embedding_base ratio = new_tokens / base_model_native_max alpha = 1.0 ropesin = [None] * num_of_gpus @@ -351,15 +352,15 @@ def process_prompts(): for g in range(num_of_gpus): base = model_base try: - tensors = base_model.get_device_tensors(g) + tensors = model.get_device_tensors(g) except IndexError: tensors = None if tensors is not None: - if alpha != 1.0: base *= alpha ** (head_dim / (head_dim - 2)) + if alpha != 1.0: base *= alpha ** (model.config.head_dim / (model.config.head_dim - 2)) inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device = "cuda:"+str(g)).float() / head_dim)) - t = torch.arange(max_seq_len, device = "cuda:"+str(g), dtype = torch.float32) + t = torch.arange(model.config.max_seq_len, device = "cuda:"+str(g), dtype = torch.float32) freqs = torch.einsum("i,j->ij", t, inv_freq) emb = torch.cat((freqs, freqs), dim=-1) @@ -369,17 +370,18 @@ def process_prompts(): tensors.sin = ropesin[g] tensors.cos = ropecos[g] + if cache_8bit: - ncache = ExLlamaV2Cache_8bit(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) - elif cache_q4: - ncache = ExLlamaV2Cache_Q4(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) + ncache = ExLlamaV2Cache_8bit(model, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) else: - ncache = ExLlamaV2Cache(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) - #print("Setting up Cache: " + str(prompt_id)) + ncache = ExLlamaV2Cache(model, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) + #print("Setting up Cache: " + str(prompt_id)) + if use_dynamic_rope_scaling: sin_arr.append(ropesin) cos_arr.append(ropecos) + model.forward(ids[:, :-1], ncache, preprocess_only = True) print("Cache setup: " + str(np.shape(ids[:1, :-1]))) input_ids.append(ids) @@ -455,7 +457,7 @@ def process_prompts(): if token.item() == tokenizer.eos_token_id or diff == """<|im_end|>""" or caches[i].current_seq_len == caches[i].max_seq_len: eos.insert(0, i) - + # Generate and store response for i in eos: generated_part = input_ids[i][:, prompt_length[i]:] @@ -514,6 +516,8 @@ def process_prompts(): + + # Start worker thread worker = Thread(target=process_prompts if not args.use_outlines else process_outline_prompts) worker.start() From edae5dc25f865f2ea373fe2b9794bd1a85d7de74 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Sat, 27 Apr 2024 00:55:29 -0400 
Subject: [PATCH 06/11] Don't modify previous funtion --- llm_exl2_client_multi.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/llm_exl2_client_multi.py b/llm_exl2_client_multi.py index 7d6d041..e0bdd8a 100644 --- a/llm_exl2_client_multi.py +++ b/llm_exl2_client_multi.py @@ -720,7 +720,11 @@ async def mainchat(request: ChatCompletionRequest): outlines_dict["type"] = "text" elif args.use_outliens: raise NotImplementedError("If outlines is used, the request must be an OutlinesRequest") - prompts.put((prompt_id, prompt, request.max_tokens, request.stream, request.temperature, outlines_dict)) + if not args.use_outlines: + prompts.put((prompt_id, prompt, request.max_tokens, request.stream, request.temperature)) + else: + prompts.put((prompt_id, prompt, request.max_tokens, request.stream, request.temperature, outlines_dict)) + if request.stream: #response = StreamingResponse(streaming_request(prompt, request.max_tokens, tempmodel=repo_str, response_format='chat_completion'), media_type="text/event-stream") From df92442886fdaa68a710415d50db0e95bf902530 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Sat, 27 Apr 2024 12:28:36 -0400 Subject: [PATCH 07/11] Streaming logic done --- llm_exl2_client_multi.py | 158 +++++++++++++++++++++++++-------------- 1 file changed, 102 insertions(+), 56 deletions(-) diff --git a/llm_exl2_client_multi.py b/llm_exl2_client_multi.py index e0bdd8a..4db57df 100644 --- a/llm_exl2_client_multi.py +++ b/llm_exl2_client_multi.py @@ -28,7 +28,6 @@ import sys, os import outlines from outlines.samplers import multinomial -from outlines.generate import continuous sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) @@ -186,6 +185,9 @@ class OutlinesTextRequest(OutlinesRequest): prompt_ids = [] streamer = [] caches = [] +input_prompts = [] +generators = [] +generations = [] settings = [] future_tokens = [] future_logits = [] @@ -234,22 +236,22 @@ def process_outline_prompts(): sampler = multinomial(top_k=50., top_p=1.0, temperature=temperature) ids = tokenizer.encode(prompt) prompt_tokens = ids.shape[-1] - new_tokens = prompt_tokens + max_tokens - print("Processing prompt: " + str(prompt_id) + " Req tokens: " + str(new_tokens)) + full_tokens = prompt_tokens + max_tokens + print("Processing prompt: " + str(prompt_id) + " Req tokens: " + str(full_tokens)) # Truncate if new_tokens exceed max_context - if new_tokens > max_context: + if full_tokens > max_context: # Calculate how many tokens to truncate ids = tokenizer.encode("Say, 'Prompt exceeds allowed length. 
Please try again.'") # Update new_tokens after truncation prompt_tokens = ids.shape[-1] - new_tokens = prompt_tokens + max_tokens - print("Truncating prompt: " + str(prompt_id) + " Req tokens: " + str(new_tokens)) + full_tokens = prompt_tokens + max_tokens + print("Truncating prompt: " + str(prompt_id) + " Req tokens: " + str(full_tokens)) if cache_8bit: - ncache = ExLlamaV2Cache_8bit(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) + ncache = ExLlamaV2Cache_8bit(base_model, lazy=not base_model.loaded, max_seq_len = full_tokens) # (max_seq_len could be different for each cache) elif cache_q4: - ncache = ExLlamaV2Cache_Q4(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) + ncache = ExLlamaV2Cache_Q4(base_model, lazy=not base_model.loaded, max_seq_len = full_tokens) else: - ncache = ExLlamaV2Cache(base_model, lazy=not base_model.loaded, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) + ncache = ExLlamaV2Cache(base_model, lazy=not base_model.loaded, max_seq_len = full_tokens) # (max_seq_len could be different for each cache) model.cache = ncache model.past_seq = None stop_at = outlines["stop_at"] @@ -261,56 +263,102 @@ def process_outline_prompts(): generator = outlines.generate.regex(model, outlines_dict["regex"], sampler=sampler) else: generator = outlines.generate.text(model, sampler=sampler) - # generator_c = continuous(generator) - output = generator(prompt, max_tokens=max_tokens, stop_at=stop_at) - completion_tokens = (tokenizer.encode(output)).shape[-1] - prompt_tokens = (tokenizer.encode(prompt)).shape[-1] - full_tokens = completion_tokens + prompt_tokens - - - if(stream): - ## Generator, yield here.. - partial_response_data = { - "id": f"chatcmpl-{prompt_id}", - "object": "chat.completion.chunk", - "created": int(time.time()), - "model": repo_str, - "choices": [ - { + generators.append(generator.stream(prompt, stop_at=stop_at, max_tokens=max_tokens)) + input_ids.append(prompt_id) + input_prompts.append(prompt) + generations.append("") + caches.append(ncache) + streamer.append(stream) + #print("Prompt added to queue: " + str(prompt_id)) + if(len(input_ids)): + eos = [] + for i in range(len(input_ids)): + model.cache = caches[i] + is_finished = False + try: + decoded_response_token = next(generators[i]) + generations[i] += decoded_response_token + except StopIteration: + is_finished = True + reason = None + if(streamer[i]): + outcontent = decoded_response_token + if is_finished: + outcontent = outcontent + reason = "stop" + partial_response_data = { + "id": f"chatcmpl-{prompt_ids[i]}", + "object": "chat.completion.chunk", + "created": int(time.time()), + "model": repo_str, + "choices": [ + { + "index": 0, + "delta": { + "content": outcontent + }, + "finish_reason": reason + } + ] + } + + # Initialize a list for new prompt_id or append to existing one + if prompt_ids[i] not in partial_responses: + partial_responses[prompt_ids[i]] = [] + partial_responses[prompt_ids[i]].append(partial_response_data) + + if is_finished: + eos.insert(0, i) + + # Generate and store response + for i in eos: + output = generations[i].strip() + prompt = input_prompts[i] + #output = tokenizer.decode(input_ids[i])[0] + print("-----") + print(output) + generated_text = output + # Calculate token counts + completion_tokens = (tokenizer.encode(generated_text)).shape[-1] + prompt_tokens = (tokenizer.encode(prompt)).shape[-1] + full_tokens = completion_tokens + prompt_tokens + eos_prompt_id = 
prompt_ids.pop(i) + if(streamer[i]): + ## Generator, yield here.. + partial_response_data = { + "finish_reason": "stop" + } + + responses[eos_prompt_id] = partial_response_data + else:# Construct the response based on the format + response_data = { + "id": f"chatcmpl-{prompt_id}", + "object": "chat.completion", + "created": int(time.time()), + "model": repo_str, + "choices": [{ "index": 0, - "delta": { - "content": output + "message": { + "role": "assistant", + "content": generated_text, }, "finish_reason": "stop" + }], + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": full_tokens } - ] - } - - # Initialize a list for new prompt_id or append to existing one - if prompt_id not in partial_responses: - partial_responses[prompt_id] = [] - partial_responses[prompt_id].append(partial_response_data) - else:# Construct the response based on the format - response_data = { - "id": f"chatcmpl-{prompt_id}", - "object": "chat.completion", - "created": int(time.time()), - "model": repo_str, - "choices": [{ - "index": 0, - "message": { - "role": "assistant", - "content": output, - }, - "finish_reason": "stop" - }], - "usage": { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": full_tokens } - } - responses[prompt_id] = response_data + responses[eos_prompt_id] = response_data + + # Clean up + input_ids.pop(i) + input_prompts.pop(i) + generations.pop(i) + caches.pop(i) + streamer.pop(i) + else: # Sleep for a short duration when there's no work time.sleep(0.1) # Sleep for 100 milliseconds @@ -388,8 +436,6 @@ def process_prompts(): prompt_ids.append(prompt_id) caches.append(ncache) streamer.append(stream) - settings_proto.temperature = temperature - settings.append(settings_proto.clone()) # Need individual settings per prompt to support Mirostat future_tokens.append(None) future_logits.append(None) #print("Prompt added to queue: " + str(prompt_id)) From 1c996851d794d9473b7679eb8e7b731e4c733d76 Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Mon, 29 Apr 2024 00:35:12 -0400 Subject: [PATCH 08/11] Main logic done --- README.md | 4 + .../llm_exl2_client_multi.cpython-310.pyc | Bin 0 -> 17443 bytes llm_exl2_client_multi.py | 91 ++++++++---------- 3 files changed, 45 insertions(+), 50 deletions(-) create mode 100644 __pycache__/llm_exl2_client_multi.cpython-310.pyc diff --git a/README.md b/README.md index 482f6ea..81380bd 100644 --- a/README.md +++ b/README.md @@ -64,3 +64,7 @@ pip install flash-attn --no-build-isolation ``` Ubuntu 23 does not require the adding of any repos for the toolkit. 
+ +``` +python llm_exl2_client_multi.py --port=5000 --use_outlines --gpu_split="5" --max_context=512 --repo_str=phi3b +``` \ No newline at end of file diff --git a/__pycache__/llm_exl2_client_multi.cpython-310.pyc b/__pycache__/llm_exl2_client_multi.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..711d0725e3b47b67e90039425d70e44121ba982d GIT binary patch literal 17443 [base85-encoded .pyc contents omitted]
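The README command above starts the server with structured-output support enabled. As a usage illustration, a client request exercising the new constrained-decoding fields might look like the sketch below; it is hypothetical client code, assuming the server started by that command is reachable on localhost:5000 and that the `model` string is accepted as-is by the endpoint.

```
# Hypothetical client sketch for the outlines-enabled /v1/chat/completions endpoint.
# Assumes the server from the README command above is listening on port 5000.
import requests

payload = {
    "model": "phi3b",  # assumed label; the server itself loads the model named by --repo_str/config.ini
    "messages": [{"role": "user", "content": "Is the sky blue? Answer yes or no."}],
    "max_tokens": 8,
    "temperature": 0.1,
    "stream": False,
    # Fields added by this patch series: constrain decoding to one of the given choices.
    "outlines_type": "choices",
    "choices": ["yes", "no"],
}

resp = requests.post("http://localhost:5000/v1/chat/completions", json=payload, timeout=180)
print(resp.json()["choices"][0]["message"]["content"])  # constrained to "yes" or "no"
```

The same payload shape covers the other modes the worker dispatches on: `outlines_type` of "regex" with a `regex` field, "json" with a `json` schema string, or "text" with an optional `stop_at` string.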
Date: Mon, 29 Apr 2024 00:36:31 -0400 Subject: [PATCH 09/11] Remove unnecessary diffs --- README.md | 4 ---- .../llm_exl2_client_multi.cpython-310.pyc | Bin 17443 -> 0 bytes 2 files changed, 4 deletions(-) delete mode 100644 __pycache__/llm_exl2_client_multi.cpython-310.pyc diff --git a/README.md b/README.md index 81380bd..482f6ea 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,3 @@ pip install flash-attn --no-build-isolation ``` Ubuntu 23 does not require the adding of any repos for the toolkit. - - -``` -python llm_exl2_client_multi.py --port=5000 --use_outlines --gpu_split="5" --max_context=512 --repo_str=phi3b -``` \ No newline at end of file diff --git a/__pycache__/llm_exl2_client_multi.cpython-310.pyc b/__pycache__/llm_exl2_client_multi.cpython-310.pyc deleted file mode 100644 index 711d0725e3b47b67e90039425d70e44121ba982d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 17443 [base85-encoded .pyc contents omitted]
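Between the streaming rework in patch 07 and the rename in patch 11, the worker's core logic stays the same, so it may help to see it in one place. The following is a condensed sketch of that dispatch, not a drop-in excerpt; it assumes `model` is the `outlines.models.exl2` instance loaded earlier in the series and that `outlines_dict` carries the request's `outlines_type`, `choices`, `regex`, `json`, and `stop_at` values.

```
# Condensed sketch of the generator dispatch in process_outline_prompts.
import outlines
from outlines.samplers import multinomial

def build_generator(model, outlines_dict, temperature):
    # One sampler per request, mirroring the worker's settings.
    sampler = multinomial(top_k=50, top_p=1.0, temperature=temperature)
    kind = outlines_dict.get("type", "text")
    if kind == "choices":
        return outlines.generate.choice(model, outlines_dict["choices"], sampler=sampler)
    if kind == "json":
        return outlines.generate.json(model, outlines_dict["json"], sampler=sampler)
    if kind == "regex":
        return outlines.generate.regex(model, outlines_dict["regex"], sampler=sampler)
    return outlines.generate.text(model, sampler=sampler)

# The worker consumes the generator token by token so it can emit streaming chunks:
# stream = build_generator(model, outlines_dict, temperature).stream(prompt, stop_at=stop_at, max_tokens=max_tokens)
# generation = "".join(stream)
```

Each request also gets its own ExLlamaV2 cache sized to prompt tokens plus `max_tokens`, which is swapped into `model.cache` before its generator is advanced.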
Date: Mon, 29 Apr 2024 00:37:59 -0400 Subject: [PATCH 10/11] Remove diffs --- llm_exl2_client_multi.py | 1 + 1 file changed, 1 insertion(+) diff --git a/llm_exl2_client_multi.py b/llm_exl2_client_multi.py index 4e93872..fd734ca 100644 --- a/llm_exl2_client_multi.py +++ b/llm_exl2_client_multi.py @@ -736,6 +736,7 @@ async def mainchat(request: ChatCompletionRequest): prompt = await format_prompt_commandr(request.messages) else: prompt = await format_prompt(request.messages) + print(prompt) timeout = 180 # seconds start_time = time.time() From 08380200c9f4a4ba0839720b9af74974d5e331ca Mon Sep 17 00:00:00 2001 From: isamu-isozaki Date: Mon, 29 Apr 2024 18:07:40 -0400 Subject: [PATCH 11/11] Changed file name --- llm_exl2_client_multi.py | 226 +-------- llm_exl2_client_multi_outlines.py | 819 ++++++++++++++++++++++++++++++ 2 files changed, 835 insertions(+), 210 deletions(-) create mode 100644 llm_exl2_client_multi_outlines.py diff --git a/llm_exl2_client_multi.py b/llm_exl2_client_multi.py index fd734ca..1717527 100644 --- a/llm_exl2_client_multi.py +++ b/llm_exl2_client_multi.py @@ -9,26 +9,23 @@ import torch import random from typing import AsyncIterable, List, Generator, Union, Optional -import traceback -from typing import Mapping + import requests import sseclient import subprocess import re -from fastapi import FastAPI, HTTPException +from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import StreamingResponse from pydantic import BaseModel from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextStreamer, TextIteratorStreamer from threading import Thread +from auto_gptq import exllama_set_max_input_length import queue import numpy as np import sys, os -import outlines -from outlines.samplers import multinomial - sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from exllamav2 import( ExLlamaV2, ExLlamaV2Config, ExLlamaV2Cache, ExLlamaV2Cache_8bit, - ExLlamaV2Cache_Q4, ExLlamaV2Tokenizer, ) from exllamav2.generator import ( ExLlamaV2StreamingGenerator, ExLlamaV2Sampler ) import uuid def generate_unique_id(): return uuid.uuid4() class CompletionRequest(BaseModel): model: str prompt: Union[str, List[str]] stop: Optional[Union[str, List[str]]] = None max_tokens: Optional[int] = 100 # default value of 100 temperature: Optional[float] = 0.0 # default value of 0.0 stream: Optional[bool] = False # default value of False best_of: Optional[int] = 1 echo: Optional[bool] = False frequency_penalty: Optional[float] = 0.0 # default value of 0.0 presence_penalty: Optional[float] = 0.0 # default value of 0.0 log_probs: Optional[int] = 0 # default value of 0.0 n: Optional[int] = 1 # default value of 1, batch size suffix: Optional[str] = None top_p: Optional[float] = 0.0 # default value of 0.0 user: Optional[str] = None class Message(BaseModel): role: str content: str class ChatCompletionRequest(BaseModel): model: str messages: List[Message] stop: Optional[Union[str, List[str]]] = None max_tokens: Optional[int] = 100 # default value of 100 temperature: Optional[float] = 0.0 # default value of 0.0 stream: Optional[bool] = False # default value of False frequency_penalty: Optional[float] = 0.0 # default value of 0.0 presence_penalty: Optional[float] = 0.0 # default value of 0.0 log_probs: Optional[int] = 0 # default value of 0.0 n: Optional[int] = 1 # default value of 1, batch size top_p: Optional[float] = 0.0 # default value of 0.0 user: Optional[str] = None - stop_at: Optional[str] = None - outlines_type: Optional[str] = None - choices: Optional[list[str]] = None - regex: Optional[str] = None - json: Optional[str] = None +repo_str = 'commandr-exl2' #repo_str = 'theprofessor-exl2-speculative' parser = argparse.ArgumentParser(description='Run server with specified port.') # Add argument for port with default type as integer parser.add_argument('--port', type=int, help='Port to run the server on.') -parser.add_argument('--use_outlines', action='store_true', help='Use outlines.') -parser.add_argument('--gpu_split', type=str, default="17,19,19,19", help='GPU splits.') -parser.add_argument('--max_context', type=int, default=12288,
help='Context length.') -parser.add_argument('--cache_8bit', action='store_true', help='Use 8 bit cache.') -parser.add_argument('--cache_q4', action='store_true', help='Use 4 bit cache.') -parser.add_argument('--repo_str', type=str, default='commandr-exl2', help='The model repository name') # Parse the arguments args = parser.parse_args() -repo_str = args.repo_str config = configparser.ConfigParser() config.read('config.ini') @@ -127,41 +112,22 @@ class ChatCompletionRequest(BaseModel): dynamic_rope_offset = 0.0 ropescale = 1.0 -max_context = args.max_context +max_context = 12288 config.scale_alpha_value = ropescale config.max_seq_len = max_context base_model_native_max = 4096 -cache_8bit = args.cache_8bit -cache_q4 = args.cache_q4 - -if args.use_outlines: - model = outlines.models.exl2( - config.model_dir, - "cuda", - max_seq_len = config.max_seq_len, - scale_pos_emb = config.scale_pos_emb, - scale_alpha_value = config.scale_alpha_value, - no_flash_attn = config.no_flash_attn, - num_experts_per_token = config.num_experts_per_token, - cache_8bit = cache_8bit, - cache_q4 = cache_q4, - tokenizer_kwargs = {}, - gpu_split = args.gpu_split, # we might be able to make this auto - low_mem = None, - verbose = None - ) -else: - model = ExLlamaV2(config) + +model = ExLlamaV2(config) print("Loading model: " + repo_id) #cache = ExLlamaV2Cache(model, lazy=True, max_seq_len = 20480) #model.load_autosplit(cache) -if not args.use_outlines: - model.load([int(gpu_memory) for gpu_memory in args.gpu_split.split(",")]) +model.load([17,19,19,19]) tokenizer = ExLlamaV2Tokenizer(config) # Cache mode +cache_8bit = True settings_proto = ExLlamaV2Sampler.Settings() settings_proto.temperature = 0 @@ -180,9 +146,6 @@ class ChatCompletionRequest(BaseModel): prompt_ids = [] streamer = [] caches = [] -input_prompts = [] -generators = [] -generations = [] settings = [] future_tokens = [] future_logits = [] @@ -193,7 +156,7 @@ class ChatCompletionRequest(BaseModel): partial_responses = {} max_parallel_seqs = 3 -num_of_gpus = len(args.gpu_split.split(",")) +num_of_gpus = 4 print("*** Loaded.. now Inference...:") @@ -217,144 +180,6 @@ async def stream_response(prompt_id, timeout=180): yield f'data: {{"id":"chatcmpl-{prompt_id}","object":"chat.completion.chunk","created":{int(time.time())},"model":"{repo_str}","choices":[{{"index":0,"delta":{{}},"finish_reason":"stop"}}]}}\n\n' break -# Worker thread function -def process_outline_prompts(): - global partial_responses - assert args.use_outlines - assert not use_dynamic_rope_scaling, "Currently ROPE scaling is not supported with outlines" - base_model = model.model - while True: - while not prompts.empty() or len(prompt_ids): - while len(prompt_ids) < max_parallel_seqs and not prompts.empty(): - prompt_id, prompt, max_tokens, stream, temperature, outlines_dict = prompts.get() - print(f"got prompt with outlines dict {outlines_dict}") - sampler = multinomial(top_k=50, top_p=1.0, temperature=temperature) - ids = tokenizer.encode(prompt) - prompt_tokens = ids.shape[-1] - full_tokens = prompt_tokens + max_tokens - print("Processing prompt: " + str(prompt_id) + " Req tokens: " + str(full_tokens)) - # Truncate if new_tokens exceed max_context - if full_tokens > max_context: - # Calculate how many tokens to truncate - ids = tokenizer.encode("Say, 'Prompt exceeds allowed length. 
Please try again.'") - # Update new_tokens after truncation - prompt_tokens = ids.shape[-1] - full_tokens = prompt_tokens + max_tokens - print("Truncating prompt: " + str(prompt_id) + " Req tokens: " + str(full_tokens)) - if cache_8bit: - ncache = ExLlamaV2Cache_8bit(base_model, lazy=not base_model.loaded, max_seq_len = full_tokens) # (max_seq_len could be different for each cache) - elif cache_q4: - ncache = ExLlamaV2Cache_Q4(base_model, lazy=not base_model.loaded, max_seq_len = full_tokens) - else: - ncache = ExLlamaV2Cache(base_model, lazy=not base_model.loaded, max_seq_len = full_tokens) # (max_seq_len could be different for each cache) - model.cache = ncache - model.past_seq = None - stop_at = outlines_dict["stop_at"] - if outlines_dict["type"] == "choices": - generator = outlines.generate.choice(model, outlines_dict["choices"], sampler=sampler) - elif outlines_dict["type"] == "json": - generator = outlines.generate.json(model, outlines_dict["json"], sampler=sampler) - elif outlines_dict["type"] == "regex": - generator = outlines.generate.regex(model, outlines_dict["regex"], sampler=sampler) - else: - generator = outlines.generate.text(model, sampler=sampler) - generators.append(generator.stream(prompt, stop_at=stop_at, max_tokens=max_tokens)) - prompt_ids.append(prompt_id) - input_prompts.append(prompt) - generations.append("") - caches.append(ncache) - streamer.append(stream) - if(len(prompt_ids)): - eos = [] - for i in range(len(prompt_ids)): - model.cache = caches[i] - is_finished = False - try: - decoded_response_token = next(generators[i]) - generations[i] += decoded_response_token - except StopIteration: - is_finished = True - reason = None - if(streamer[i]): - outcontent = decoded_response_token - if is_finished: - outcontent = "" - reason = "stop" - partial_response_data = { - "id": f"chatcmpl-{prompt_ids[i]}", - "object": "chat.completion.chunk", - "created": int(time.time()), - "model": repo_str, - "choices": [ - { - "index": 0, - "delta": { - "content": outcontent - }, - "finish_reason": reason - } - ] - } - - # Initialize a list for new prompt_id or append to existing one - if prompt_ids[i] not in partial_responses: - partial_responses[prompt_ids[i]] = [] - partial_responses[prompt_ids[i]].append(partial_response_data) - - if is_finished: - eos.insert(0, i) - - # Generate and store response - for i in eos: - output = generations[i].strip() - prompt = input_prompts[i] - #output = tokenizer.decode(input_ids[i])[0] - print("-----") - print(output) - generated_text = output - # Calculate token counts - completion_tokens = (tokenizer.encode(generated_text)).shape[-1] - prompt_tokens = (tokenizer.encode(prompt)).shape[-1] - full_tokens = completion_tokens + prompt_tokens - eos_prompt_id = prompt_ids.pop(i) - if(streamer[i]): - ## Generator, yield here.. 
- partial_response_data = { - "finish_reason": "stop" - } - - responses[eos_prompt_id] = partial_response_data - else:# Construct the response based on the format - response_data = { - "id": f"chatcmpl-{prompt_id}", - "object": "chat.completion", - "created": int(time.time()), - "model": repo_str, - "choices": [{ - "index": 0, - "message": { - "role": "assistant", - "content": generated_text, - }, - "finish_reason": "stop" - }], - "usage": { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": full_tokens - } - } - responses[eos_prompt_id] = response_data - # Clean up - input_prompts.pop(i) - generations.pop(i) - caches.pop(i) - streamer.pop(i) - - else: - # Sleep for a short duration when there's no work - time.sleep(0.1) # Sleep for 100 milliseconds - # Worker thread function def process_prompts(): @@ -428,6 +253,8 @@ def process_prompts(): prompt_ids.append(prompt_id) caches.append(ncache) streamer.append(stream) + settings_proto.temperature = temperature + settings.append(settings_proto.clone()) # Need individual settings per prompt to support Mirostat future_tokens.append(None) future_logits.append(None) #print("Prompt added to queue: " + str(prompt_id)) @@ -536,6 +363,7 @@ def process_prompts(): "total_tokens": full_tokens } } + responses[eos_prompt_id] = response_data # Clean up @@ -554,10 +382,8 @@ def process_prompts(): - - # Start worker thread -worker = Thread(target=process_prompts if not args.use_outlines else process_outline_prompts) +worker = Thread(target=process_prompts) worker.start() @@ -716,6 +542,7 @@ async def format_prompt_commandr(messages): @app.post('/v1/chat/completions') async def mainchat(request: ChatCompletionRequest): + try: prompt = '' if repo_str == 'Phind-CodeLlama-34B-v2': @@ -741,27 +568,7 @@ async def mainchat(request: ChatCompletionRequest): timeout = 180 # seconds start_time = time.time() prompt_id = generate_unique_id() # Replace with a function to generate unique IDs - outlines_dict = {} - if request.stop_at is not None: - outlines_dict["stop_at"] = request.stop_at - if request.outlines_type is not None: - outlines_dict["type"] = request.outlines_type - if outlines_dict["type"] == "choices": - assert request.choices is not None - outlines_dict["choices"] = request.choices - elif outlines_dict["type"] == "json": - assert request.json is not None - outlines_dict["json"] = request.json - elif outlines_dict["type"] == "regex": - assert request.regex is not None - outlines_dict["regex"] = request.regex - else: - assert (outlines_dict["type"] == "text") or not args.outlines - if not args.use_outlines: - prompts.put((prompt_id, prompt, request.max_tokens, request.stream, request.temperature)) - else: - prompts.put((prompt_id, prompt, request.max_tokens, request.stream, request.temperature, outlines_dict)) - + prompts.put((prompt_id, prompt, request.max_tokens, request.stream, request.temperature)) if request.stream: #response = StreamingResponse(streaming_request(prompt, request.max_tokens, tempmodel=repo_str, response_format='chat_completion'), media_type="text/event-stream") @@ -777,7 +584,6 @@ async def mainchat(request: ChatCompletionRequest): return responses.pop(prompt_id) except Exception as e: - print(traceback.format_exc()) raise HTTPException(status_code=500, detail=str(e)) return response diff --git a/llm_exl2_client_multi_outlines.py b/llm_exl2_client_multi_outlines.py new file mode 100644 index 0000000..fd734ca --- /dev/null +++ b/llm_exl2_client_multi_outlines.py @@ -0,0 +1,819 @@ +import asyncio 
+import json +import os +import logging +import time +import configparser +import argparse +import tiktoken +import torch +import random +from typing import AsyncIterable, List, Generator, Union, Optional +import traceback +from typing import Mapping +import requests +import sseclient +import subprocess +import re + +from fastapi import FastAPI, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import StreamingResponse +from pydantic import BaseModel +from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, TextStreamer, TextIteratorStreamer +from threading import Thread +import queue +import numpy as np + +import sys, os +import outlines +from outlines.samplers import multinomial + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from exllamav2 import( + ExLlamaV2, + ExLlamaV2Config, + ExLlamaV2Cache, + ExLlamaV2Cache_8bit, + ExLlamaV2Cache_Q4, + ExLlamaV2Tokenizer, +) + +from exllamav2.generator import ( + ExLlamaV2StreamingGenerator, + ExLlamaV2Sampler +) +import uuid + +def generate_unique_id(): + return uuid.uuid4() + +class CompletionRequest(BaseModel): + model: str + prompt: Union[str, List[str]] + stop: Optional[Union[str, List[str]]] = None + max_tokens: Optional[int] = 100 # default value of 100 + temperature: Optional[float] = 0.0 # default value of 0.0 + stream: Optional[bool] = False # default value of False + best_of: Optional[int] = 1 + echo: Optional[bool] = False + frequency_penalty: Optional[float] = 0.0 # default value of 0.0 + presence_penalty: Optional[float] = 0.0 # default value of 0.0 + log_probs: Optional[int] = 0 # default value of 0.0 + n: Optional[int] = 1 # default value of 1, batch size + suffix: Optional[str] = None + top_p: Optional[float] = 0.0 # default value of 0.0 + user: Optional[str] = None + +class Message(BaseModel): + role: str + content: str + +class ChatCompletionRequest(BaseModel): + model: str + messages: List[Message] + stop: Optional[Union[str, List[str]]] = None + max_tokens: Optional[int] = 100 # default value of 100 + temperature: Optional[float] = 0.0 # default value of 0.0 + stream: Optional[bool] = False # default value of False + frequency_penalty: Optional[float] = 0.0 # default value of 0.0 + presence_penalty: Optional[float] = 0.0 # default value of 0.0 + log_probs: Optional[int] = 0 # default value of 0.0 + n: Optional[int] = 1 # default value of 1, batch size + top_p: Optional[float] = 0.0 # default value of 0.0 + user: Optional[str] = None + stop_at: Optional[str] = None + outlines_type: Optional[str] = None + choices: Optional[list[str]] = None + regex: Optional[str] = None + json: Optional[str] = None + +#repo_str = 'theprofessor-exl2-speculative' + +parser = argparse.ArgumentParser(description='Run server with specified port.') + +# Add argument for port with default type as integer +parser.add_argument('--port', type=int, help='Port to run the server on.') +parser.add_argument('--use_outlines', action='store_true', help='Use outlines.') +parser.add_argument('--gpu_split', type=str, default="17,19,19,19", help='GPU splits.') +parser.add_argument('--max_context', type=int, default=12288, help='Context length.') +parser.add_argument('--cache_8bit', action='store_true', help='Use 8 bit cache.') +parser.add_argument('--cache_q4', action='store_true', help='Use 4 bit cache.') +parser.add_argument('--repo_str', type=str, default='commandr-exl2', help='The model repository name') + +# Parse the arguments +args = parser.parse_args() +repo_str = 
args.repo_str + +config = configparser.ConfigParser() +config.read('config.ini') + +repo_id = config.get(repo_str, 'repo') +host = config.get('settings', 'host') + +port = args.port if args.port is not None else config.getint('settings', 'port') + +# only allow one client at a time +busy = False +condition = asyncio.Condition() + +config = ExLlamaV2Config() +config.model_dir = repo_id +config.prepare() + +use_dynamic_rope_scaling = False +dynamic_rope_mult = 1.5 +dynamic_rope_offset = 0.0 + +ropescale = 1.0 +max_context = args.max_context +config.scale_alpha_value = ropescale +config.max_seq_len = max_context +base_model_native_max = 4096 +cache_8bit = args.cache_8bit +cache_q4 = args.cache_q4 + +if args.use_outlines: + model = outlines.models.exl2( + config.model_dir, + "cuda", + max_seq_len = config.max_seq_len, + scale_pos_emb = config.scale_pos_emb, + scale_alpha_value = config.scale_alpha_value, + no_flash_attn = config.no_flash_attn, + num_experts_per_token = config.num_experts_per_token, + cache_8bit = cache_8bit, + cache_q4 = cache_q4, + tokenizer_kwargs = {}, + gpu_split = args.gpu_split, # we might be able to make this auto + low_mem = None, + verbose = None + ) +else: + model = ExLlamaV2(config) +print("Loading model: " + repo_id) +#cache = ExLlamaV2Cache(model, lazy=True, max_seq_len = 20480) +#model.load_autosplit(cache) +if not args.use_outlines: + model.load([int(gpu_memory) for gpu_memory in args.gpu_split.split(",")]) + +tokenizer = ExLlamaV2Tokenizer(config) + +# Cache mode + + +settings_proto = ExLlamaV2Sampler.Settings() +settings_proto.temperature = 0 +settings_proto.top_k = 50 +settings_proto.top_p = 0.8 +settings_proto.top_a = 0.0 +settings_proto.token_repetition_penalty = 1.1 +#settings.disallow_tokens(tokenizer, [tokenizer.eos_token_id]) + +# Active sequences and corresponding caches and settings +prompts = queue.Queue() +responses = {} + +input_ids = [] +prompt_length = [] +prompt_ids = [] +streamer = [] +caches = [] +input_prompts = [] +generators = [] +generations = [] +settings = [] +future_tokens = [] +future_logits = [] +sin_arr = [] +cos_arr = [] + +# Global variable for storing partial responses +partial_responses = {} + +max_parallel_seqs = 3 +num_of_gpus = len(args.gpu_split.split(",")) + +print("*** Loaded.. 
now Inference...:") + +app = FastAPI(title="EXL2") + +async def stream_response(prompt_id, timeout=180): + global partial_responses + while True: + await asyncio.sleep(0.05) # Sleep to yield control to the event loop + + # Check if prompt_id exists in partial_responses + if prompt_id in partial_responses: + # Stream partial responses + while partial_responses[prompt_id]: + response_chunk = partial_responses[prompt_id].pop(0) + yield f"data: {json.dumps(response_chunk)}\n\n" + + # Check for final response or timeout + if prompt_id in responses: + final_response = responses.pop(prompt_id) + yield f'data: {{"id":"chatcmpl-{prompt_id}","object":"chat.completion.chunk","created":{int(time.time())},"model":"{repo_str}","choices":[{{"index":0,"delta":{{}},"finish_reason":"stop"}}]}}\n\n' + break + +# Worker thread function +def process_outline_prompts(): + global partial_responses + assert args.use_outlines + assert not use_dynamic_rope_scaling, "Currently ROPE scaling is not supported with outlines" + base_model = model.model + while True: + while not prompts.empty() or len(prompt_ids): + while len(prompt_ids) < max_parallel_seqs and not prompts.empty(): + prompt_id, prompt, max_tokens, stream, temperature, outlines_dict = prompts.get() + print(f"got prompt with outlines dict {outlines_dict}") + sampler = multinomial(top_k=50, top_p=1.0, temperature=temperature) + ids = tokenizer.encode(prompt) + prompt_tokens = ids.shape[-1] + full_tokens = prompt_tokens + max_tokens + print("Processing prompt: " + str(prompt_id) + " Req tokens: " + str(full_tokens)) + # Truncate if new_tokens exceed max_context + if full_tokens > max_context: + # Calculate how many tokens to truncate + ids = tokenizer.encode("Say, 'Prompt exceeds allowed length. Please try again.'") + # Update new_tokens after truncation + prompt_tokens = ids.shape[-1] + full_tokens = prompt_tokens + max_tokens + print("Truncating prompt: " + str(prompt_id) + " Req tokens: " + str(full_tokens)) + if cache_8bit: + ncache = ExLlamaV2Cache_8bit(base_model, lazy=not base_model.loaded, max_seq_len = full_tokens) # (max_seq_len could be different for each cache) + elif cache_q4: + ncache = ExLlamaV2Cache_Q4(base_model, lazy=not base_model.loaded, max_seq_len = full_tokens) + else: + ncache = ExLlamaV2Cache(base_model, lazy=not base_model.loaded, max_seq_len = full_tokens) # (max_seq_len could be different for each cache) + model.cache = ncache + model.past_seq = None + stop_at = outlines_dict["stop_at"] + if outlines_dict["type"] == "choices": + generator = outlines.generate.choice(model, outlines_dict["choices"], sampler=sampler) + elif outlines_dict["type"] == "json": + generator = outlines.generate.json(model, outlines_dict["json"], sampler=sampler) + elif outlines_dict["type"] == "regex": + generator = outlines.generate.regex(model, outlines_dict["regex"], sampler=sampler) + else: + generator = outlines.generate.text(model, sampler=sampler) + generators.append(generator.stream(prompt, stop_at=stop_at, max_tokens=max_tokens)) + prompt_ids.append(prompt_id) + input_prompts.append(prompt) + generations.append("") + caches.append(ncache) + streamer.append(stream) + if(len(prompt_ids)): + eos = [] + for i in range(len(prompt_ids)): + model.cache = caches[i] + is_finished = False + try: + decoded_response_token = next(generators[i]) + generations[i] += decoded_response_token + except StopIteration: + is_finished = True + reason = None + if(streamer[i]): + outcontent = decoded_response_token + if is_finished: + outcontent = "" + reason = "stop" + 
partial_response_data = { + "id": f"chatcmpl-{prompt_ids[i]}", + "object": "chat.completion.chunk", + "created": int(time.time()), + "model": repo_str, + "choices": [ + { + "index": 0, + "delta": { + "content": outcontent + }, + "finish_reason": reason + } + ] + } + + # Initialize a list for new prompt_id or append to existing one + if prompt_ids[i] not in partial_responses: + partial_responses[prompt_ids[i]] = [] + partial_responses[prompt_ids[i]].append(partial_response_data) + + if is_finished: + eos.insert(0, i) + + # Generate and store response + for i in eos: + output = generations[i].strip() + prompt = input_prompts[i] + #output = tokenizer.decode(input_ids[i])[0] + print("-----") + print(output) + generated_text = output + # Calculate token counts + completion_tokens = (tokenizer.encode(generated_text)).shape[-1] + prompt_tokens = (tokenizer.encode(prompt)).shape[-1] + full_tokens = completion_tokens + prompt_tokens + eos_prompt_id = prompt_ids.pop(i) + if(streamer[i]): + ## Generator, yield here.. + partial_response_data = { + "finish_reason": "stop" + } + + responses[eos_prompt_id] = partial_response_data + else:# Construct the response based on the format + response_data = { + "id": f"chatcmpl-{prompt_id}", + "object": "chat.completion", + "created": int(time.time()), + "model": repo_str, + "choices": [{ + "index": 0, + "message": { + "role": "assistant", + "content": generated_text, + }, + "finish_reason": "stop" + }], + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": full_tokens + } + } + responses[eos_prompt_id] = response_data + # Clean up + input_prompts.pop(i) + generations.pop(i) + caches.pop(i) + streamer.pop(i) + + else: + # Sleep for a short duration when there's no work + time.sleep(0.1) # Sleep for 100 milliseconds + + +# Worker thread function +def process_prompts(): + global partial_responses + + while True: + while not prompts.empty() or len(input_ids): + while len(input_ids) < max_parallel_seqs and not prompts.empty(): + prompt_id, prompt, max_tokens, stream, temperature = prompts.get() + ids = tokenizer.encode(prompt) + prompt_tokens = ids.shape[-1] + new_tokens = prompt_tokens + max_tokens + print("Processing prompt: " + str(prompt_id) + " Req tokens: " + str(new_tokens)) + # Truncate if new_tokens exceed max_context + if new_tokens > max_context: + # Calculate how many tokens to truncate + ids = tokenizer.encode("Say, 'Prompt exceeds allowed length. 
Please try again.'") + # Update new_tokens after truncation + prompt_tokens = ids.shape[-1] + new_tokens = prompt_tokens + max_tokens + print("Truncating prompt: " + str(prompt_id) + " Req tokens: " + str(new_tokens)) + prompt_length.append(prompt_tokens) + if use_dynamic_rope_scaling: + # Dynamic Rope Scaling + head_dim = model.config.head_dim + model_base = model.config.rotary_embedding_base + ratio = new_tokens / base_model_native_max + alpha = 1.0 + ropesin = [None] * num_of_gpus + ropecos = [None] * num_of_gpus + if ratio > 1.0: + alpha = ((0.2500*ratio**2) + (0.3500*ratio) + 0.4000)*dynamic_rope_mult + dynamic_rope_offset + print("DYNAMIC ROPE SCALE Alpha: " + str(alpha) + " Ratio: " + str(ratio)) + + for g in range(num_of_gpus): + base = model_base + try: + tensors = model.get_device_tensors(g) + except IndexError: + tensors = None + + if tensors is not None: + if alpha != 1.0: base *= alpha ** (model.config.head_dim / (model.config.head_dim - 2)) + + inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2, device = "cuda:"+str(g)).float() / head_dim)) + t = torch.arange(model.config.max_seq_len, device = "cuda:"+str(g), dtype = torch.float32) + + freqs = torch.einsum("i,j->ij", t, inv_freq) + emb = torch.cat((freqs, freqs), dim=-1) + + ropesin[g] = emb.sin()[None, None, :, :].half() + ropecos[g] = emb.cos()[None, None, :, :].half() + + tensors.sin = ropesin[g] + tensors.cos = ropecos[g] + + if cache_8bit: + ncache = ExLlamaV2Cache_8bit(model, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) + else: + ncache = ExLlamaV2Cache(model, max_seq_len = new_tokens) # (max_seq_len could be different for each cache) + + #print("Setting up Cache: " + str(prompt_id)) + + if use_dynamic_rope_scaling: + sin_arr.append(ropesin) + cos_arr.append(ropecos) + + model.forward(ids[:, :-1], ncache, preprocess_only = True) + print("Cache setup: " + str(np.shape(ids[:1, :-1]))) + input_ids.append(ids) + prompt_ids.append(prompt_id) + caches.append(ncache) + streamer.append(stream) + future_tokens.append(None) + future_logits.append(None) + #print("Prompt added to queue: " + str(prompt_id)) + + # Create a batch tensor of the last token in each active sequence, forward through the model using the list of + # active caches rather than a single, batched cache. Then sample for each token indidividually with some + # arbitrary stop condition + if(len(input_ids)): + #inputs = torch.cat([x[:, -1:] for x in input_ids], dim = 0) + #logits = model.forward(inputs, caches, input_mask = None).float().cpu() + eos = [] + r = random.random() + for i in range(len(input_ids)): + # if using dynamic rope + if use_dynamic_rope_scaling: + for g in range(num_of_gpus): + if sin_arr[i][g] is not None and cos_arr[i][g] is not None: + tensors = model.get_device_tensors(g) + tensors.sin = sin_arr[i][g] + tensors.cos = cos_arr[i][g] + + logits = model.forward(input_ids[i][:, -1:], caches[i], input_mask = None ).float().cpu() + token, _, _, _, _ = ExLlamaV2Sampler.sample(logits[:, :1, :], settings[i], input_ids[i], r, tokenizer) + + input_ids[i] = torch.cat([input_ids[i], token], dim = 1) + + new_text = tokenizer.decode(input_ids[i][:, -2:-1], decode_special_tokens=False)[0] + new_text2 = tokenizer.decode(input_ids[i][:, -2:], decode_special_tokens=False)[0] + if '�' in new_text: + diff = new_text2 + else: + diff = new_text2[len(new_text):] + + if '�' in diff: + diff = "" + + #print(diff) + reason = None + if(streamer[i]): + ## Generator, yield here.. 
+ outcontent = diff + if diff == """<|im_end|>""": + outcontent = "" + reason = "stop" + partial_response_data = { + "id": f"chatcmpl-{prompt_ids[i]}", + "object": "chat.completion.chunk", + "created": int(time.time()), + "model": repo_str, + "choices": [ + { + "index": 0, + "delta": { + "content": outcontent + }, + "finish_reason": reason + } + ] + } + + # Initialize a list for new prompt_id or append to existing one + if prompt_ids[i] not in partial_responses: + partial_responses[prompt_ids[i]] = [] + partial_responses[prompt_ids[i]].append(partial_response_data) + + if token.item() == tokenizer.eos_token_id or diff == """<|im_end|>""" or caches[i].current_seq_len == caches[i].max_seq_len: + eos.insert(0, i) + + # Generate and store response + for i in eos: + generated_part = input_ids[i][:, prompt_length[i]:] + output = tokenizer.decode(generated_part[0]).strip() + #output = tokenizer.decode(input_ids[i])[0] + print("-----") + print(output) + generated_text = output + # Calculate token counts + completion_tokens = (tokenizer.encode(generated_text)).shape[-1] + prompt_tokens = (tokenizer.encode(prompt)).shape[-1] + full_tokens = completion_tokens + prompt_tokens + eos_prompt_id = prompt_ids.pop(i) + if(streamer[i]): + ## Generator, yield here.. + partial_response_data = { + "finish_reason": "stop" + } + + responses[eos_prompt_id] = partial_response_data + else:# Construct the response based on the format + response_data = { + "id": f"chatcmpl-{prompt_id}", + "object": "chat.completion", + "created": int(time.time()), + "model": repo_str, + "choices": [{ + "index": 0, + "message": { + "role": "assistant", + "content": generated_text, + }, + "finish_reason": "stop" + }], + "usage": { + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": full_tokens + } + } + responses[eos_prompt_id] = response_data + + # Clean up + input_ids.pop(i) + caches.pop(i) + settings.pop(i) + prompt_length.pop(i) + streamer.pop(i) + if use_dynamic_rope_scaling: + cos_arr.pop(i) + sin_arr.pop(i) + + else: + # Sleep for a short duration when there's no work + time.sleep(0.1) # Sleep for 100 milliseconds + + + + + +# Start worker thread +worker = Thread(target=process_prompts if not args.use_outlines else process_outline_prompts) +worker.start() + + +async def format_prompt(messages): + formatted_prompt = "" + for message in messages: + if message.role == "system": + formatted_prompt += f"{message.content}\n\n" + elif message.role == "user": + formatted_prompt += f"### User:\n{message.content}\n\n" + elif message.role == "assistant": + formatted_prompt += f"### Assistant:\n{message.content}\n\n" + # Add the final "### Assistant:\n" to prompt for the next response + formatted_prompt += "### Assistant:\n" + return formatted_prompt + +async def format_prompt_yi(messages): + formatted_prompt = "" + system_message_found = False + + # Check for a system message first + for message in messages: + if message.role == "system": + system_message_found = True + break + + # If no system message was found, prepend a default one + if not system_message_found: + formatted_prompt = "<|im_start|>system\nYou are a helpful AI assistant.<|im_end|>\n" + for message in messages: + if message.role == "system": + formatted_prompt += f"<|im_start|>system\n{message.content}<|im_end|>\n" + elif message.role == "user": + formatted_prompt += f"<|im_start|>user\n{message.content}<|im_end|>\n" + elif message.role == "assistant": + formatted_prompt += f"<|im_start|>assistant\n{message.content}<|im_end|>\n" + # 
Add the final "### Assistant:\n" to prompt for the next response + formatted_prompt += "<|im_start|>assistant\n" + return formatted_prompt + +async def format_prompt_nous(messages): + formatted_prompt = "" + for message in messages: + if message.role == "system": + formatted_prompt += f"{message.content}\n" + elif message.role == "user": + formatted_prompt += f"USER: {message.content}\n" + elif message.role == "assistant": + formatted_prompt += f"ASSISTANT: {message.content}\n" + # Add the final "### Assistant:\n" to prompt for the next response + formatted_prompt += "ASSISTANT: " + return formatted_prompt + +async def format_prompt_tess(messages): + formatted_prompt = "" + for message in messages: + if message.role == "system": + formatted_prompt += f"SYSTEM: {message.content}\n" + elif message.role == "user": + formatted_prompt += f"USER: {message.content}\n" + elif message.role == "assistant": + formatted_prompt += f"ASSISTANT: {message.content}\n" + # Add the final "### Assistant:\n" to prompt for the next response + formatted_prompt += "ASSISTANT: " + return formatted_prompt + +async def format_prompt_code(messages): + formatted_prompt = "" + for message in messages: + if message.role == "system": + formatted_prompt += f"### System Prompt\nYou are an intelligent programming assistant.\n\n" + elif message.role == "user": + formatted_prompt += f"### User Message\n{message.content}\n\n" + elif message.role == "assistant": + formatted_prompt += f"### Assistant\n{message.content}\n\n" + # Add the final "### Assistant" with ellipsis to prompt for the next response + formatted_prompt += "### Assistant\n..." + return formatted_prompt + +async def format_prompt_zephyr(messages): + formatted_prompt = "" + for message in messages: + if message.role == "system": + formatted_prompt += f"<|system|>\n{message.content}\n" + elif message.role == "user": + formatted_prompt += f"<|user|>\n{message.content}\n" + elif message.role == "assistant": + formatted_prompt += f"<|assistant|>\n{message.content}\n" + # Add the final "### Assistant:\n" to prompt for the next response + formatted_prompt += "<|assistant|>\n" + return formatted_prompt + +async def format_prompt_starling(messages): + formatted_prompt = "" + system_message = "" + for message in messages: + if message.role == "system": + # Save system message to prepend to the first user message + system_message += f"{message.content}\n\n" + elif message.role == "user": + # Prepend system message if it exists + if system_message: + formatted_prompt += f"GPT4 Correct User: {system_message}{message.content}<|end_of_turn|>" + system_message = "" # Clear system message after prepending + else: + formatted_prompt += f"GPT4 Correct User: {message.content}<|end_of_turn|>" + elif message.role == "assistant": + formatted_prompt += f"GPT4 Correct Assistant: {message.content}<|end_of_turn|>" # Prep for user follow-up + formatted_prompt += "GPT4 Correct Assistant: \n\n" + return formatted_prompt + +async def format_prompt_mixtral(messages): + formatted_prompt = " " + system_message = "" + for message in messages: + if message.role == "system": + # Save system message to prepend to the first user message + system_message += f"{message.content}\n\n" + elif message.role == "user": + # Prepend system message if it exists + if system_message: + formatted_prompt += f"[INST] {system_message}{message.content} [/INST] " + system_message = "" # Clear system message after prepending + else: + formatted_prompt += f"[INST] {message.content} [/INST] " + elif message.role == 
"assistant": + formatted_prompt += f" {message.content} " # Prep for user follow-up + return formatted_prompt + + +async def format_prompt_commandr(messages): + formatted_prompt = "" + system_message_found = False + + # Check for a system message first + for message in messages: + if message.role == "system": + system_message_found = True + break + + # If no system message was found, prepend a default one + if not system_message_found: + formatted_prompt += f"<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{message.content}<|END_OF_TURN_TOKEN|>" + + for message in messages: + if message.role == "system": + formatted_prompt += f"<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{message.content}<|END_OF_TURN_TOKEN|>" + elif message.role == "user": + formatted_prompt += f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{message.content}<|END_OF_TURN_TOKEN|>" + elif message.role == "assistant": + formatted_prompt += f"<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{message.content}<|END_OF_TURN_TOKEN|>" + # Add the final "### Assistant:\n" to prompt for the next response + formatted_prompt += "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>" + return formatted_prompt + + +@app.post('/v1/chat/completions') +async def mainchat(request: ChatCompletionRequest): + try: + prompt = '' + if repo_str == 'Phind-CodeLlama-34B-v2': + prompt = await format_prompt_code(request.messages) + elif repo_str == 'zephyr-7b-beta': + prompt = await format_prompt_zephyr(request.messages) + elif repo_str == 'Starling-LM-7B-alpha': + prompt = await format_prompt_starling(request.messages) + elif repo_str == 'Mixtral-8x7B-Instruct-v0.1-GPTQ': + prompt = await format_prompt_mixtral(request.messages) + elif repo_str == 'Yi-34B-Chat-GPTQ' or repo_str == 'Nous-Hermes-2-Yi-34B-GPTQ' or repo_str == 'theprofessor-exl2-speculative' or repo_str == 'dbrx-instruct-exl2': + prompt = await format_prompt_yi(request.messages) + elif repo_str == 'Nous-Capybara-34B-GPTQ' or repo_str == 'goliath-120b-GPTQ' or repo_str == 'goliath-120b-exl2' or repo_str == 'goliath-120b-exl2-rpcal': + prompt = await format_prompt_nous(request.messages) + elif repo_str == 'tess-xl-exl2' or repo_str == 'tess-xl-exl2-speculative': + prompt = await format_prompt_tess(request.messages) + elif repo_str == 'commandr-exl2' or repo_str == 'commandr-exl2-speculative': + prompt = await format_prompt_commandr(request.messages) + else: + prompt = await format_prompt(request.messages) + print(prompt) + + timeout = 180 # seconds + start_time = time.time() + prompt_id = generate_unique_id() # Replace with a function to generate unique IDs + outlines_dict = {} + if request.stop_at is not None: + outlines_dict["stop_at"] = request.stop_at + if request.outlines_type is not None: + outlines_dict["type"] = request.outlines_type + if outlines_dict["type"] == "choices": + assert request.choices is not None + outlines_dict["choices"] = request.choices + elif outlines_dict["type"] == "json": + assert request.json is not None + outlines_dict["json"] = request.json + elif outlines_dict["type"] == "regex": + assert request.regex is not None + outlines_dict["regex"] = request.regex + else: + assert (outlines_dict["type"] == "text") or not args.outlines + if not args.use_outlines: + prompts.put((prompt_id, prompt, request.max_tokens, request.stream, request.temperature)) + else: + prompts.put((prompt_id, prompt, request.max_tokens, request.stream, request.temperature, outlines_dict)) + + + if request.stream: + #response = StreamingResponse(streaming_request(prompt, request.max_tokens, tempmodel=repo_str, 
response_format='chat_completion'), media_type="text/event-stream") + return StreamingResponse(stream_response(prompt_id), media_type="text/event-stream") + else: + #response_data = non_streaming_request(prompt, request.max_tokens, tempmodel=repo_str, response_format='chat_completion') + #response = response_data # This will return a JSON response + while prompt_id not in responses: + await asyncio.sleep(0.1) # Sleep to yield control to the event loop + if time.time() - start_time > timeout: + return {"error": "Response timeout"} + + return responses.pop(prompt_id) + + except Exception as e: + print(traceback.format_exc()) + raise HTTPException(status_code=500, detail=str(e)) + + return response + + + + +@app.get('/ping') +async def get_status(): + return {"ping": sum(prompt_length)} + +@app.get("/nvidia-smi") +async def get_nvidia_smi(): + # Execute the nvidia-smi command + result = subprocess.run( + ["nvidia-smi", "--query-gpu=utilization.gpu,memory.used,memory.total", "--format=csv,noheader"], + capture_output=True, text=True + ) + nvidia_smi_output = result.stdout.strip() # Remove any extra whitespace + # Split the output by lines and then by commas + gpu_data = [] + for line in nvidia_smi_output.split("\n"): + utilization, memory_used, memory_total = line.split(", ") + # Strip the '%' and 'MiB' and convert to appropriate types + utilization = float(utilization.strip(' %')) + memory_used = int(memory_used.strip(' MiB')) + memory_total = int(memory_total.strip(' MiB')) + gpu_data.append({ + "utilization": utilization, + "memory_used": memory_used, + "memory_total": memory_total + }) + return gpu_data + + +if __name__ == "__main__": + import uvicorn + + uvicorn.run(app, host=host, port=port, log_level="debug")
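
Example usage (not part of the patches above): the request fields that llm_exl2_client_multi_outlines.py adds to ChatCompletionRequest (outlines_type, choices, regex, json, stop_at) are easiest to see from the client side. Below is a minimal sketch of two calls against the new server, assuming it was started with --use_outlines and is reachable at localhost:8000; the host, port, model string, prompts, and constraint values are illustrative placeholders, not values taken from the patches. Both payloads always supply outlines_type and stop_at, since mainchat only copies those fields into outlines_dict when they are not None while process_outline_prompts reads outlines_dict["type"] and outlines_dict["stop_at"] without defaults.

# sketch_outlines_client.py -- illustrative only; assumes the server from
# llm_exl2_client_multi_outlines.py is running with --use_outlines.
import requests

BASE_URL = "http://localhost:8000"  # assumed host/port (see config.ini / --port)
MODEL = "commandr-exl2"             # matches the default --repo_str; used only as a label

# 1) Constrain the completion to a fixed set of choices.
choice_payload = {
    "model": MODEL,
    "messages": [
        {"role": "system", "content": "You are a sentiment classifier."},
        {"role": "user", "content": "I loved this movie!"},
    ],
    "max_tokens": 10,
    "temperature": 0.7,
    "outlines_type": "choices",
    "choices": ["positive", "negative", "neutral"],
    "stop_at": "\n",
}
print(requests.post(f"{BASE_URL}/v1/chat/completions", json=choice_payload).json())

# 2) Constrain the completion with a regular expression (a five-digit number here).
regex_payload = {
    "model": MODEL,
    "messages": [{"role": "user", "content": "Give me a ZIP code."}],
    "max_tokens": 10,
    "temperature": 0.7,
    "outlines_type": "regex",
    "regex": r"\d{5}",
    "stop_at": "\n",
}
print(requests.post(f"{BASE_URL}/v1/chat/completions", json=regex_payload).json())

With "stream": true in either payload, the same request would instead be served through stream_response as server-sent events rather than a single JSON body.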