From be385ef9f4d708b7b272ab3ba79db824c37b3c6c Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Thu, 13 Jun 2024 12:12:59 -0700 Subject: [PATCH 01/22] runpod --- runpod/Dockerfile | 75 +++++++++ runpod/builder/setup.sh | 23 +++ runpod/src/entrypoint.sh | 45 +++++ runpod/src/handler.py | 91 ++++++++++ runpod/src/server.py | 352 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 586 insertions(+) create mode 100644 runpod/Dockerfile create mode 100644 runpod/builder/setup.sh create mode 100644 runpod/src/entrypoint.sh create mode 100644 runpod/src/handler.py create mode 100644 runpod/src/server.py diff --git a/runpod/Dockerfile b/runpod/Dockerfile new file mode 100644 index 000000000..3fee1dd51 --- /dev/null +++ b/runpod/Dockerfile @@ -0,0 +1,75 @@ +# Base image +FROM ghcr.io/predibase/lorax:0.10.0 +ENV DEBIAN_FRONTEND=noninteractive + +# Set the working directory +WORKDIR / + +# Update and upgrade the system packages (Worker Template) +COPY builder/setup.sh /setup.sh +RUN /bin/bash /setup.sh && \ + rm /setup.sh + +# Install Python dependencies (Worker Template) +# COPY builder/requirements.txt /requirements.txt +# RUN python3 -m pip install --upgrade pip && \ +# python3 -m pip install --upgrade -r /requirements.txt --no-cache-dir && \ +# rm /requirements.txt + +# Add src files (Worker Template) +ADD src . + +# Whether to download the model into /runpod-volume or not. +ARG DOWNLOAD_MODEL= +ENV DOWNLOAD_MODEL=$DOWNLOAD_MODEL + +# Set environment variables +ARG HF_MODEL_ID= +ENV HF_MODEL_ID=$HF_MODEL_ID + +ARG HF_MODEL_REVISION= +ENV HF_MODEL_REVISION=$HF_MODEL_REVISION + +ARG SM_NUM_GPUS= +ENV SM_NUM_GPUS=$SM_NUM_GPUS + +ARG HF_MODEL_QUANTIZE= +ENV HF_MODEL_QUANTIZE=$HF_MODEL_QUANTIZE + +ARG HF_MODEL_TRUST_REMOTE_CODE= +ENV HF_MODEL_TRUST_REMOTE_CODE=$HF_MODEL_TRUST_REMOTE_CODE + +ARG MODEL_BASE_PATH="/runpod-volume/" +ENV MODEL_BASE_PATH=$MODEL_BASE_PATH + +ARG HUGGING_FACE_HUB_TOKEN= +ENV HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN + +ARG HF_MAX_TOTAL_TOKENS= +ENV HF_MAX_TOTAL_TOKENS=$HF_MAX_TOTAL_TOKENS + +ARG HF_MAX_INPUT_LENGTH= +ENV HF_MAX_INPUT_LENGTH=$HF_MAX_INPUT_LENGTH + +ARG HF_MAX_BATCH_TOTAL_TOKENS= +ENV HF_MAX_BATCH_TOTAL_TOKENS=$HF_MAX_BATCH_TOTAL_TOKENS + +ARG HF_MAX_BATCH_PREFILL_TOKENS= +ENV HF_MAX_BATCH_PREFILL_TOKENS=$HF_MAX_BATCH_PREFILL_TOKENS + +# Prepare the hugging face directories for caching datasets, models, and more. 
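+# (These paths live under /runpod-volume — presumably the RunPod network volume,
+# per the DOWNLOAD_MODEL note above — so cached weights persist across container
+# restarts instead of being re-downloaded on every cold start.)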
+ENV HF_DATASETS_CACHE="/runpod-volume/huggingface-cache/datasets" +ENV HUGGINGFACE_HUB_CACHE="/runpod-volume/huggingface-cache/hub" +ENV TRANSFORMERS_CACHE="/runpod-volume/huggingface-cache/hub" + +# Conditionally download the model weights based on DOWNLOAD_MODEL +RUN if [ "$DOWNLOAD_MODEL" = "1" ]; then \ + lorax-server download-weights $HF_MODEL_ID; \ + fi + +# Quick temporary updates +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 +RUN python3.10 -m pip install git+https://github.com/runpod/runpod-python@a1#egg=runpod --compile +RUN python3.10 -m pip install lorax-client + +ENTRYPOINT ["./entrypoint.sh"] diff --git a/runpod/builder/setup.sh b/runpod/builder/setup.sh new file mode 100644 index 000000000..2b9926ec8 --- /dev/null +++ b/runpod/builder/setup.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Stop script on error +set -e + +# Update System +apt-get update && apt-get upgrade -y + +# Install System Dependencies +# - openssh-server: for ssh access and web terminal +apt-get install -y --no-install-recommends software-properties-common curl git openssh-server + +# Install Python 3.10 +add-apt-repository ppa:deadsnakes/ppa -y +apt-get update && apt-get install -y --no-install-recommends python3.10 python3.10-dev python3.10-distutils +update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 + +# Install pip for Python 3.10 +curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py +python3 get-pip.py + +# Clean up +apt-get autoremove -y && apt-get clean -y && rm -rf /var/lib/apt/lists/* diff --git a/runpod/src/entrypoint.sh b/runpod/src/entrypoint.sh new file mode 100644 index 000000000..09e69a875 --- /dev/null +++ b/runpod/src/entrypoint.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# if [[ -z "${HF_MODEL_ID}" ]]; then +# echo "HF_MODEL_ID must be set" +# exit 1 +# fi +# export MODEL_ID="${HF_MODEL_ID}" + +if [[ -n "${HF_MODEL_REVISION}" ]]; then + export REVISION="${HF_MODEL_REVISION}" +fi + +if [[ -n "${SM_NUM_GPUS}" ]]; then + export NUM_SHARD="${SM_NUM_GPUS}" +fi + +if [[ -n "${HF_MODEL_QUANTIZE}" ]]; then + export QUANTIZE="${HF_MODEL_QUANTIZE}" +fi + +if [[ -n "${HF_MODEL_TRUST_REMOTE_CODE}" ]]; then + export TRUST_REMOTE_CODE="${HF_MODEL_TRUST_REMOTE_CODE}" +fi + +if [[ -n "${HF_MAX_TOTAL_TOKENS}" ]]; then + export MAX_TOTAL_TOKENS="${HF_MAX_TOTAL_TOKENS}" +fi + +if [[ -n "${HF_MAX_INPUT_LENGTH}" ]]; then + export MAX_INPUT_LENGTH="${HF_MAX_INPUT_LENGTH}" +fi + +if [[ -n "${HF_MAX_BATCH_TOTAL_TOKENS}" ]]; then + export MAX_BATCH_TOTAL_TOKENS="${HF_MAX_BATCH_TOTAL_TOKENS}" +fi + +if [[ -n "${HF_MAX_BATCH_PREFILL_TOKENS}" ]]; then + export MAX_BATCH_PREFILL_TOKENS="${HF_MAX_BATCH_PREFILL_TOKENS}" +fi + +# Start the text generation server +nohup lorax-launcher --port 8080 --model-id predibase/Mistral-7B-v0.1-dequantized --adapter-source hub --default-adapter-source pbase --max-batch-prefill-tokens 32768 --max-total-tokens 8192 --max-input-length 8191 --max-concurrent-requests 1024 & + +# Start the handler using python 3.10 +python3.10 -u /handler.py diff --git a/runpod/src/handler.py b/runpod/src/handler.py new file mode 100644 index 000000000..39ac671ca --- /dev/null +++ b/runpod/src/handler.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python +''' Contains the handler function that will be called by the serverless. 
''' + +from typing import Generator +import runpod +import os +import time + +# For download the weights +from lorax import Client + +# Prepare global variables +JOBS = set() +TGI_LOCAL_PORT = int(os.environ.get('TGI_LOCAL_PORT', 8080)) + +# Create the client +client = Client("http://127.0.0.1:{}".format(TGI_LOCAL_PORT)) + +# Wait for the hugging face TGI worker to start running. +while True: + try: + client.generate("Why is the sky blue?").generated_text + print("Successfully cold booted the hugging face text generation inference server!") + + # Break from the while loop + break + + except Exception as e: + print("The hugging face text generation inference server is still cold booting...") + time.sleep(5) + +def concurrency_controller() -> bool: + # Handle at most 100 jobs at a time. + return len(JOBS) > 20 + +async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None]: + ''' + This is the handler function that will be called by the serverless. + ''' + # Get job input + job_input = job['input'] + + # Prompts + prompt = job_input['prompt'] + + # Validate the inputs + sampling_params = job_input.get('sampling_params', {}) + + # Add job to the set. + JOBS.add(job['id']) + + # Include metrics in the highest level for the job output for aggregrate. + def aggregate_function(streamed_outputs): + aggregate_output = "" + for stream in streamed_outputs: + aggregate_output += stream['text'] + + # Aggregate metrics to expose to the user + # input_tokens = -1 # TBD + # output_tokens = -1 # TBD + + return { + "text": aggregate_output, + # "input_tokens": input_tokens, + # "output_tokens": output_tokens, + } + + # Streaming case + for response in client.generate_stream(prompt, **sampling_params): + if not response.token.special: + text_outputs = response.token.text + ret = {"text": text_outputs} + + # Update the aggregate transformation function + runpod.serverless.modules.rp_metrics.metrics_collector.update_stream_aggregate( + job_id=job['id'], + aggregate_function=aggregate_function + ) + + yield ret + + # Remove job from the set. + JOBS.remove(job['id']) + +# Start the serverless worker with appropriate settings +print("Starting the TGI serverless worker with streaming enabled.") +runpod.serverless.start({ + "handler": handler_streaming, + "concurrency_controller": concurrency_controller, + "return_aggregate_stream": True +}) diff --git a/runpod/src/server.py b/runpod/src/server.py new file mode 100644 index 000000000..63cc39c77 --- /dev/null +++ b/runpod/src/server.py @@ -0,0 +1,352 @@ +import asyncio +import os +from pathlib import Path +from typing import List, Optional + +import torch +from grpc import aio +from grpc_reflection.v1alpha import reflection +from loguru import logger + +from lorax_server.adapters.utils import download_adapter +from lorax_server.cache import Cache +from lorax_server.interceptor import ExceptionInterceptor +from lorax_server.models import Model, get_model +from lorax_server.pb import generate_pb2, generate_pb2_grpc +from lorax_server.tracing import UDSOpenTelemetryAioServerInterceptor +from lorax_server.utils import HUB, LOCAL, PBASE, S3, map_pbase_model_id_to_s3 +from lorax_server.utils.adapter import BASE_MODEL_ADAPTER_ID, is_base_model +from lorax_server.utils.sgmv import has_sgmv +from lorax_server.utils.state import set_speculative_tokens + + +class LoraxService(generate_pb2_grpc.LoraxServiceServicer): + """ + Implementation of the LoraxService gRPC service. + + Args: + model (Model): The model used for inference. 
+ cache (Cache): The cache used for storing and retrieving batches. + server_urls (List[str]): List of server URLs for service discovery. + """ + + def __init__(self, model: Model, cache: Cache, server_urls: List[str]): + self.cache = cache + self.model = model + self.server_urls = server_urls + # For some reason, inference_mode does not work well with GLOO which we use on CPU + if model.device.type == "cuda": + # Force inference mode for the lifetime of LoraxService + self._inference_mode_raii_guard = torch._C._InferenceMode(True) + + async def Info(self, request, context): + return self.model.info + + async def Health(self, request, context): + if self.model.device.type == "cuda": + torch.zeros((2, 2)).cuda() + return generate_pb2.HealthResponse() + + async def ServiceDiscovery(self, request, context): + return generate_pb2.ServiceDiscoveryResponse(urls=self.server_urls) + + async def ClearCache(self, request, context): + if request.HasField("id"): + self.cache.delete(request.id) + else: + self.cache.clear() + return generate_pb2.ClearCacheResponse() + + async def FilterBatch(self, request, context): + batch = self.cache.pop(request.batch_id) + if batch is None: + raise ValueError(f"Batch ID {request.batch_id} not found in cache.") + filtered_batch = batch.filter(request.request_ids) + self.cache.set(filtered_batch) + + return generate_pb2.FilterBatchResponse(batch=filtered_batch.to_pb()) + + async def Warmup(self, request: generate_pb2.WarmupRequest, context): + batch = self.model.batch_type.from_pb( + request.batch, + self.model.tokenizer, + self.model.tokenizers, + self.model.dtype, + self.model.device, + ) + max_supported_total_tokens = self.model.warmup(batch, request.max_new_tokens) + + return generate_pb2.WarmupResponse(max_supported_total_tokens=max_supported_total_tokens) + + async def Prefill(self, request: generate_pb2.PrefillRequest, context): + batch = self.model.batch_type.from_pb( + request.batch, + self.model.tokenizer, + self.model.tokenizers, + self.model.dtype, + self.model.device, + ) + + generations, next_batch = self.model.generate_token(batch) + self.cache.set(next_batch) + + return generate_pb2.PrefillResponse( + generations=[generation.to_pb() for generation in generations], + batch=next_batch.to_pb() if next_batch else None, + ) + + async def Embed(self, request: generate_pb2.EmbedRequest, context): + if not self.model.supports_embeddings: + raise ValueError("Model does not support embeddings") + + batch = self.model.batch_type.from_pb( + request.batch, + self.model.tokenizer, + self.model.tokenizers, + self.model.dtype, + self.model.device, + ) + embeddings = self.model.embed(batch) + embeddings_proto = [] + for i, embedding in enumerate(embeddings): + embeddings_proto.append(generate_pb2.Embedding(request_id=batch.request_ids[i], values=embedding)) + return generate_pb2.EmbedResponse(embeddings=embeddings_proto) + + async def Decode(self, request: generate_pb2.DecodeRequest, context): + if len(request.batches) == 0: + raise ValueError("Must provide at least one batch") + + batches = [] + for batch_pb in request.batches: + batch = self.cache.pop(batch_pb.id) + if batch is None: + raise ValueError(f"Batch ID {batch_pb.id} not found in cache.") + batches.append(batch) + + if len(batches) == 0: + raise ValueError("All batches are empty") + + if len(batches) > 1: + batch = self.model.batch_type.concatenate(batches) + else: + batch = batches[0] + + generations, next_batch = self.model.generate_token(batch) + self.cache.set(next_batch) + + return 
generate_pb2.DecodeResponse( + generations=[generation.to_pb() for generation in generations], + batch=next_batch.to_pb() if next_batch else None, + ) + + async def DownloadAdapter(self, request: generate_pb2.DownloadAdapterRequest, context): + adapter_parameters = request.adapter_parameters + if is_base_model(adapter_parameters): + logger.info("No adapter to download for base model. Skipping.") + return generate_pb2.DownloadAdapterResponse(downloaded=False) + + adapter_bytes = 0 + api_token = request.api_token + adapter_source = _adapter_source_enum_to_string(request.adapter_source) + for adapter_id in adapter_parameters.adapter_ids: + if adapter_id == BASE_MODEL_ADAPTER_ID: + logger.info("No adapter to download for base model. Skipping.") + continue + + adapter_bytes += download_adapter(adapter_id, adapter_source, api_token) + + adapter_memory_size = self.model.adapter_memory_size() + if adapter_memory_size > 0: + logger.info( + f"Downloaded adapter {adapter_id} memory size: {adapter_bytes} bytes " + f"(reservation: {adapter_memory_size} bytes)" + ) + adapter_memory_fraction = adapter_bytes / adapter_memory_size + if adapter_memory_fraction > 1: + raise ValueError( + f"Adapter {adapter_id} is larger than adapter memory reservation: " + f"{adapter_bytes} / {adapter_memory_size} bytes" + ) + else: + # Assume 0.0 memory fraction if adapter memory size is not set + logger.info( + f"Downloaded adapter {adapter_id} memory size: {adapter_bytes} bytes " f"(no reservation limit)" + ) + adapter_memory_fraction = 0.0 + + return generate_pb2.DownloadAdapterResponse(downloaded=True, memory_fraction=adapter_memory_fraction) + + async def LoadAdapter(self, request: generate_pb2.LoadAdapterRequest, context): + adapter_parameters = request.adapter_parameters + if is_base_model(adapter_parameters): + logger.info("No adapter to load for base model. Skipping.") + return generate_pb2.LoadAdapterResponse(loaded=False) + + try: + adapter_source = _adapter_source_enum_to_string(request.adapter_source) + adapter_index = request.adapter_index + api_token = request.api_token + + if adapter_source == PBASE: + for i in range(len(adapter_parameters.adapter_ids)): + adapter_id = adapter_parameters.adapter_ids[i] + adapter_id = map_pbase_model_id_to_s3(adapter_id, api_token) + adapter_parameters.adapter_ids[i] = adapter_id + adapter_source = S3 + + self.model.load_adapter(adapter_parameters, adapter_source, adapter_index, api_token) + + return generate_pb2.LoadAdapterResponse(loaded=True) + except Exception: + logger.exception("Error when loading adapter") + raise + + async def OffloadAdapter(self, request: generate_pb2.OffloadAdapterRequest, context): + adapter_parameters = request.adapter_parameters + if is_base_model(adapter_parameters): + logger.info("No adapter to offload for base model. 
Skipping.") + return generate_pb2.OffloadAdapterResponse(offloaded=False) + + try: + adapter_idx = request.adapter_index + adapter_source = _adapter_source_enum_to_string(request.adapter_source) + adapter_index = request.adapter_index + self.model.offload_adapter(adapter_idx, adapter_source, adapter_index) + + # Ensure there is enough memory for the next adapter + torch.cuda.empty_cache() + torch.cuda.synchronize(self.model.device) + + return generate_pb2.OffloadAdapterResponse(offloaded=True) + except Exception: + logger.exception("Error when offloading adapter") + raise + + +def serve( + model_id: str, + adapter_id: str, + revision: Optional[str], + sharded: bool, + quantize: Optional[str], + compile: bool, + dtype: Optional[str], + trust_remote_code: bool, + uds_path: Path, + source: str, + adapter_source: str, + speculative_tokens: int, +): + async def serve_inner( + model_id: str, + adapter_id: str, + revision: Optional[str], + sharded: bool, + quantize: Optional[str], + compile: bool, + dtype: Optional[str], + trust_remote_code: bool, + speculative_tokens: int, + ): + unix_socket_template = "unix://{}-{}" + if sharded: + server_urls = [unix_socket_template.format(uds_path, rank) for rank in range(int(os.environ["WORLD_SIZE"]))] + local_url = server_urls[int(os.environ["RANK"])] + else: + local_url = unix_socket_template.format(uds_path, 0) + server_urls = [local_url] + + try: + model = get_model( + model_id, + adapter_id, + revision, + sharded, + quantize, + compile, + dtype, + trust_remote_code, + source, + adapter_source, + ) + except Exception: + logger.exception("Error when initializing model") + raise + + if quantize == "gptq": + try: + # When using GPTQ, Exllama kernels need some global kernels + # For which we have the finale shapes only after the model has loaded + # This will allocate those buffers. + from lorax_server.utils.gptq.exllamav2 import ( + create_exllama_buffers, + set_device, + ) + + set_device(model.device) + create_exllama_buffers() + except ImportError: + pass + + # set speculative decoding tokens + speculative_tokens = max(model.max_speculative_tokens, speculative_tokens) + if speculative_tokens > 0: + set_speculative_tokens(speculative_tokens) + + server = aio.server( + interceptors=[ + ExceptionInterceptor(), + UDSOpenTelemetryAioServerInterceptor(), + ] + ) + generate_pb2_grpc.add_LoraxServiceServicer_to_server(LoraxService(model, Cache(), server_urls), server) + SERVICE_NAMES = ( + generate_pb2.DESCRIPTOR.services_by_name["LoraxService"].full_name, + reflection.SERVICE_NAME, + ) + reflection.enable_server_reflection(SERVICE_NAMES, server) + server.add_insecure_port(local_url) + + await server.start() + + # Log SGMV kernel status + if has_sgmv(): + logger.info("SGMV kernel is enabled, multi-LoRA inference will be fast!") + else: + logger.info("SGMV kernel is disabled, multi-LoRA inference may be slow") + + logger.info("Server started at {}".format(local_url)) + + try: + await server.wait_for_termination() + except KeyboardInterrupt: + logger.info("Signal received. 
Shutting down") + await server.stop(0) + + asyncio.run( + serve_inner( + model_id, + adapter_id, + revision, + sharded, + quantize, + compile, + dtype, + trust_remote_code, + speculative_tokens, + ) + ) + + +def _adapter_source_enum_to_string(adapter_source: int) -> str: + # TODO(travis): refactor this to be less hacky + if adapter_source == generate_pb2.AdapterSource.HUB: + return HUB + elif adapter_source == generate_pb2.AdapterSource.S3: + return S3 + elif adapter_source == generate_pb2.AdapterSource.LOCAL: + return LOCAL + elif adapter_source == generate_pb2.AdapterSource.PBASE: + return PBASE + else: + raise ValueError(f"Unknown adapter source {adapter_source}") From 50f564a381ce36245c7679a2b139b0cf813afd97 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Thu, 13 Jun 2024 12:15:14 -0700 Subject: [PATCH 02/22] fix --- runpod/Dockerfile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/runpod/Dockerfile b/runpod/Dockerfile index 3fee1dd51..583782a98 100644 --- a/runpod/Dockerfile +++ b/runpod/Dockerfile @@ -12,9 +12,8 @@ RUN /bin/bash /setup.sh && \ # Install Python dependencies (Worker Template) # COPY builder/requirements.txt /requirements.txt -# RUN python3 -m pip install --upgrade pip && \ -# python3 -m pip install --upgrade -r /requirements.txt --no-cache-dir && \ -# rm /requirements.txt +RUN python3 -m pip install --upgrade pip && \ + python3 -m pip install runpod # Add src files (Worker Template) ADD src . From 2c1544ec72e98e49f7c0fa639ed010ad8848ff52 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Thu, 13 Jun 2024 16:23:42 -0700 Subject: [PATCH 03/22] runpod POC --- runpod/src/handler.py | 15 +- runpod/src/server.py | 352 ------------------------------------------ 2 files changed, 5 insertions(+), 362 deletions(-) delete mode 100644 runpod/src/server.py diff --git a/runpod/src/handler.py b/runpod/src/handler.py index 39ac671ca..576f029e2 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -12,20 +12,21 @@ # Prepare global variables JOBS = set() TGI_LOCAL_PORT = int(os.environ.get('TGI_LOCAL_PORT', 8080)) - +url = "http://127.0.0.1:{}".format(TGI_LOCAL_PORT) # Create the client -client = Client("http://127.0.0.1:{}".format(TGI_LOCAL_PORT)) - +client = Client(url) +print(url) # Wait for the hugging face TGI worker to start running. while True: try: - client.generate("Why is the sky blue?").generated_text + client.generate("Why is the sky blue?", max_new_tokens=1).generated_text print("Successfully cold booted the hugging face text generation inference server!") # Break from the while loop break except Exception as e: + print(e) print("The hugging face text generation inference server is still cold booting...") time.sleep(5) @@ -71,12 +72,6 @@ def aggregate_function(streamed_outputs): text_outputs = response.token.text ret = {"text": text_outputs} - # Update the aggregate transformation function - runpod.serverless.modules.rp_metrics.metrics_collector.update_stream_aggregate( - job_id=job['id'], - aggregate_function=aggregate_function - ) - yield ret # Remove job from the set. 
diff --git a/runpod/src/server.py b/runpod/src/server.py deleted file mode 100644 index 63cc39c77..000000000 --- a/runpod/src/server.py +++ /dev/null @@ -1,352 +0,0 @@ -import asyncio -import os -from pathlib import Path -from typing import List, Optional - -import torch -from grpc import aio -from grpc_reflection.v1alpha import reflection -from loguru import logger - -from lorax_server.adapters.utils import download_adapter -from lorax_server.cache import Cache -from lorax_server.interceptor import ExceptionInterceptor -from lorax_server.models import Model, get_model -from lorax_server.pb import generate_pb2, generate_pb2_grpc -from lorax_server.tracing import UDSOpenTelemetryAioServerInterceptor -from lorax_server.utils import HUB, LOCAL, PBASE, S3, map_pbase_model_id_to_s3 -from lorax_server.utils.adapter import BASE_MODEL_ADAPTER_ID, is_base_model -from lorax_server.utils.sgmv import has_sgmv -from lorax_server.utils.state import set_speculative_tokens - - -class LoraxService(generate_pb2_grpc.LoraxServiceServicer): - """ - Implementation of the LoraxService gRPC service. - - Args: - model (Model): The model used for inference. - cache (Cache): The cache used for storing and retrieving batches. - server_urls (List[str]): List of server URLs for service discovery. - """ - - def __init__(self, model: Model, cache: Cache, server_urls: List[str]): - self.cache = cache - self.model = model - self.server_urls = server_urls - # For some reason, inference_mode does not work well with GLOO which we use on CPU - if model.device.type == "cuda": - # Force inference mode for the lifetime of LoraxService - self._inference_mode_raii_guard = torch._C._InferenceMode(True) - - async def Info(self, request, context): - return self.model.info - - async def Health(self, request, context): - if self.model.device.type == "cuda": - torch.zeros((2, 2)).cuda() - return generate_pb2.HealthResponse() - - async def ServiceDiscovery(self, request, context): - return generate_pb2.ServiceDiscoveryResponse(urls=self.server_urls) - - async def ClearCache(self, request, context): - if request.HasField("id"): - self.cache.delete(request.id) - else: - self.cache.clear() - return generate_pb2.ClearCacheResponse() - - async def FilterBatch(self, request, context): - batch = self.cache.pop(request.batch_id) - if batch is None: - raise ValueError(f"Batch ID {request.batch_id} not found in cache.") - filtered_batch = batch.filter(request.request_ids) - self.cache.set(filtered_batch) - - return generate_pb2.FilterBatchResponse(batch=filtered_batch.to_pb()) - - async def Warmup(self, request: generate_pb2.WarmupRequest, context): - batch = self.model.batch_type.from_pb( - request.batch, - self.model.tokenizer, - self.model.tokenizers, - self.model.dtype, - self.model.device, - ) - max_supported_total_tokens = self.model.warmup(batch, request.max_new_tokens) - - return generate_pb2.WarmupResponse(max_supported_total_tokens=max_supported_total_tokens) - - async def Prefill(self, request: generate_pb2.PrefillRequest, context): - batch = self.model.batch_type.from_pb( - request.batch, - self.model.tokenizer, - self.model.tokenizers, - self.model.dtype, - self.model.device, - ) - - generations, next_batch = self.model.generate_token(batch) - self.cache.set(next_batch) - - return generate_pb2.PrefillResponse( - generations=[generation.to_pb() for generation in generations], - batch=next_batch.to_pb() if next_batch else None, - ) - - async def Embed(self, request: generate_pb2.EmbedRequest, context): - if not 
self.model.supports_embeddings: - raise ValueError("Model does not support embeddings") - - batch = self.model.batch_type.from_pb( - request.batch, - self.model.tokenizer, - self.model.tokenizers, - self.model.dtype, - self.model.device, - ) - embeddings = self.model.embed(batch) - embeddings_proto = [] - for i, embedding in enumerate(embeddings): - embeddings_proto.append(generate_pb2.Embedding(request_id=batch.request_ids[i], values=embedding)) - return generate_pb2.EmbedResponse(embeddings=embeddings_proto) - - async def Decode(self, request: generate_pb2.DecodeRequest, context): - if len(request.batches) == 0: - raise ValueError("Must provide at least one batch") - - batches = [] - for batch_pb in request.batches: - batch = self.cache.pop(batch_pb.id) - if batch is None: - raise ValueError(f"Batch ID {batch_pb.id} not found in cache.") - batches.append(batch) - - if len(batches) == 0: - raise ValueError("All batches are empty") - - if len(batches) > 1: - batch = self.model.batch_type.concatenate(batches) - else: - batch = batches[0] - - generations, next_batch = self.model.generate_token(batch) - self.cache.set(next_batch) - - return generate_pb2.DecodeResponse( - generations=[generation.to_pb() for generation in generations], - batch=next_batch.to_pb() if next_batch else None, - ) - - async def DownloadAdapter(self, request: generate_pb2.DownloadAdapterRequest, context): - adapter_parameters = request.adapter_parameters - if is_base_model(adapter_parameters): - logger.info("No adapter to download for base model. Skipping.") - return generate_pb2.DownloadAdapterResponse(downloaded=False) - - adapter_bytes = 0 - api_token = request.api_token - adapter_source = _adapter_source_enum_to_string(request.adapter_source) - for adapter_id in adapter_parameters.adapter_ids: - if adapter_id == BASE_MODEL_ADAPTER_ID: - logger.info("No adapter to download for base model. Skipping.") - continue - - adapter_bytes += download_adapter(adapter_id, adapter_source, api_token) - - adapter_memory_size = self.model.adapter_memory_size() - if adapter_memory_size > 0: - logger.info( - f"Downloaded adapter {adapter_id} memory size: {adapter_bytes} bytes " - f"(reservation: {adapter_memory_size} bytes)" - ) - adapter_memory_fraction = adapter_bytes / adapter_memory_size - if adapter_memory_fraction > 1: - raise ValueError( - f"Adapter {adapter_id} is larger than adapter memory reservation: " - f"{adapter_bytes} / {adapter_memory_size} bytes" - ) - else: - # Assume 0.0 memory fraction if adapter memory size is not set - logger.info( - f"Downloaded adapter {adapter_id} memory size: {adapter_bytes} bytes " f"(no reservation limit)" - ) - adapter_memory_fraction = 0.0 - - return generate_pb2.DownloadAdapterResponse(downloaded=True, memory_fraction=adapter_memory_fraction) - - async def LoadAdapter(self, request: generate_pb2.LoadAdapterRequest, context): - adapter_parameters = request.adapter_parameters - if is_base_model(adapter_parameters): - logger.info("No adapter to load for base model. 
Skipping.") - return generate_pb2.LoadAdapterResponse(loaded=False) - - try: - adapter_source = _adapter_source_enum_to_string(request.adapter_source) - adapter_index = request.adapter_index - api_token = request.api_token - - if adapter_source == PBASE: - for i in range(len(adapter_parameters.adapter_ids)): - adapter_id = adapter_parameters.adapter_ids[i] - adapter_id = map_pbase_model_id_to_s3(adapter_id, api_token) - adapter_parameters.adapter_ids[i] = adapter_id - adapter_source = S3 - - self.model.load_adapter(adapter_parameters, adapter_source, adapter_index, api_token) - - return generate_pb2.LoadAdapterResponse(loaded=True) - except Exception: - logger.exception("Error when loading adapter") - raise - - async def OffloadAdapter(self, request: generate_pb2.OffloadAdapterRequest, context): - adapter_parameters = request.adapter_parameters - if is_base_model(adapter_parameters): - logger.info("No adapter to offload for base model. Skipping.") - return generate_pb2.OffloadAdapterResponse(offloaded=False) - - try: - adapter_idx = request.adapter_index - adapter_source = _adapter_source_enum_to_string(request.adapter_source) - adapter_index = request.adapter_index - self.model.offload_adapter(adapter_idx, adapter_source, adapter_index) - - # Ensure there is enough memory for the next adapter - torch.cuda.empty_cache() - torch.cuda.synchronize(self.model.device) - - return generate_pb2.OffloadAdapterResponse(offloaded=True) - except Exception: - logger.exception("Error when offloading adapter") - raise - - -def serve( - model_id: str, - adapter_id: str, - revision: Optional[str], - sharded: bool, - quantize: Optional[str], - compile: bool, - dtype: Optional[str], - trust_remote_code: bool, - uds_path: Path, - source: str, - adapter_source: str, - speculative_tokens: int, -): - async def serve_inner( - model_id: str, - adapter_id: str, - revision: Optional[str], - sharded: bool, - quantize: Optional[str], - compile: bool, - dtype: Optional[str], - trust_remote_code: bool, - speculative_tokens: int, - ): - unix_socket_template = "unix://{}-{}" - if sharded: - server_urls = [unix_socket_template.format(uds_path, rank) for rank in range(int(os.environ["WORLD_SIZE"]))] - local_url = server_urls[int(os.environ["RANK"])] - else: - local_url = unix_socket_template.format(uds_path, 0) - server_urls = [local_url] - - try: - model = get_model( - model_id, - adapter_id, - revision, - sharded, - quantize, - compile, - dtype, - trust_remote_code, - source, - adapter_source, - ) - except Exception: - logger.exception("Error when initializing model") - raise - - if quantize == "gptq": - try: - # When using GPTQ, Exllama kernels need some global kernels - # For which we have the finale shapes only after the model has loaded - # This will allocate those buffers. 
- from lorax_server.utils.gptq.exllamav2 import ( - create_exllama_buffers, - set_device, - ) - - set_device(model.device) - create_exllama_buffers() - except ImportError: - pass - - # set speculative decoding tokens - speculative_tokens = max(model.max_speculative_tokens, speculative_tokens) - if speculative_tokens > 0: - set_speculative_tokens(speculative_tokens) - - server = aio.server( - interceptors=[ - ExceptionInterceptor(), - UDSOpenTelemetryAioServerInterceptor(), - ] - ) - generate_pb2_grpc.add_LoraxServiceServicer_to_server(LoraxService(model, Cache(), server_urls), server) - SERVICE_NAMES = ( - generate_pb2.DESCRIPTOR.services_by_name["LoraxService"].full_name, - reflection.SERVICE_NAME, - ) - reflection.enable_server_reflection(SERVICE_NAMES, server) - server.add_insecure_port(local_url) - - await server.start() - - # Log SGMV kernel status - if has_sgmv(): - logger.info("SGMV kernel is enabled, multi-LoRA inference will be fast!") - else: - logger.info("SGMV kernel is disabled, multi-LoRA inference may be slow") - - logger.info("Server started at {}".format(local_url)) - - try: - await server.wait_for_termination() - except KeyboardInterrupt: - logger.info("Signal received. Shutting down") - await server.stop(0) - - asyncio.run( - serve_inner( - model_id, - adapter_id, - revision, - sharded, - quantize, - compile, - dtype, - trust_remote_code, - speculative_tokens, - ) - ) - - -def _adapter_source_enum_to_string(adapter_source: int) -> str: - # TODO(travis): refactor this to be less hacky - if adapter_source == generate_pb2.AdapterSource.HUB: - return HUB - elif adapter_source == generate_pb2.AdapterSource.S3: - return S3 - elif adapter_source == generate_pb2.AdapterSource.LOCAL: - return LOCAL - elif adapter_source == generate_pb2.AdapterSource.PBASE: - return PBASE - else: - raise ValueError(f"Unknown adapter source {adapter_source}") From 8c9d3f1664618631433acb46ed2927a57a90d5db Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 10:58:10 -0700 Subject: [PATCH 04/22] test --- .github/workflows/build.yaml | 97 ++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 116120764..966d10e6d 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -152,3 +152,100 @@ jobs: # Delete the SHA image(s) from containerd store sudo ctr i rm $(sudo ctr i ls -q) +#### new build test + - name: Docker meta + id: meta1 + uses: docker/metadata-action@v5 + with: + images: | + ghcr.io/predibase/lorax-runpod + tags: | + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=sha,prefix=,suffix=,format=short + type=raw,value=main,enable=${{ github.ref == 'refs/heads/main' }} + + - name: Create a hash from tags + env: + tags: ${{ steps.meta1.outputs.tags }} + id: vars1 + run: | + tag_hash=$(echo -n "$tags" | md5sum | awk '{print $1}') + echo "tag_hash=$tag_hash" >> $GITHUB_OUTPUT + echo "cache_dir=/runner/build/images/cache" >> $GITHUB_OUTPUT + echo "image_dir=/runner/build/images" >> $GITHUB_OUTPUT + echo "image_path=/runner/build/images/lorax" >> $GITHUB_OUTPUT + + - name: Create and update image/cache directory + env: + image_dir: ${{ steps.vars1.outputs.image_dir }} + cache_dir: ${{ steps.vars1.outputs.cache_dir }} + run: | + sudo mkdir -p $image_dir + sudo chown ubuntu:ubuntu $image_dir + + sudo mkdir -p $cache_dir + sudo chown ubuntu:ubuntu $cache_dir + + - name: Export Docker image as OCI + uses: docker/build-push-action@v5 + with: + 
context: . + file: ./runpod/Dockerfile # Path to your Dockerfile + push: false + tags: ${{ steps.meta1.outputs.tags }} + outputs: type=oci,compression=gzip,dest=${{ steps.vars1.outputs.image_path }}-${{ steps.vars1.outputs.tag_hash }}.tar.gz + cache-from: type=local,src=${{ steps.vars1.outputs.cache_dir }} + cache-to: type=local,mode=max,image-manifest=true,oci-mediatypes=true,dest=${{ steps.vars1.outputs.cache_dir }} + + - name: Import image in containerd + env: + tag_hash: ${{ steps.vars1.outputs.tag_hash }} + image_path: ${{ steps.vars1.outputs.image_path }} + run: | + echo "Importing $image_path-$tag_hash to Containerd" + sudo ctr i import --no-unpack --all-platforms --digests $image_path-$tag_hash.tar.gz + + - name: Log in to GitHub Container Registry + uses: docker/login-action@v1 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GHCR_PAT }} + + - name: Push image with containerd + env: + tags: ${{ steps.meta1.outputs.tags }} + run: | + for tag in $tags + do + echo "Pushing $tag to GHCR" + sudo ctr i push --user "${{ github.repository_owner }}:${{ secrets.GHCR_PAT }}" $tag + done + + - name: Create and push soci index + env: + tags: ${{ steps.meta1.outputs.tags }} + run: | + export SOCI_PATH=$HOME/.soci/soci + for tag in $tags + do + echo "Creating soci index for $tag" + sudo $SOCI_PATH create $tag + echo "Pushing soci index for $tag" + sudo $SOCI_PATH push --user ${{ github.repository_owner }}:${{ secrets.GHCR_PAT }} $tag + done + + - name: Prune older images + env: + tag_hash: ${{ steps.vars1.outputs.tag_hash }} + image_path: ${{ steps.vars1.outputs.image_path }} + run: | + # Delete images older than a day from docker store + docker image prune -a -f --filter "until=24h" + + # Delete the on disk copy + rm -rf "$image_path-$tag_hash.tar.gz" + + # Delete the SHA image(s) from containerd store + sudo ctr i rm $(sudo ctr i ls -q) From b6df46679783d53513618c81ef5d63cae47d93f9 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 11:14:41 -0700 Subject: [PATCH 05/22] fix context --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 966d10e6d..0055a1d3e 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -190,7 +190,7 @@ jobs: - name: Export Docker image as OCI uses: docker/build-push-action@v5 with: - context: . 
+ context: ./runpod file: ./runpod/Dockerfile # Path to your Dockerfile push: false tags: ${{ steps.meta1.outputs.tags }} From 82b7f5d19efef6f7186d640a30071ddf010b00d8 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 11:34:02 -0700 Subject: [PATCH 06/22] updates --- runpod/Dockerfile | 2 +- runpod/src/handler.py | 39 ++++++++++----------------------------- 2 files changed, 11 insertions(+), 30 deletions(-) diff --git a/runpod/Dockerfile b/runpod/Dockerfile index 583782a98..f662516cc 100644 --- a/runpod/Dockerfile +++ b/runpod/Dockerfile @@ -1,4 +1,5 @@ # Base image +# TODO change the lorax base image FROM ghcr.io/predibase/lorax:0.10.0 ENV DEBIAN_FRONTEND=noninteractive @@ -68,7 +69,6 @@ RUN if [ "$DOWNLOAD_MODEL" = "1" ]; then \ # Quick temporary updates RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 -RUN python3.10 -m pip install git+https://github.com/runpod/runpod-python@a1#egg=runpod --compile RUN python3.10 -m pip install lorax-client ENTRYPOINT ["./entrypoint.sh"] diff --git a/runpod/src/handler.py b/runpod/src/handler.py index 576f029e2..e17d20cd0 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -31,8 +31,8 @@ time.sleep(5) def concurrency_controller() -> bool: - # Handle at most 100 jobs at a time. - return len(JOBS) > 20 + # Handle at most 1024 jobs at a time. + return len(JOBS) > 1024 async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None]: ''' @@ -41,38 +41,19 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] # Get job input job_input = job['input'] - # Prompts - prompt = job_input['prompt'] - - # Validate the inputs - sampling_params = job_input.get('sampling_params', {}) - + # TODO get stream yes/no and call the client based on that...? + # TODO get the auth token or whatever + # TODO figure out how to do auth here - maybe we start it with a secret + # and in istio-land we inject the correct secret in requests + # if the user is auth'ed properly for the resource? + # TODO handle key timeouts # Add job to the set. JOBS.add(job['id']) - # Include metrics in the highest level for the job output for aggregrate. - def aggregate_function(streamed_outputs): - aggregate_output = "" - for stream in streamed_outputs: - aggregate_output += stream['text'] - - # Aggregate metrics to expose to the user - # input_tokens = -1 # TBD - # output_tokens = -1 # TBD - - return { - "text": aggregate_output, - # "input_tokens": input_tokens, - # "output_tokens": output_tokens, - } - # Streaming case - for response in client.generate_stream(prompt, **sampling_params): + for response in client.generate_stream(**job_input): if not response.token.special: - text_outputs = response.token.text - ret = {"text": text_outputs} - - yield ret + yield response # Remove job from the set. 
JOBS.remove(job['id']) From bfb3e76bf8844487a4b876244fdcefb3b2e0c304 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 11:56:53 -0700 Subject: [PATCH 07/22] test runpod specific tag --- .github/workflows/build.yaml | 192 +++++++++++++++++------------------ 1 file changed, 96 insertions(+), 96 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 0055a1d3e..21285a158 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -55,102 +55,102 @@ jobs: # persistent data location root = "/runner/build/containerd" - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: | - ghcr.io/predibase/lorax - tags: | - type=semver,pattern={{version}} - type=semver,pattern={{major}}.{{minor}} - type=sha,prefix=,suffix=,format=short - type=raw,value=main,enable=${{ github.ref == 'refs/heads/main' }} + # - name: Docker meta + # id: meta + # uses: docker/metadata-action@v5 + # with: + # images: | + # ghcr.io/predibase/lorax + # tags: | + # type=semver,pattern={{version}} + # type=semver,pattern={{major}}.{{minor}} + # type=sha,prefix=,suffix=,format=short + # type=raw,value=main,enable=${{ github.ref == 'refs/heads/main' }} - - name: Create a hash from tags - env: - tags: ${{ steps.meta.outputs.tags }} - id: vars - run: | - tag_hash=$(echo -n "$tags" | md5sum | awk '{print $1}') - echo "tag_hash=$tag_hash" >> $GITHUB_OUTPUT - echo "cache_dir=/runner/build/images/cache" >> $GITHUB_OUTPUT - echo "image_dir=/runner/build/images" >> $GITHUB_OUTPUT - echo "image_path=/runner/build/images/lorax" >> $GITHUB_OUTPUT - - - name: Create and update image/cache directory - env: - image_dir: ${{ steps.vars.outputs.image_dir }} - cache_dir: ${{ steps.vars.outputs.cache_dir }} - run: | - sudo mkdir -p $image_dir - sudo chown ubuntu:ubuntu $image_dir - - sudo mkdir -p $cache_dir - sudo chown ubuntu:ubuntu $cache_dir - - - name: Export Docker image as OCI - uses: docker/build-push-action@v5 - with: - context: . 
- file: ./Dockerfile # Path to your Dockerfile - push: false - tags: ${{ steps.meta.outputs.tags }} - outputs: type=oci,compression=gzip,dest=${{ steps.vars.outputs.image_path }}-${{ steps.vars.outputs.tag_hash }}.tar.gz - cache-from: type=local,src=${{ steps.vars.outputs.cache_dir }} - cache-to: type=local,mode=max,image-manifest=true,oci-mediatypes=true,dest=${{ steps.vars.outputs.cache_dir }} - - - name: Import image in containerd - env: - tag_hash: ${{ steps.vars.outputs.tag_hash }} - image_path: ${{ steps.vars.outputs.image_path }} - run: | - echo "Importing $image_path-$tag_hash to Containerd" - sudo ctr i import --no-unpack --all-platforms --digests $image_path-$tag_hash.tar.gz - - - name: Log in to GitHub Container Registry - uses: docker/login-action@v1 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GHCR_PAT }} - - - name: Push image with containerd - env: - tags: ${{ steps.meta.outputs.tags }} - run: | - for tag in $tags - do - echo "Pushing $tag to GHCR" - sudo ctr i push --user "${{ github.repository_owner }}:${{ secrets.GHCR_PAT }}" $tag - done + # - name: Create a hash from tags + # env: + # tags: ${{ steps.meta.outputs.tags }} + # id: vars + # run: | + # tag_hash=$(echo -n "$tags" | md5sum | awk '{print $1}') + # echo "tag_hash=$tag_hash" >> $GITHUB_OUTPUT + # echo "cache_dir=/runner/build/images/cache" >> $GITHUB_OUTPUT + # echo "image_dir=/runner/build/images" >> $GITHUB_OUTPUT + # echo "image_path=/runner/build/images/lorax" >> $GITHUB_OUTPUT + + # - name: Create and update image/cache directory + # env: + # image_dir: ${{ steps.vars.outputs.image_dir }} + # cache_dir: ${{ steps.vars.outputs.cache_dir }} + # run: | + # sudo mkdir -p $image_dir + # sudo chown ubuntu:ubuntu $image_dir + + # sudo mkdir -p $cache_dir + # sudo chown ubuntu:ubuntu $cache_dir + + # - name: Export Docker image as OCI + # uses: docker/build-push-action@v5 + # with: + # context: . 
+ # file: ./Dockerfile # Path to your Dockerfile + # push: false + # tags: ${{ steps.meta.outputs.tags }} + # outputs: type=oci,compression=gzip,dest=${{ steps.vars.outputs.image_path }}-${{ steps.vars.outputs.tag_hash }}.tar.gz + # cache-from: type=local,src=${{ steps.vars.outputs.cache_dir }} + # cache-to: type=local,mode=max,image-manifest=true,oci-mediatypes=true,dest=${{ steps.vars.outputs.cache_dir }} + + # - name: Import image in containerd + # env: + # tag_hash: ${{ steps.vars.outputs.tag_hash }} + # image_path: ${{ steps.vars.outputs.image_path }} + # run: | + # echo "Importing $image_path-$tag_hash to Containerd" + # sudo ctr i import --no-unpack --all-platforms --digests $image_path-$tag_hash.tar.gz + + # - name: Log in to GitHub Container Registry + # uses: docker/login-action@v1 + # with: + # registry: ghcr.io + # username: ${{ github.repository_owner }} + # password: ${{ secrets.GHCR_PAT }} + + # - name: Push image with containerd + # env: + # tags: ${{ steps.meta.outputs.tags }} + # run: | + # for tag in $tags + # do + # echo "Pushing $tag to GHCR" + # sudo ctr i push --user "${{ github.repository_owner }}:${{ secrets.GHCR_PAT }}" $tag + # done - - name: Create and push soci index - env: - tags: ${{ steps.meta.outputs.tags }} - run: | - export SOCI_PATH=$HOME/.soci/soci - for tag in $tags - do - echo "Creating soci index for $tag" - sudo $SOCI_PATH create $tag - echo "Pushing soci index for $tag" - sudo $SOCI_PATH push --user ${{ github.repository_owner }}:${{ secrets.GHCR_PAT }} $tag - done - - - name: Prune older images - env: - tag_hash: ${{ steps.vars.outputs.tag_hash }} - image_path: ${{ steps.vars.outputs.image_path }} - run: | - # Delete images older than a day from docker store - docker image prune -a -f --filter "until=24h" - - # Delete the on disk copy - rm -rf "$image_path-$tag_hash.tar.gz" - - # Delete the SHA image(s) from containerd store - sudo ctr i rm $(sudo ctr i ls -q) + # - name: Create and push soci index + # env: + # tags: ${{ steps.meta.outputs.tags }} + # run: | + # export SOCI_PATH=$HOME/.soci/soci + # for tag in $tags + # do + # echo "Creating soci index for $tag" + # sudo $SOCI_PATH create $tag + # echo "Pushing soci index for $tag" + # sudo $SOCI_PATH push --user ${{ github.repository_owner }}:${{ secrets.GHCR_PAT }} $tag + # done + + # - name: Prune older images + # env: + # tag_hash: ${{ steps.vars.outputs.tag_hash }} + # image_path: ${{ steps.vars.outputs.image_path }} + # run: | + # # Delete images older than a day from docker store + # docker image prune -a -f --filter "until=24h" + + # # Delete the on disk copy + # rm -rf "$image_path-$tag_hash.tar.gz" + + # # Delete the SHA image(s) from containerd store + # sudo ctr i rm $(sudo ctr i ls -q) #### new build test - name: Docker meta @@ -158,7 +158,7 @@ jobs: uses: docker/metadata-action@v5 with: images: | - ghcr.io/predibase/lorax-runpod + ghcr.io/predibase/lorax tags: | type=semver,pattern={{version}} type=semver,pattern={{major}}.{{minor}} @@ -170,7 +170,7 @@ jobs: tags: ${{ steps.meta1.outputs.tags }} id: vars1 run: | - tag_hash=$(echo -n "$tags" | md5sum | awk '{print $1}') + tag_hash=runpod-$(echo -n "$tags" | md5sum | awk '{print $1}') echo "tag_hash=$tag_hash" >> $GITHUB_OUTPUT echo "cache_dir=/runner/build/images/cache" >> $GITHUB_OUTPUT echo "image_dir=/runner/build/images" >> $GITHUB_OUTPUT From cb4b39a8fc71f90d64ff99ebfc13d5c4b22dad3c Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 12:05:00 -0700 Subject: [PATCH 08/22] add prefix --- .github/workflows/build.yaml 
| 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 21285a158..3d3c89102 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -162,7 +162,7 @@ jobs: tags: | type=semver,pattern={{version}} type=semver,pattern={{major}}.{{minor}} - type=sha,prefix=,suffix=,format=short + type=sha,prefix=runpod,suffix=,format=short type=raw,value=main,enable=${{ github.ref == 'refs/heads/main' }} - name: Create a hash from tags @@ -170,7 +170,7 @@ jobs: tags: ${{ steps.meta1.outputs.tags }} id: vars1 run: | - tag_hash=runpod-$(echo -n "$tags" | md5sum | awk '{print $1}') + tag_hash=$(echo -n "$tags" | md5sum | awk '{print $1}') echo "tag_hash=$tag_hash" >> $GITHUB_OUTPUT echo "cache_dir=/runner/build/images/cache" >> $GITHUB_OUTPUT echo "image_dir=/runner/build/images" >> $GITHUB_OUTPUT From 680378910d053934e69bed76a0b5080069cd4837 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 12:20:01 -0700 Subject: [PATCH 09/22] chmod entrypoint --- runpod/src/entrypoint.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 runpod/src/entrypoint.sh diff --git a/runpod/src/entrypoint.sh b/runpod/src/entrypoint.sh old mode 100644 new mode 100755 From 2f72eaafecceb43a5452ea7fb3cc3784462d8bd8 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 14:30:16 -0700 Subject: [PATCH 10/22] dump the output into a serializable dict --- .github/workflows/build.yaml | 2 +- runpod/src/handler.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 3d3c89102..8364dd829 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -162,7 +162,7 @@ jobs: tags: | type=semver,pattern={{version}} type=semver,pattern={{major}}.{{minor}} - type=sha,prefix=runpod,suffix=,format=short + type=sha,prefix=runpod-,suffix=,format=short type=raw,value=main,enable=${{ github.ref == 'refs/heads/main' }} - name: Create a hash from tags diff --git a/runpod/src/handler.py b/runpod/src/handler.py index e17d20cd0..652e1c775 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -53,7 +53,8 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] # Streaming case for response in client.generate_stream(**job_input): if not response.token.special: - yield response + # Dump the repsonse into a dictionary + yield response.model_dump() # Remove job from the set. JOBS.remove(job['id']) From dd6c80ea87a9e999d652df6216beac011d64a8b0 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 15:54:55 -0700 Subject: [PATCH 11/22] no stream by default? 
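The handler now branches on a `_stream` flag in the job input rather than always streaming. As a rough sketch of the two request shapes this expects (assuming the standard RunPod serverless payload wrapper already used in handler.py, where everything under "input" is forwarded as lorax Client kwargs once `_stream` is stripped):

    # Illustrative payloads only; fields besides `_stream` are just
    # lorax Client.generate() / generate_stream() kwargs.
    job_stream = {"id": "job-1", "input": {"prompt": "Why is the sky blue?",
                                           "max_new_tokens": 64, "_stream": True}}
    # Non-streaming path: the handler still deletes `_stream`, then calls
    # client.generate(**job_input) and yields a single dict.
    job_sync = {"id": "job-2", "input": {"prompt": "Why is the sky blue?",
                                         "max_new_tokens": 64, "_stream": False}}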
--- runpod/src/entrypoint.sh | 2 +- runpod/src/handler.py | 20 +++++++++++++++----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/runpod/src/entrypoint.sh b/runpod/src/entrypoint.sh index 09e69a875..caadce279 100755 --- a/runpod/src/entrypoint.sh +++ b/runpod/src/entrypoint.sh @@ -39,7 +39,7 @@ if [[ -n "${HF_MAX_BATCH_PREFILL_TOKENS}" ]]; then fi # Start the text generation server -nohup lorax-launcher --port 8080 --model-id predibase/Mistral-7B-v0.1-dequantized --adapter-source hub --default-adapter-source pbase --max-batch-prefill-tokens 32768 --max-total-tokens 8192 --max-input-length 8191 --max-concurrent-requests 1024 & +nohup lorax-launcher --port 8080 --model-id predibase/Meta-Llama-3-8B-Instruct-dequantized --adapter-source hub --default-adapter-source pbase --max-batch-prefill-tokens 32768 --max-total-tokens 8192 --max-input-length 8191 --max-concurrent-requests 1024 & # Start the handler using python 3.10 python3.10 -u /handler.py diff --git a/runpod/src/handler.py b/runpod/src/handler.py index 652e1c775..c74886abc 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -40,7 +40,11 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] ''' # Get job input job_input = job['input'] + + # When we are called with a streaming endpoint, then we should have the field + # _stream = True + # TODO handle the two openAI compatable endpoints as well...! # TODO get stream yes/no and call the client based on that...? # TODO get the auth token or whatever # TODO figure out how to do auth here - maybe we start it with a secret @@ -49,12 +53,18 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] # TODO handle key timeouts # Add job to the set. JOBS.add(job['id']) + if job_input.get('_stream', False): + del job_input['_stream'] + # Streaming case + for response in client.generate_stream(**job_input): + if not response.token.special: + # Dump the repsonse into a dictionary + yield response.model_dump() + else: + del job_input['_stream'] + response = client.generate(**job_input) + yield response.model_dump() - # Streaming case - for response in client.generate_stream(**job_input): - if not response.token.special: - # Dump the repsonse into a dictionary - yield response.model_dump() # Remove job from the set. JOBS.remove(job['id']) From c670df885f703eb2ac2b645bb565ab1f692d9967 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 19:29:51 -0700 Subject: [PATCH 12/22] try out openai --- runpod/Dockerfile | 1 + runpod/src/handler.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/runpod/Dockerfile b/runpod/Dockerfile index f662516cc..c037aca40 100644 --- a/runpod/Dockerfile +++ b/runpod/Dockerfile @@ -70,5 +70,6 @@ RUN if [ "$DOWNLOAD_MODEL" = "1" ]; then \ # Quick temporary updates RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 RUN python3.10 -m pip install lorax-client +RUN python3.10 -m pip install openai ENTRYPOINT ["./entrypoint.sh"] diff --git a/runpod/src/handler.py b/runpod/src/handler.py index c74886abc..534682659 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -9,12 +9,16 @@ # For download the weights from lorax import Client +import openai + # Prepare global variables JOBS = set() TGI_LOCAL_PORT = int(os.environ.get('TGI_LOCAL_PORT', 8080)) url = "http://127.0.0.1:{}".format(TGI_LOCAL_PORT) # Create the client client = Client(url) + + print(url) # Wait for the hugging face TGI worker to start running. 
while True: @@ -40,6 +44,19 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] ''' # Get job input job_input = job['input'] + # TODO do different things based on the openai_route. Right now, just assume we are calling the openai + # chat completions.generate method! + use_openai = 'openai_route' in job_input + + # Create a new client and pass the token for every handler call + openai_client = openai.Openai( + base_url=f"{url}/v1", + ) + + if use_openai: + if job_input['stream'] == False: + yield openai_client.chat.completions.create(**job_input) + # When we are called with a streaming endpoint, then we should have the field # _stream = True From 57c794d6436fe0cdc5a837b503b1c64ba5d86ee7 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 19:35:15 -0700 Subject: [PATCH 13/22] fix model dump thing --- runpod/src/handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runpod/src/handler.py b/runpod/src/handler.py index 534682659..581fc3d03 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -55,7 +55,7 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] if use_openai: if job_input['stream'] == False: - yield openai_client.chat.completions.create(**job_input) + yield openai_client.chat.completions.create(**job_input).model_dump() # When we are called with a streaming endpoint, then we should have the field From 743c956c3795170b1098a3078a9b084cdfc7e0b8 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 19:50:16 -0700 Subject: [PATCH 14/22] WIP --- runpod/src/handler.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/runpod/src/handler.py b/runpod/src/handler.py index 581fc3d03..d418b00a5 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -46,16 +46,19 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] job_input = job['input'] # TODO do different things based on the openai_route. Right now, just assume we are calling the openai # chat completions.generate method! 
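+    # (lorax exposes an OpenAI-compatible REST API under /v1 on the same local port,
+    # which is presumably why the stock `openai` client can be pointed at
+    # base_url=f"{url}/v1" below and handed job_input["openai_input"] unchanged.)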
+ print(job_input) + print("first print :P") use_openai = 'openai_route' in job_input # Create a new client and pass the token for every handler call openai_client = openai.Openai( base_url=f"{url}/v1", ) - + print(use_openai) if use_openai: - if job_input['stream'] == False: - yield openai_client.chat.completions.create(**job_input).model_dump() + # if job_input['stream'] == False: + print(job_input) + yield openai_client.chat.completions.create(**job_input).model_dump() # When we are called with a streaming endpoint, then we should have the field From 8f5ea6fbd519607c80c1cd9580849cf3eb5d128b Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 20:01:13 -0700 Subject: [PATCH 15/22] fix --- runpod/src/handler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runpod/src/handler.py b/runpod/src/handler.py index d418b00a5..616bc5305 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -51,14 +51,14 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] use_openai = 'openai_route' in job_input # Create a new client and pass the token for every handler call - openai_client = openai.Openai( + openai_client = openai.OpenAI( base_url=f"{url}/v1", ) print(use_openai) if use_openai: # if job_input['stream'] == False: print(job_input) - yield openai_client.chat.completions.create(**job_input).model_dump() + yield openai_client.chat.completions.create(**job_input["openai_input"]).model_dump() # When we are called with a streaming endpoint, then we should have the field From b196a975c71744906cb8d39eaa9a08ca9359797b Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 20:36:40 -0700 Subject: [PATCH 16/22] add fake api key? --- runpod/src/handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runpod/src/handler.py b/runpod/src/handler.py index 616bc5305..230577d87 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -53,6 +53,7 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] # Create a new client and pass the token for every handler call openai_client = openai.OpenAI( base_url=f"{url}/v1", + api_key="fake" ) print(use_openai) if use_openai: @@ -60,7 +61,6 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] print(job_input) yield openai_client.chat.completions.create(**job_input["openai_input"]).model_dump() - # When we are called with a streaming endpoint, then we should have the field # _stream = True From 14a312aae65abb99e2366bba5607ffb8e8b7221a Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Mon, 17 Jun 2024 10:20:57 -0700 Subject: [PATCH 17/22] fix logik --- runpod/src/handler.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/runpod/src/handler.py b/runpod/src/handler.py index 230577d87..111929b62 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -55,12 +55,26 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] base_url=f"{url}/v1", api_key="fake" ) + JOBS.add(job['id']) + print(use_openai) if use_openai: # if job_input['stream'] == False: print(job_input) - yield openai_client.chat.completions.create(**job_input["openai_input"]).model_dump() - + result = openai_client.chat.completions.create(**job_input["openai_input"]).model_dump() + yield result + else: + if job_input.get('_stream', False): + del job_input['_stream'] + # Streaming case + for response in client.generate_stream(**job_input): + if not 
response.token.special: + # Dump the repsonse into a dictionary + yield response.model_dump() + else: + del job_input['_stream'] + response = client.generate(**job_input) + yield response.model_dump() # When we are called with a streaming endpoint, then we should have the field # _stream = True @@ -72,20 +86,7 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] # if the user is auth'ed properly for the resource? # TODO handle key timeouts # Add job to the set. - JOBS.add(job['id']) - if job_input.get('_stream', False): - del job_input['_stream'] - # Streaming case - for response in client.generate_stream(**job_input): - if not response.token.special: - # Dump the repsonse into a dictionary - yield response.model_dump() - else: - del job_input['_stream'] - response = client.generate(**job_input) - yield response.model_dump() - - + # Remove job from the set. JOBS.remove(job['id']) From bd4312b42ad3ba4d9e747f6afe6f95746170cea4 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Mon, 17 Jun 2024 10:45:49 -0700 Subject: [PATCH 18/22] add in predibase API token as env var --- runpod/src/handler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/runpod/src/handler.py b/runpod/src/handler.py index 111929b62..4b9c9b7d4 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -17,6 +17,7 @@ url = "http://127.0.0.1:{}".format(TGI_LOCAL_PORT) # Create the client client = Client(url) +api_key = os.environ.get("PREDIBASE_API_KEY", "fake") print(url) @@ -53,7 +54,7 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] # Create a new client and pass the token for every handler call openai_client = openai.OpenAI( base_url=f"{url}/v1", - api_key="fake" + api_key=api_key ) JOBS.add(job['id']) From d3952d3fe1af633253292d40d0a9ca05f8a1f6d8 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Tue, 18 Jun 2024 13:01:02 -0700 Subject: [PATCH 19/22] try s3 crt? --- server/lorax_server/utils/sources/s3.py | 5 +- server/poetry.lock | 87 ++++++++++++++++++++----- server/pyproject.toml | 3 +- 3 files changed, 77 insertions(+), 18 deletions(-) diff --git a/server/lorax_server/utils/sources/s3.py b/server/lorax_server/utils/sources/s3.py index 085fbaeef..f6558f30d 100644 --- a/server/lorax_server/utils/sources/s3.py +++ b/server/lorax_server/utils/sources/s3.py @@ -13,6 +13,7 @@ LocalEntryNotFoundError, ) from loguru import logger +from boto3.s3.transfer import S3Transfer from .source import BaseModelSource, try_to_load_from_cache @@ -112,7 +113,9 @@ def download_file(filename): model_id_path = Path(model_id) bucket_file_name = model_id_path / filename logger.info(f"Downloading file {bucket_file_name} to {local_file_path}") - bucket.download_file(str(bucket_file_name), str(local_file_path)) + # use CRT? TODO change this? 
+ transfer = S3Transfer(boto3.client('s3', region_name="us-west-2")) + transfer.download_file(bucket.name, str(bucket_file_name), str(local_file_path)) # TODO: add support for revision logger.info(f"Downloaded {local_file_path} in {timedelta(seconds=int(time.time() - start_time))}.") if not local_file_path.is_file(): diff --git a/server/poetry.lock b/server/poetry.lock index 68649649a..9bcafe7ba 100644 --- a/server/poetry.lock +++ b/server/poetry.lock @@ -179,6 +179,57 @@ docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib- tests = ["attrs[tests-no-zope]", "zope-interface"] tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +[[package]] +name = "awscrt" +version = "0.20.11" +description = "A common runtime for AWS Python projects" +optional = false +python-versions = ">=3.7" +files = [ + {file = "awscrt-0.20.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3b50f70d85ecc2069029573bad8e5e06b9aabad283dd933bee6eb9dd694b9511"}, + {file = "awscrt-0.20.11-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3b4ec0b471cf7d6a7a0950553ddf97d58a0caf4a8350da9ca12250c7df6add94"}, + {file = "awscrt-0.20.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6b91cac82abf9718657e0694f90334e4ef4b2ef32061938ff0ceed67e302469"}, + {file = "awscrt-0.20.11-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0261ef47f5000d5ce069dec05edf9d803a3ff89c02bd574ec0585e2e4447aca6"}, + {file = "awscrt-0.20.11-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:81167651ccd45af55fa659a09b415eba881a9892415e465b6432a4f336311711"}, + {file = "awscrt-0.20.11-cp310-cp310-win32.whl", hash = "sha256:fb316c27110a19917a45dc7b678349bc329c98ac1b95d5bd872f0ad37300e725"}, + {file = "awscrt-0.20.11-cp310-cp310-win_amd64.whl", hash = "sha256:ae4910e1f534e0d5bb8bade0ce2b1908bfd36007115ac0a700b9cda5c5655f0c"}, + {file = "awscrt-0.20.11-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:86554f8042dea649b7d63a2e4de593864753aad736a7ca592e72b2f8a94535bb"}, + {file = "awscrt-0.20.11-cp311-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45db07c2f0f7c83d8a4cb91a51869b22f1f44c1053db7266486733aca2d2ac41"}, + {file = "awscrt-0.20.11-cp311-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c94917cce1df62fc40f53e19f5dcfbd036acfbdb1a88cba217ad6caaeab0d57"}, + {file = "awscrt-0.20.11-cp311-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:624322e103e62bffecf97731691e05ef0d7a50970d8e3b1872433dcf00c5595a"}, + {file = "awscrt-0.20.11-cp311-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:f10af50b747c2b237836ab1ed57dc1be0c2553e0fb485374f0d3be470a861e4a"}, + {file = "awscrt-0.20.11-cp311-abi3-win32.whl", hash = "sha256:fc7a8eecfc51503afd24764033a2061a5f39017ed6e825b6594490e04fd56297"}, + {file = "awscrt-0.20.11-cp311-abi3-win_amd64.whl", hash = "sha256:106ff16bce775917d4e9a8c93649b4f272c32a91336ae6ca97596dcb2faf2d44"}, + {file = "awscrt-0.20.11-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:5a7ba2227546522d5767308ff49876fbc0abd1771376710ce2cf4dd8b317b2b9"}, + {file = "awscrt-0.20.11-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82acde62286c7a1d7991b5bf92f192603ea9b3752b3bf28dae75300c05de1119"}, + {file = "awscrt-0.20.11-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:583bff89758f0d2cd9540c2c9b301836df21b71548f0fabfdff7fb484c960bf0"}, + {file = 
"awscrt-0.20.11-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6afdee4b204592eba1c75797407be976e9097682d27de6b0ec0c696ec0851758"}, + {file = "awscrt-0.20.11-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:0101be8b6b40e252eaead36eb1c4c87d53f6f0cd54d40e1ef571f984a36efb79"}, + {file = "awscrt-0.20.11-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d6d984eea9687555ca9d269ebbba8f090e1b7feab6f61d1b046548cd469cb2ab"}, + {file = "awscrt-0.20.11-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:bb634f7fbf02b5aee95d619ec3066c7f7a4d7cd6a156203115bdf8cbc715c4f3"}, + {file = "awscrt-0.20.11-cp37-cp37m-win32.whl", hash = "sha256:8fbae85b2d5106dd470b349314b3bcceb8812904675c98a1dbd2fe1efe92eb35"}, + {file = "awscrt-0.20.11-cp37-cp37m-win_amd64.whl", hash = "sha256:56cc06725038d625365f9bdebd4b9e3c9f876ead1a26473cb124c6dfa4b39fd1"}, + {file = "awscrt-0.20.11-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:58c4616343b9f4d6fc454816fb3459ac86489a242ade3c8126ec9d1aa8208ec0"}, + {file = "awscrt-0.20.11-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e323a779b8db858b3412a727c90dc9c058898fd7eb5e0f454ca94623007aa078"}, + {file = "awscrt-0.20.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e28303eb399d4fa3f5df79cca277d4ae434112590c4c9b60a21c397c6ad9da6"}, + {file = "awscrt-0.20.11-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:73a8a255d60cdb96bd6a93bf606055c918dc88cd9cca57be860efc113cb256b0"}, + {file = "awscrt-0.20.11-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:07592b2f9dffcd12745da06c10bc1de1e9f38ebc8996b98689bfaa860d600382"}, + {file = "awscrt-0.20.11-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b022a21e8bda0d3403e6115ff15652f65dc7250b0d0c1b3125c2c3e095647940"}, + {file = "awscrt-0.20.11-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:a28f8d8e6b95a0007d80d07763f21dc2cea73b35586559d5b0635c2d06347efc"}, + {file = "awscrt-0.20.11-cp38-cp38-win32.whl", hash = "sha256:2427d727494d48253e70c9e6a2135d91546524cf56e13b0a7c5c0713994281b0"}, + {file = "awscrt-0.20.11-cp38-cp38-win_amd64.whl", hash = "sha256:8f1bf72ba5a3a38215b7487c5fb9421dece5b5f63b22ade8f63315acbf1c3842"}, + {file = "awscrt-0.20.11-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:981df6f011086d30ee76e9476bf33b381bf3064cb3d02be1ea1aa46fec79110a"}, + {file = "awscrt-0.20.11-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfead1d73754718f7c6bb03add095750535237ea14c8226cd36d6b88fe7b5342"}, + {file = "awscrt-0.20.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8eb6d8c72485b0b14eb430ca9b1f280629277cd0a9d5d064ee02afffe787caa8"}, + {file = "awscrt-0.20.11-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f3fb9cbc3403032f57006806229d2c11ab8c425cee1f47f05d83ca87c1b94b32"}, + {file = "awscrt-0.20.11-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a2c4c1a577d55b98ae93af82bb96795a62661585d4560674b1daa034f41e6fb1"}, + {file = "awscrt-0.20.11-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7f68ce98ee54b634fbe48689aa4610965e3af0e5e1a11da82a791057c741fb99"}, + {file = "awscrt-0.20.11-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4404f6e517a5f0871943463fe59cec657586b2d0d1e6e2efbe9dbb0b42f5b3e8"}, + {file = "awscrt-0.20.11-cp39-cp39-win32.whl", hash = "sha256:8e95ce32b03006097f833b539d1bc3ea503379d880751ddbfd7bb0440e93c0c4"}, + {file = "awscrt-0.20.11-cp39-cp39-win_amd64.whl", hash = 
"sha256:20b00d68a90575121cf04250c93aa4874f7d1f7d2d81f37511c12a157be7421c"}, + {file = "awscrt-0.20.11.tar.gz", hash = "sha256:c3dbfb7f1909457952e645373e72b69f90c50c465ee6a46d9bbdc12acb79803c"}, +] + [[package]] name = "backoff" version = "2.2.1" @@ -203,44 +254,48 @@ files = [ [[package]] name = "boto3" -version = "1.34.4" +version = "1.34.129" description = "The AWS SDK for Python" optional = false -python-versions = ">= 3.8" +python-versions = ">=3.8" files = [ - {file = "boto3-1.34.4-py3-none-any.whl", hash = "sha256:1e836fe33da2684db29317911d9958389094ca5098cc253dbaed8e4aa146b153"}, - {file = "boto3-1.34.4.tar.gz", hash = "sha256:a866277fc38b121ac5dab0eec38b6ae6e3a59bbf6f67ed9a9822332d9e5e785f"}, + {file = "boto3-1.34.129-py3-none-any.whl", hash = "sha256:cc73de1c9d953b1f9da6ee2404af717e93d888f790f3e0291b22d1b8489eb401"}, + {file = "boto3-1.34.129.tar.gz", hash = "sha256:a7a696fd3e7f5f43a81450b441f3eb6c5a89d28efe867cd97d8fc73ea5d8c139"}, ] [package.dependencies] -botocore = ">=1.34.4,<1.35.0" +botocore = [ + {version = ">=1.34.129,<1.35.0"}, + {version = ">=1.21.0,<2.0a0", extras = ["crt"], optional = true, markers = "extra == \"crt\""}, +] jmespath = ">=0.7.1,<2.0.0" -s3transfer = ">=0.9.0,<0.10.0" +s3transfer = ">=0.10.0,<0.11.0" [package.extras] crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.34.4" +version = "1.34.129" description = "Low-level, data-driven core of boto 3." optional = false -python-versions = ">= 3.8" +python-versions = ">=3.8" files = [ - {file = "botocore-1.34.4-py3-none-any.whl", hash = "sha256:2026d89a46dfcb96d439db17a277de11b808428cba881deb50a5960b134e3a84"}, - {file = "botocore-1.34.4.tar.gz", hash = "sha256:5dcd63329cb3e65c533a72a68c99b7d07c99a29936ea07d0998120172c10b4f5"}, + {file = "botocore-1.34.129-py3-none-any.whl", hash = "sha256:86d3dd30996aa459e9c3321edac12aebe47c73cb4acc7556941f9b4c39726088"}, + {file = "botocore-1.34.129.tar.gz", hash = "sha256:7c56e25af6112d69c5d14a15b42f76ba7687687abc463a96ac5edca19c0a9c2d"}, ] [package.dependencies] +awscrt = {version = "0.20.11", optional = true, markers = "extra == \"crt\""} jmespath = ">=0.7.1,<2.0.0" python-dateutil = ">=2.1,<3.0.0" urllib3 = [ {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""}, - {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""}, + {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""}, ] [package.extras] -crt = ["awscrt (==0.19.17)"] +crt = ["awscrt (==0.20.11)"] [[package]] name = "certifi" @@ -2496,13 +2551,13 @@ files = [ [[package]] name = "s3transfer" -version = "0.9.0" +version = "0.10.1" description = "An Amazon S3 Transfer Manager" optional = false python-versions = ">= 3.8" files = [ - {file = "s3transfer-0.9.0-py3-none-any.whl", hash = "sha256:01d4d2c35a016db8cb14f9a4d5e84c1f8c96e7ffc211422555eed45c11fa7eb1"}, - {file = "s3transfer-0.9.0.tar.gz", hash = "sha256:9e1b186ec8bb5907a1e82b51237091889a9973a2bb799a924bcd9f301ff79d3d"}, + {file = "s3transfer-0.10.1-py3-none-any.whl", hash = "sha256:ceb252b11bcf87080fb7850a224fb6e05c8a776bab8f2b64b7f25b969464839d"}, + {file = "s3transfer-0.10.1.tar.gz", hash = "sha256:5683916b4c724f799e600f41dd9e10a9ff19871bf87623cc8f491cb4f5fa0a19"}, ] [package.dependencies] @@ -3577,4 +3632,4 @@ torch = ["torch"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "e52881fa075d917070103ac7f96cae7c648502cb82b4b48d54ec81d47e1b0ef9" +content-hash = "31b8e738ad9e0b578b35633680f09a1d2433fcc15501023bb2513b39b9d4c0df" diff --git 
a/server/pyproject.toml b/server/pyproject.toml index 1cfee5a7e..0437b28a8 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -34,13 +34,14 @@ texttable = { version = "^1.6.7", optional = true } datasets = { version = "^2.14.0", optional = true } torch = { version = "2.3.0", optional = true } peft = { version = "0.4.0", optional = true } -boto3 = "^1.28.34" +boto3 = {extras = ["crt"], version = "^1.34.129"} urllib3 = "<=1.26.18" hqq = { version = "^0.1.7", optional = true } stanford-stk = { version = "^0.7.0", markers = "sys_platform == 'linux'" } outlines = { version = "^0.0.40", optional = true } prometheus-client = "^0.20.0" py-cpuinfo = "^9.0.0" +s3transfer = "0.10.1" [tool.poetry.extras] torch = ["torch"] From f677da17b66d0ad8f8518f39ce72b75b50a531d8 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Tue, 18 Jun 2024 16:10:23 -0700 Subject: [PATCH 20/22] fix runpod sdk --- runpod/src/handler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/runpod/src/handler.py b/runpod/src/handler.py index 4b9c9b7d4..4a20aec42 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -73,7 +73,8 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] # Dump the repsonse into a dictionary yield response.model_dump() else: - del job_input['_stream'] + if '_stream' in job_input: + del job_input['_stream'] response = client.generate(**job_input) yield response.model_dump() # When we are called with a streaming endpoint, then we should have the field From 8b4eca47e47d462220e1a3e8ce0e7618ab22dbb6 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Tue, 18 Jun 2024 17:28:37 -0700 Subject: [PATCH 21/22] fix --- runpod/src/handler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/runpod/src/handler.py b/runpod/src/handler.py index 4a20aec42..ed2d8d4fc 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -65,17 +65,18 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] result = openai_client.chat.completions.create(**job_input["openai_input"]).model_dump() yield result else: + inputs = str(job_input.get('inputs')) if job_input.get('_stream', False): del job_input['_stream'] # Streaming case - for response in client.generate_stream(**job_input): + for response in client.generate_stream(inputs, **job_input.get('parameters', {})): if not response.token.special: # Dump the repsonse into a dictionary yield response.model_dump() else: if '_stream' in job_input: del job_input['_stream'] - response = client.generate(**job_input) + response = client.generate(inputs, **job_input.get('parameters', {})) yield response.model_dump() # When we are called with a streaming endpoint, then we should have the field # _stream = True From d79c04886915bafae677291f5b1be1cee7fc4d4c Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Thu, 20 Jun 2024 11:55:33 -0700 Subject: [PATCH 22/22] Revert "try s3 crt?" This reverts commit d3952d3fe1af633253292d40d0a9ca05f8a1f6d8. 
--- server/lorax_server/utils/sources/s3.py | 5 +- server/poetry.lock | 87 +++++-------------------- server/pyproject.toml | 3 +- 3 files changed, 18 insertions(+), 77 deletions(-) diff --git a/server/lorax_server/utils/sources/s3.py b/server/lorax_server/utils/sources/s3.py index f6558f30d..085fbaeef 100644 --- a/server/lorax_server/utils/sources/s3.py +++ b/server/lorax_server/utils/sources/s3.py @@ -13,7 +13,6 @@ LocalEntryNotFoundError, ) from loguru import logger -from boto3.s3.transfer import S3Transfer from .source import BaseModelSource, try_to_load_from_cache @@ -113,9 +112,7 @@ def download_file(filename): model_id_path = Path(model_id) bucket_file_name = model_id_path / filename logger.info(f"Downloading file {bucket_file_name} to {local_file_path}") - # use CRT? TODO change this? - transfer = S3Transfer(boto3.client('s3', region_name="us-west-2")) - transfer.download_file(bucket.name, str(bucket_file_name), str(local_file_path)) + bucket.download_file(str(bucket_file_name), str(local_file_path)) # TODO: add support for revision logger.info(f"Downloaded {local_file_path} in {timedelta(seconds=int(time.time() - start_time))}.") if not local_file_path.is_file(): diff --git a/server/poetry.lock b/server/poetry.lock index 9bcafe7ba..68649649a 100644 --- a/server/poetry.lock +++ b/server/poetry.lock @@ -179,57 +179,6 @@ docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib- tests = ["attrs[tests-no-zope]", "zope-interface"] tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -[[package]] -name = "awscrt" -version = "0.20.11" -description = "A common runtime for AWS Python projects" -optional = false -python-versions = ">=3.7" -files = [ - {file = "awscrt-0.20.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3b50f70d85ecc2069029573bad8e5e06b9aabad283dd933bee6eb9dd694b9511"}, - {file = "awscrt-0.20.11-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3b4ec0b471cf7d6a7a0950553ddf97d58a0caf4a8350da9ca12250c7df6add94"}, - {file = "awscrt-0.20.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6b91cac82abf9718657e0694f90334e4ef4b2ef32061938ff0ceed67e302469"}, - {file = "awscrt-0.20.11-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0261ef47f5000d5ce069dec05edf9d803a3ff89c02bd574ec0585e2e4447aca6"}, - {file = "awscrt-0.20.11-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:81167651ccd45af55fa659a09b415eba881a9892415e465b6432a4f336311711"}, - {file = "awscrt-0.20.11-cp310-cp310-win32.whl", hash = "sha256:fb316c27110a19917a45dc7b678349bc329c98ac1b95d5bd872f0ad37300e725"}, - {file = "awscrt-0.20.11-cp310-cp310-win_amd64.whl", hash = "sha256:ae4910e1f534e0d5bb8bade0ce2b1908bfd36007115ac0a700b9cda5c5655f0c"}, - {file = "awscrt-0.20.11-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:86554f8042dea649b7d63a2e4de593864753aad736a7ca592e72b2f8a94535bb"}, - {file = "awscrt-0.20.11-cp311-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45db07c2f0f7c83d8a4cb91a51869b22f1f44c1053db7266486733aca2d2ac41"}, - {file = "awscrt-0.20.11-cp311-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c94917cce1df62fc40f53e19f5dcfbd036acfbdb1a88cba217ad6caaeab0d57"}, - {file = "awscrt-0.20.11-cp311-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:624322e103e62bffecf97731691e05ef0d7a50970d8e3b1872433dcf00c5595a"}, - {file = "awscrt-0.20.11-cp311-abi3-musllinux_1_1_x86_64.whl", hash = 
"sha256:f10af50b747c2b237836ab1ed57dc1be0c2553e0fb485374f0d3be470a861e4a"}, - {file = "awscrt-0.20.11-cp311-abi3-win32.whl", hash = "sha256:fc7a8eecfc51503afd24764033a2061a5f39017ed6e825b6594490e04fd56297"}, - {file = "awscrt-0.20.11-cp311-abi3-win_amd64.whl", hash = "sha256:106ff16bce775917d4e9a8c93649b4f272c32a91336ae6ca97596dcb2faf2d44"}, - {file = "awscrt-0.20.11-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:5a7ba2227546522d5767308ff49876fbc0abd1771376710ce2cf4dd8b317b2b9"}, - {file = "awscrt-0.20.11-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82acde62286c7a1d7991b5bf92f192603ea9b3752b3bf28dae75300c05de1119"}, - {file = "awscrt-0.20.11-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:583bff89758f0d2cd9540c2c9b301836df21b71548f0fabfdff7fb484c960bf0"}, - {file = "awscrt-0.20.11-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6afdee4b204592eba1c75797407be976e9097682d27de6b0ec0c696ec0851758"}, - {file = "awscrt-0.20.11-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:0101be8b6b40e252eaead36eb1c4c87d53f6f0cd54d40e1ef571f984a36efb79"}, - {file = "awscrt-0.20.11-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d6d984eea9687555ca9d269ebbba8f090e1b7feab6f61d1b046548cd469cb2ab"}, - {file = "awscrt-0.20.11-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:bb634f7fbf02b5aee95d619ec3066c7f7a4d7cd6a156203115bdf8cbc715c4f3"}, - {file = "awscrt-0.20.11-cp37-cp37m-win32.whl", hash = "sha256:8fbae85b2d5106dd470b349314b3bcceb8812904675c98a1dbd2fe1efe92eb35"}, - {file = "awscrt-0.20.11-cp37-cp37m-win_amd64.whl", hash = "sha256:56cc06725038d625365f9bdebd4b9e3c9f876ead1a26473cb124c6dfa4b39fd1"}, - {file = "awscrt-0.20.11-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:58c4616343b9f4d6fc454816fb3459ac86489a242ade3c8126ec9d1aa8208ec0"}, - {file = "awscrt-0.20.11-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e323a779b8db858b3412a727c90dc9c058898fd7eb5e0f454ca94623007aa078"}, - {file = "awscrt-0.20.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e28303eb399d4fa3f5df79cca277d4ae434112590c4c9b60a21c397c6ad9da6"}, - {file = "awscrt-0.20.11-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:73a8a255d60cdb96bd6a93bf606055c918dc88cd9cca57be860efc113cb256b0"}, - {file = "awscrt-0.20.11-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:07592b2f9dffcd12745da06c10bc1de1e9f38ebc8996b98689bfaa860d600382"}, - {file = "awscrt-0.20.11-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b022a21e8bda0d3403e6115ff15652f65dc7250b0d0c1b3125c2c3e095647940"}, - {file = "awscrt-0.20.11-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:a28f8d8e6b95a0007d80d07763f21dc2cea73b35586559d5b0635c2d06347efc"}, - {file = "awscrt-0.20.11-cp38-cp38-win32.whl", hash = "sha256:2427d727494d48253e70c9e6a2135d91546524cf56e13b0a7c5c0713994281b0"}, - {file = "awscrt-0.20.11-cp38-cp38-win_amd64.whl", hash = "sha256:8f1bf72ba5a3a38215b7487c5fb9421dece5b5f63b22ade8f63315acbf1c3842"}, - {file = "awscrt-0.20.11-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:981df6f011086d30ee76e9476bf33b381bf3064cb3d02be1ea1aa46fec79110a"}, - {file = "awscrt-0.20.11-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfead1d73754718f7c6bb03add095750535237ea14c8226cd36d6b88fe7b5342"}, - {file = "awscrt-0.20.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:8eb6d8c72485b0b14eb430ca9b1f280629277cd0a9d5d064ee02afffe787caa8"}, - {file = "awscrt-0.20.11-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f3fb9cbc3403032f57006806229d2c11ab8c425cee1f47f05d83ca87c1b94b32"}, - {file = "awscrt-0.20.11-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a2c4c1a577d55b98ae93af82bb96795a62661585d4560674b1daa034f41e6fb1"}, - {file = "awscrt-0.20.11-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7f68ce98ee54b634fbe48689aa4610965e3af0e5e1a11da82a791057c741fb99"}, - {file = "awscrt-0.20.11-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4404f6e517a5f0871943463fe59cec657586b2d0d1e6e2efbe9dbb0b42f5b3e8"}, - {file = "awscrt-0.20.11-cp39-cp39-win32.whl", hash = "sha256:8e95ce32b03006097f833b539d1bc3ea503379d880751ddbfd7bb0440e93c0c4"}, - {file = "awscrt-0.20.11-cp39-cp39-win_amd64.whl", hash = "sha256:20b00d68a90575121cf04250c93aa4874f7d1f7d2d81f37511c12a157be7421c"}, - {file = "awscrt-0.20.11.tar.gz", hash = "sha256:c3dbfb7f1909457952e645373e72b69f90c50c465ee6a46d9bbdc12acb79803c"}, -] - [[package]] name = "backoff" version = "2.2.1" @@ -254,48 +203,44 @@ files = [ [[package]] name = "boto3" -version = "1.34.129" +version = "1.34.4" description = "The AWS SDK for Python" optional = false -python-versions = ">=3.8" +python-versions = ">= 3.8" files = [ - {file = "boto3-1.34.129-py3-none-any.whl", hash = "sha256:cc73de1c9d953b1f9da6ee2404af717e93d888f790f3e0291b22d1b8489eb401"}, - {file = "boto3-1.34.129.tar.gz", hash = "sha256:a7a696fd3e7f5f43a81450b441f3eb6c5a89d28efe867cd97d8fc73ea5d8c139"}, + {file = "boto3-1.34.4-py3-none-any.whl", hash = "sha256:1e836fe33da2684db29317911d9958389094ca5098cc253dbaed8e4aa146b153"}, + {file = "boto3-1.34.4.tar.gz", hash = "sha256:a866277fc38b121ac5dab0eec38b6ae6e3a59bbf6f67ed9a9822332d9e5e785f"}, ] [package.dependencies] -botocore = [ - {version = ">=1.34.129,<1.35.0"}, - {version = ">=1.21.0,<2.0a0", extras = ["crt"], optional = true, markers = "extra == \"crt\""}, -] +botocore = ">=1.34.4,<1.35.0" jmespath = ">=0.7.1,<2.0.0" -s3transfer = ">=0.10.0,<0.11.0" +s3transfer = ">=0.9.0,<0.10.0" [package.extras] crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.34.129" +version = "1.34.4" description = "Low-level, data-driven core of boto 3." 
optional = false -python-versions = ">=3.8" +python-versions = ">= 3.8" files = [ - {file = "botocore-1.34.129-py3-none-any.whl", hash = "sha256:86d3dd30996aa459e9c3321edac12aebe47c73cb4acc7556941f9b4c39726088"}, - {file = "botocore-1.34.129.tar.gz", hash = "sha256:7c56e25af6112d69c5d14a15b42f76ba7687687abc463a96ac5edca19c0a9c2d"}, + {file = "botocore-1.34.4-py3-none-any.whl", hash = "sha256:2026d89a46dfcb96d439db17a277de11b808428cba881deb50a5960b134e3a84"}, + {file = "botocore-1.34.4.tar.gz", hash = "sha256:5dcd63329cb3e65c533a72a68c99b7d07c99a29936ea07d0998120172c10b4f5"}, ] [package.dependencies] -awscrt = {version = "0.20.11", optional = true, markers = "extra == \"crt\""} jmespath = ">=0.7.1,<2.0.0" python-dateutil = ">=2.1,<3.0.0" urllib3 = [ {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""}, - {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""}, + {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""}, ] [package.extras] -crt = ["awscrt (==0.20.11)"] +crt = ["awscrt (==0.19.17)"] [[package]] name = "certifi" @@ -2551,13 +2496,13 @@ files = [ [[package]] name = "s3transfer" -version = "0.10.1" +version = "0.9.0" description = "An Amazon S3 Transfer Manager" optional = false python-versions = ">= 3.8" files = [ - {file = "s3transfer-0.10.1-py3-none-any.whl", hash = "sha256:ceb252b11bcf87080fb7850a224fb6e05c8a776bab8f2b64b7f25b969464839d"}, - {file = "s3transfer-0.10.1.tar.gz", hash = "sha256:5683916b4c724f799e600f41dd9e10a9ff19871bf87623cc8f491cb4f5fa0a19"}, + {file = "s3transfer-0.9.0-py3-none-any.whl", hash = "sha256:01d4d2c35a016db8cb14f9a4d5e84c1f8c96e7ffc211422555eed45c11fa7eb1"}, + {file = "s3transfer-0.9.0.tar.gz", hash = "sha256:9e1b186ec8bb5907a1e82b51237091889a9973a2bb799a924bcd9f301ff79d3d"}, ] [package.dependencies] @@ -3632,4 +3577,4 @@ torch = ["torch"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "31b8e738ad9e0b578b35633680f09a1d2433fcc15501023bb2513b39b9d4c0df" +content-hash = "e52881fa075d917070103ac7f96cae7c648502cb82b4b48d54ec81d47e1b0ef9" diff --git a/server/pyproject.toml b/server/pyproject.toml index 0437b28a8..1cfee5a7e 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -34,14 +34,13 @@ texttable = { version = "^1.6.7", optional = true } datasets = { version = "^2.14.0", optional = true } torch = { version = "2.3.0", optional = true } peft = { version = "0.4.0", optional = true } -boto3 = {extras = ["crt"], version = "^1.34.129"} +boto3 = "^1.28.34" urllib3 = "<=1.26.18" hqq = { version = "^0.1.7", optional = true } stanford-stk = { version = "^0.7.0", markers = "sys_platform == 'linux'" } outlines = { version = "^0.0.40", optional = true } prometheus-client = "^0.20.0" py-cpuinfo = "^9.0.0" -s3transfer = "0.10.1" [tool.poetry.extras] torch = ["torch"]
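
For reference, the handler logic that patches 13-21 converge on can be read as one piece. The sketch below is a condensed, non-authoritative reconstruction of runpod/src/handler.py after those patches: it assumes the module-level names visible in the hunk context (url, client, api_key, JOBS, TGI_LOCAL_PORT), hard-codes an illustrative port value, and drops the temporary print debugging.

import os

import openai
from lorax import Client

# Assumed module-level setup, mirroring the hunk context in the patches above;
# the port value is illustrative.
TGI_LOCAL_PORT = 8080
url = "http://127.0.0.1:{}".format(TGI_LOCAL_PORT)
client = Client(url)
api_key = os.environ.get("PREDIBASE_API_KEY", "fake")
JOBS = set()

async def handler_streaming(job: dict):
    # Route on the presence of 'openai_route' in the job input.
    job_input = job['input']
    use_openai = 'openai_route' in job_input

    # A fresh OpenAI-compatible client per call, pointed at the local lorax-launcher.
    openai_client = openai.OpenAI(base_url=f"{url}/v1", api_key=api_key)

    JOBS.add(job['id'])
    if use_openai:
        # OpenAI path: the chat-completions payload is carried under 'openai_input'.
        yield openai_client.chat.completions.create(**job_input["openai_input"]).model_dump()
    else:
        # LoRAX path: prompt under 'inputs', generation kwargs under 'parameters'.
        inputs = str(job_input.get('inputs'))
        if job_input.get('_stream', False):
            del job_input['_stream']
            # Streaming case: emit each non-special token as a dict.
            for response in client.generate_stream(inputs, **job_input.get('parameters', {})):
                if not response.token.special:
                    yield response.model_dump()
        else:
            if '_stream' in job_input:
                del job_input['_stream']
            response = client.generate(inputs, **job_input.get('parameters', {}))
            yield response.model_dump()
    JOBS.remove(job['id'])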
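
The handler distinguishes two job payload shapes. The values below are purely illustrative (model name, prompt, and parameter choices are not taken from this series); only the key names 'openai_route', 'openai_input', 'inputs', 'parameters', and '_stream' are what the handler actually inspects.

# OpenAI-compatible request: the presence of 'openai_route' selects the chat-completions branch.
openai_style_input = {
    "openai_route": "/v1/chat/completions",    # illustrative; only the key's presence is checked
    "openai_input": {
        "model": "my-adapter",                 # illustrative adapter/model name
        "messages": [{"role": "user", "content": "Hello!"}],
        "stream": False,
    },
}

# Plain LoRAX request: prompt under 'inputs', generation kwargs under 'parameters',
# and '_stream' choosing generate_stream() over generate().
lorax_style_input = {
    "inputs": "Hello!",
    "parameters": {"max_new_tokens": 64},      # illustrative generation parameter
    "_stream": True,
}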
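
Patch 19 swaps the per-file bucket.download_file call for an explicit boto3 S3Transfer object (alongside moving to boto3[crt] in pyproject/poetry), and patch 22 reverts the experiment. Stripped of the surrounding LoRAX source, the attempted change amounts to the following sketch; the bucket, key, and local path are placeholders.

import boto3
from boto3.s3.transfer import S3Transfer

# Explicit transfer object over a low-level S3 client, as tried (and later reverted) in this series.
transfer = S3Transfer(boto3.client("s3", region_name="us-west-2"))
transfer.download_file(
    "my-model-bucket",                       # placeholder bucket name
    "my-model-id/model.safetensors",         # placeholder object key
    "/data/my-model-id/model.safetensors",   # placeholder local path
)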