From be385ef9f4d708b7b272ab3ba79db824c37b3c6c Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Thu, 13 Jun 2024 12:12:59 -0700 Subject: [PATCH 01/22] runpod --- runpod/Dockerfile | 75 +++++++++ runpod/builder/setup.sh | 23 +++ runpod/src/entrypoint.sh | 45 +++++ runpod/src/handler.py | 91 ++++++++++ runpod/src/server.py | 352 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 586 insertions(+) create mode 100644 runpod/Dockerfile create mode 100644 runpod/builder/setup.sh create mode 100644 runpod/src/entrypoint.sh create mode 100644 runpod/src/handler.py create mode 100644 runpod/src/server.py diff --git a/runpod/Dockerfile b/runpod/Dockerfile new file mode 100644 index 000000000..3fee1dd51 --- /dev/null +++ b/runpod/Dockerfile @@ -0,0 +1,75 @@ +# Base image +FROM ghcr.io/predibase/lorax:0.10.0 +ENV DEBIAN_FRONTEND=noninteractive + +# Set the working directory +WORKDIR / + +# Update and upgrade the system packages (Worker Template) +COPY builder/setup.sh /setup.sh +RUN /bin/bash /setup.sh && \ + rm /setup.sh + +# Install Python dependencies (Worker Template) +# COPY builder/requirements.txt /requirements.txt +# RUN python3 -m pip install --upgrade pip && \ +# python3 -m pip install --upgrade -r /requirements.txt --no-cache-dir && \ +# rm /requirements.txt + +# Add src files (Worker Template) +ADD src . + +# Whether to download the model into /runpod-volume or not. +ARG DOWNLOAD_MODEL= +ENV DOWNLOAD_MODEL=$DOWNLOAD_MODEL + +# Set environment variables +ARG HF_MODEL_ID= +ENV HF_MODEL_ID=$HF_MODEL_ID + +ARG HF_MODEL_REVISION= +ENV HF_MODEL_REVISION=$HF_MODEL_REVISION + +ARG SM_NUM_GPUS= +ENV SM_NUM_GPUS=$SM_NUM_GPUS + +ARG HF_MODEL_QUANTIZE= +ENV HF_MODEL_QUANTIZE=$HF_MODEL_QUANTIZE + +ARG HF_MODEL_TRUST_REMOTE_CODE= +ENV HF_MODEL_TRUST_REMOTE_CODE=$HF_MODEL_TRUST_REMOTE_CODE + +ARG MODEL_BASE_PATH="/runpod-volume/" +ENV MODEL_BASE_PATH=$MODEL_BASE_PATH + +ARG HUGGING_FACE_HUB_TOKEN= +ENV HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN + +ARG HF_MAX_TOTAL_TOKENS= +ENV HF_MAX_TOTAL_TOKENS=$HF_MAX_TOTAL_TOKENS + +ARG HF_MAX_INPUT_LENGTH= +ENV HF_MAX_INPUT_LENGTH=$HF_MAX_INPUT_LENGTH + +ARG HF_MAX_BATCH_TOTAL_TOKENS= +ENV HF_MAX_BATCH_TOTAL_TOKENS=$HF_MAX_BATCH_TOTAL_TOKENS + +ARG HF_MAX_BATCH_PREFILL_TOKENS= +ENV HF_MAX_BATCH_PREFILL_TOKENS=$HF_MAX_BATCH_PREFILL_TOKENS + +# Prepare the hugging face directories for caching datasets, models, and more. 
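+# (These paths live under /runpod-volume — presumably the RunPod network volume,
+# per the DOWNLOAD_MODEL note above — so cached weights persist across container
+# restarts instead of being re-downloaded on every cold start.)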
+ENV HF_DATASETS_CACHE="/runpod-volume/huggingface-cache/datasets" +ENV HUGGINGFACE_HUB_CACHE="/runpod-volume/huggingface-cache/hub" +ENV TRANSFORMERS_CACHE="/runpod-volume/huggingface-cache/hub" + +# Conditionally download the model weights based on DOWNLOAD_MODEL +RUN if [ "$DOWNLOAD_MODEL" = "1" ]; then \ + lorax-server download-weights $HF_MODEL_ID; \ + fi + +# Quick temporary updates +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 +RUN python3.10 -m pip install git+https://github.com/runpod/runpod-python@a1#egg=runpod --compile +RUN python3.10 -m pip install lorax-client + +ENTRYPOINT ["./entrypoint.sh"] diff --git a/runpod/builder/setup.sh b/runpod/builder/setup.sh new file mode 100644 index 000000000..2b9926ec8 --- /dev/null +++ b/runpod/builder/setup.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Stop script on error +set -e + +# Update System +apt-get update && apt-get upgrade -y + +# Install System Dependencies +# - openssh-server: for ssh access and web terminal +apt-get install -y --no-install-recommends software-properties-common curl git openssh-server + +# Install Python 3.10 +add-apt-repository ppa:deadsnakes/ppa -y +apt-get update && apt-get install -y --no-install-recommends python3.10 python3.10-dev python3.10-distutils +update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 + +# Install pip for Python 3.10 +curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py +python3 get-pip.py + +# Clean up +apt-get autoremove -y && apt-get clean -y && rm -rf /var/lib/apt/lists/* diff --git a/runpod/src/entrypoint.sh b/runpod/src/entrypoint.sh new file mode 100644 index 000000000..09e69a875 --- /dev/null +++ b/runpod/src/entrypoint.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +# if [[ -z "${HF_MODEL_ID}" ]]; then +# echo "HF_MODEL_ID must be set" +# exit 1 +# fi +# export MODEL_ID="${HF_MODEL_ID}" + +if [[ -n "${HF_MODEL_REVISION}" ]]; then + export REVISION="${HF_MODEL_REVISION}" +fi + +if [[ -n "${SM_NUM_GPUS}" ]]; then + export NUM_SHARD="${SM_NUM_GPUS}" +fi + +if [[ -n "${HF_MODEL_QUANTIZE}" ]]; then + export QUANTIZE="${HF_MODEL_QUANTIZE}" +fi + +if [[ -n "${HF_MODEL_TRUST_REMOTE_CODE}" ]]; then + export TRUST_REMOTE_CODE="${HF_MODEL_TRUST_REMOTE_CODE}" +fi + +if [[ -n "${HF_MAX_TOTAL_TOKENS}" ]]; then + export MAX_TOTAL_TOKENS="${HF_MAX_TOTAL_TOKENS}" +fi + +if [[ -n "${HF_MAX_INPUT_LENGTH}" ]]; then + export MAX_INPUT_LENGTH="${HF_MAX_INPUT_LENGTH}" +fi + +if [[ -n "${HF_MAX_BATCH_TOTAL_TOKENS}" ]]; then + export MAX_BATCH_TOTAL_TOKENS="${HF_MAX_BATCH_TOTAL_TOKENS}" +fi + +if [[ -n "${HF_MAX_BATCH_PREFILL_TOKENS}" ]]; then + export MAX_BATCH_PREFILL_TOKENS="${HF_MAX_BATCH_PREFILL_TOKENS}" +fi + +# Start the text generation server +nohup lorax-launcher --port 8080 --model-id predibase/Mistral-7B-v0.1-dequantized --adapter-source hub --default-adapter-source pbase --max-batch-prefill-tokens 32768 --max-total-tokens 8192 --max-input-length 8191 --max-concurrent-requests 1024 & + +# Start the handler using python 3.10 +python3.10 -u /handler.py diff --git a/runpod/src/handler.py b/runpod/src/handler.py new file mode 100644 index 000000000..39ac671ca --- /dev/null +++ b/runpod/src/handler.py @@ -0,0 +1,91 @@ +#!/usr/bin/env python +''' Contains the handler function that will be called by the serverless. 
''' + +from typing import Generator +import runpod +import os +import time + +# For download the weights +from lorax import Client + +# Prepare global variables +JOBS = set() +TGI_LOCAL_PORT = int(os.environ.get('TGI_LOCAL_PORT', 8080)) + +# Create the client +client = Client("http://127.0.0.1:{}".format(TGI_LOCAL_PORT)) + +# Wait for the hugging face TGI worker to start running. +while True: + try: + client.generate("Why is the sky blue?").generated_text + print("Successfully cold booted the hugging face text generation inference server!") + + # Break from the while loop + break + + except Exception as e: + print("The hugging face text generation inference server is still cold booting...") + time.sleep(5) + +def concurrency_controller() -> bool: + # Handle at most 100 jobs at a time. + return len(JOBS) > 20 + +async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None]: + ''' + This is the handler function that will be called by the serverless. + ''' + # Get job input + job_input = job['input'] + + # Prompts + prompt = job_input['prompt'] + + # Validate the inputs + sampling_params = job_input.get('sampling_params', {}) + + # Add job to the set. + JOBS.add(job['id']) + + # Include metrics in the highest level for the job output for aggregrate. + def aggregate_function(streamed_outputs): + aggregate_output = "" + for stream in streamed_outputs: + aggregate_output += stream['text'] + + # Aggregate metrics to expose to the user + # input_tokens = -1 # TBD + # output_tokens = -1 # TBD + + return { + "text": aggregate_output, + # "input_tokens": input_tokens, + # "output_tokens": output_tokens, + } + + # Streaming case + for response in client.generate_stream(prompt, **sampling_params): + if not response.token.special: + text_outputs = response.token.text + ret = {"text": text_outputs} + + # Update the aggregate transformation function + runpod.serverless.modules.rp_metrics.metrics_collector.update_stream_aggregate( + job_id=job['id'], + aggregate_function=aggregate_function + ) + + yield ret + + # Remove job from the set. + JOBS.remove(job['id']) + +# Start the serverless worker with appropriate settings +print("Starting the TGI serverless worker with streaming enabled.") +runpod.serverless.start({ + "handler": handler_streaming, + "concurrency_controller": concurrency_controller, + "return_aggregate_stream": True +}) diff --git a/runpod/src/server.py b/runpod/src/server.py new file mode 100644 index 000000000..63cc39c77 --- /dev/null +++ b/runpod/src/server.py @@ -0,0 +1,352 @@ +import asyncio +import os +from pathlib import Path +from typing import List, Optional + +import torch +from grpc import aio +from grpc_reflection.v1alpha import reflection +from loguru import logger + +from lorax_server.adapters.utils import download_adapter +from lorax_server.cache import Cache +from lorax_server.interceptor import ExceptionInterceptor +from lorax_server.models import Model, get_model +from lorax_server.pb import generate_pb2, generate_pb2_grpc +from lorax_server.tracing import UDSOpenTelemetryAioServerInterceptor +from lorax_server.utils import HUB, LOCAL, PBASE, S3, map_pbase_model_id_to_s3 +from lorax_server.utils.adapter import BASE_MODEL_ADAPTER_ID, is_base_model +from lorax_server.utils.sgmv import has_sgmv +from lorax_server.utils.state import set_speculative_tokens + + +class LoraxService(generate_pb2_grpc.LoraxServiceServicer): + """ + Implementation of the LoraxService gRPC service. + + Args: + model (Model): The model used for inference. 
+ cache (Cache): The cache used for storing and retrieving batches. + server_urls (List[str]): List of server URLs for service discovery. + """ + + def __init__(self, model: Model, cache: Cache, server_urls: List[str]): + self.cache = cache + self.model = model + self.server_urls = server_urls + # For some reason, inference_mode does not work well with GLOO which we use on CPU + if model.device.type == "cuda": + # Force inference mode for the lifetime of LoraxService + self._inference_mode_raii_guard = torch._C._InferenceMode(True) + + async def Info(self, request, context): + return self.model.info + + async def Health(self, request, context): + if self.model.device.type == "cuda": + torch.zeros((2, 2)).cuda() + return generate_pb2.HealthResponse() + + async def ServiceDiscovery(self, request, context): + return generate_pb2.ServiceDiscoveryResponse(urls=self.server_urls) + + async def ClearCache(self, request, context): + if request.HasField("id"): + self.cache.delete(request.id) + else: + self.cache.clear() + return generate_pb2.ClearCacheResponse() + + async def FilterBatch(self, request, context): + batch = self.cache.pop(request.batch_id) + if batch is None: + raise ValueError(f"Batch ID {request.batch_id} not found in cache.") + filtered_batch = batch.filter(request.request_ids) + self.cache.set(filtered_batch) + + return generate_pb2.FilterBatchResponse(batch=filtered_batch.to_pb()) + + async def Warmup(self, request: generate_pb2.WarmupRequest, context): + batch = self.model.batch_type.from_pb( + request.batch, + self.model.tokenizer, + self.model.tokenizers, + self.model.dtype, + self.model.device, + ) + max_supported_total_tokens = self.model.warmup(batch, request.max_new_tokens) + + return generate_pb2.WarmupResponse(max_supported_total_tokens=max_supported_total_tokens) + + async def Prefill(self, request: generate_pb2.PrefillRequest, context): + batch = self.model.batch_type.from_pb( + request.batch, + self.model.tokenizer, + self.model.tokenizers, + self.model.dtype, + self.model.device, + ) + + generations, next_batch = self.model.generate_token(batch) + self.cache.set(next_batch) + + return generate_pb2.PrefillResponse( + generations=[generation.to_pb() for generation in generations], + batch=next_batch.to_pb() if next_batch else None, + ) + + async def Embed(self, request: generate_pb2.EmbedRequest, context): + if not self.model.supports_embeddings: + raise ValueError("Model does not support embeddings") + + batch = self.model.batch_type.from_pb( + request.batch, + self.model.tokenizer, + self.model.tokenizers, + self.model.dtype, + self.model.device, + ) + embeddings = self.model.embed(batch) + embeddings_proto = [] + for i, embedding in enumerate(embeddings): + embeddings_proto.append(generate_pb2.Embedding(request_id=batch.request_ids[i], values=embedding)) + return generate_pb2.EmbedResponse(embeddings=embeddings_proto) + + async def Decode(self, request: generate_pb2.DecodeRequest, context): + if len(request.batches) == 0: + raise ValueError("Must provide at least one batch") + + batches = [] + for batch_pb in request.batches: + batch = self.cache.pop(batch_pb.id) + if batch is None: + raise ValueError(f"Batch ID {batch_pb.id} not found in cache.") + batches.append(batch) + + if len(batches) == 0: + raise ValueError("All batches are empty") + + if len(batches) > 1: + batch = self.model.batch_type.concatenate(batches) + else: + batch = batches[0] + + generations, next_batch = self.model.generate_token(batch) + self.cache.set(next_batch) + + return 
generate_pb2.DecodeResponse( + generations=[generation.to_pb() for generation in generations], + batch=next_batch.to_pb() if next_batch else None, + ) + + async def DownloadAdapter(self, request: generate_pb2.DownloadAdapterRequest, context): + adapter_parameters = request.adapter_parameters + if is_base_model(adapter_parameters): + logger.info("No adapter to download for base model. Skipping.") + return generate_pb2.DownloadAdapterResponse(downloaded=False) + + adapter_bytes = 0 + api_token = request.api_token + adapter_source = _adapter_source_enum_to_string(request.adapter_source) + for adapter_id in adapter_parameters.adapter_ids: + if adapter_id == BASE_MODEL_ADAPTER_ID: + logger.info("No adapter to download for base model. Skipping.") + continue + + adapter_bytes += download_adapter(adapter_id, adapter_source, api_token) + + adapter_memory_size = self.model.adapter_memory_size() + if adapter_memory_size > 0: + logger.info( + f"Downloaded adapter {adapter_id} memory size: {adapter_bytes} bytes " + f"(reservation: {adapter_memory_size} bytes)" + ) + adapter_memory_fraction = adapter_bytes / adapter_memory_size + if adapter_memory_fraction > 1: + raise ValueError( + f"Adapter {adapter_id} is larger than adapter memory reservation: " + f"{adapter_bytes} / {adapter_memory_size} bytes" + ) + else: + # Assume 0.0 memory fraction if adapter memory size is not set + logger.info( + f"Downloaded adapter {adapter_id} memory size: {adapter_bytes} bytes " f"(no reservation limit)" + ) + adapter_memory_fraction = 0.0 + + return generate_pb2.DownloadAdapterResponse(downloaded=True, memory_fraction=adapter_memory_fraction) + + async def LoadAdapter(self, request: generate_pb2.LoadAdapterRequest, context): + adapter_parameters = request.adapter_parameters + if is_base_model(adapter_parameters): + logger.info("No adapter to load for base model. Skipping.") + return generate_pb2.LoadAdapterResponse(loaded=False) + + try: + adapter_source = _adapter_source_enum_to_string(request.adapter_source) + adapter_index = request.adapter_index + api_token = request.api_token + + if adapter_source == PBASE: + for i in range(len(adapter_parameters.adapter_ids)): + adapter_id = adapter_parameters.adapter_ids[i] + adapter_id = map_pbase_model_id_to_s3(adapter_id, api_token) + adapter_parameters.adapter_ids[i] = adapter_id + adapter_source = S3 + + self.model.load_adapter(adapter_parameters, adapter_source, adapter_index, api_token) + + return generate_pb2.LoadAdapterResponse(loaded=True) + except Exception: + logger.exception("Error when loading adapter") + raise + + async def OffloadAdapter(self, request: generate_pb2.OffloadAdapterRequest, context): + adapter_parameters = request.adapter_parameters + if is_base_model(adapter_parameters): + logger.info("No adapter to offload for base model. 
Skipping.") + return generate_pb2.OffloadAdapterResponse(offloaded=False) + + try: + adapter_idx = request.adapter_index + adapter_source = _adapter_source_enum_to_string(request.adapter_source) + adapter_index = request.adapter_index + self.model.offload_adapter(adapter_idx, adapter_source, adapter_index) + + # Ensure there is enough memory for the next adapter + torch.cuda.empty_cache() + torch.cuda.synchronize(self.model.device) + + return generate_pb2.OffloadAdapterResponse(offloaded=True) + except Exception: + logger.exception("Error when offloading adapter") + raise + + +def serve( + model_id: str, + adapter_id: str, + revision: Optional[str], + sharded: bool, + quantize: Optional[str], + compile: bool, + dtype: Optional[str], + trust_remote_code: bool, + uds_path: Path, + source: str, + adapter_source: str, + speculative_tokens: int, +): + async def serve_inner( + model_id: str, + adapter_id: str, + revision: Optional[str], + sharded: bool, + quantize: Optional[str], + compile: bool, + dtype: Optional[str], + trust_remote_code: bool, + speculative_tokens: int, + ): + unix_socket_template = "unix://{}-{}" + if sharded: + server_urls = [unix_socket_template.format(uds_path, rank) for rank in range(int(os.environ["WORLD_SIZE"]))] + local_url = server_urls[int(os.environ["RANK"])] + else: + local_url = unix_socket_template.format(uds_path, 0) + server_urls = [local_url] + + try: + model = get_model( + model_id, + adapter_id, + revision, + sharded, + quantize, + compile, + dtype, + trust_remote_code, + source, + adapter_source, + ) + except Exception: + logger.exception("Error when initializing model") + raise + + if quantize == "gptq": + try: + # When using GPTQ, Exllama kernels need some global kernels + # For which we have the finale shapes only after the model has loaded + # This will allocate those buffers. + from lorax_server.utils.gptq.exllamav2 import ( + create_exllama_buffers, + set_device, + ) + + set_device(model.device) + create_exllama_buffers() + except ImportError: + pass + + # set speculative decoding tokens + speculative_tokens = max(model.max_speculative_tokens, speculative_tokens) + if speculative_tokens > 0: + set_speculative_tokens(speculative_tokens) + + server = aio.server( + interceptors=[ + ExceptionInterceptor(), + UDSOpenTelemetryAioServerInterceptor(), + ] + ) + generate_pb2_grpc.add_LoraxServiceServicer_to_server(LoraxService(model, Cache(), server_urls), server) + SERVICE_NAMES = ( + generate_pb2.DESCRIPTOR.services_by_name["LoraxService"].full_name, + reflection.SERVICE_NAME, + ) + reflection.enable_server_reflection(SERVICE_NAMES, server) + server.add_insecure_port(local_url) + + await server.start() + + # Log SGMV kernel status + if has_sgmv(): + logger.info("SGMV kernel is enabled, multi-LoRA inference will be fast!") + else: + logger.info("SGMV kernel is disabled, multi-LoRA inference may be slow") + + logger.info("Server started at {}".format(local_url)) + + try: + await server.wait_for_termination() + except KeyboardInterrupt: + logger.info("Signal received. 
Shutting down") + await server.stop(0) + + asyncio.run( + serve_inner( + model_id, + adapter_id, + revision, + sharded, + quantize, + compile, + dtype, + trust_remote_code, + speculative_tokens, + ) + ) + + +def _adapter_source_enum_to_string(adapter_source: int) -> str: + # TODO(travis): refactor this to be less hacky + if adapter_source == generate_pb2.AdapterSource.HUB: + return HUB + elif adapter_source == generate_pb2.AdapterSource.S3: + return S3 + elif adapter_source == generate_pb2.AdapterSource.LOCAL: + return LOCAL + elif adapter_source == generate_pb2.AdapterSource.PBASE: + return PBASE + else: + raise ValueError(f"Unknown adapter source {adapter_source}") From 50f564a381ce36245c7679a2b139b0cf813afd97 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Thu, 13 Jun 2024 12:15:14 -0700 Subject: [PATCH 02/22] fix --- runpod/Dockerfile | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/runpod/Dockerfile b/runpod/Dockerfile index 3fee1dd51..583782a98 100644 --- a/runpod/Dockerfile +++ b/runpod/Dockerfile @@ -12,9 +12,8 @@ RUN /bin/bash /setup.sh && \ # Install Python dependencies (Worker Template) # COPY builder/requirements.txt /requirements.txt -# RUN python3 -m pip install --upgrade pip && \ -# python3 -m pip install --upgrade -r /requirements.txt --no-cache-dir && \ -# rm /requirements.txt +RUN python3 -m pip install --upgrade pip && \ + python3 -m pip install runpod # Add src files (Worker Template) ADD src . From 2c1544ec72e98e49f7c0fa639ed010ad8848ff52 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Thu, 13 Jun 2024 16:23:42 -0700 Subject: [PATCH 03/22] runpod POC --- runpod/src/handler.py | 15 +- runpod/src/server.py | 352 ------------------------------------------ 2 files changed, 5 insertions(+), 362 deletions(-) delete mode 100644 runpod/src/server.py diff --git a/runpod/src/handler.py b/runpod/src/handler.py index 39ac671ca..576f029e2 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -12,20 +12,21 @@ # Prepare global variables JOBS = set() TGI_LOCAL_PORT = int(os.environ.get('TGI_LOCAL_PORT', 8080)) - +url = "http://127.0.0.1:{}".format(TGI_LOCAL_PORT) # Create the client -client = Client("http://127.0.0.1:{}".format(TGI_LOCAL_PORT)) - +client = Client(url) +print(url) # Wait for the hugging face TGI worker to start running. while True: try: - client.generate("Why is the sky blue?").generated_text + client.generate("Why is the sky blue?", max_new_tokens=1).generated_text print("Successfully cold booted the hugging face text generation inference server!") # Break from the while loop break except Exception as e: + print(e) print("The hugging face text generation inference server is still cold booting...") time.sleep(5) @@ -71,12 +72,6 @@ def aggregate_function(streamed_outputs): text_outputs = response.token.text ret = {"text": text_outputs} - # Update the aggregate transformation function - runpod.serverless.modules.rp_metrics.metrics_collector.update_stream_aggregate( - job_id=job['id'], - aggregate_function=aggregate_function - ) - yield ret # Remove job from the set. 
diff --git a/runpod/src/server.py b/runpod/src/server.py deleted file mode 100644 index 63cc39c77..000000000 --- a/runpod/src/server.py +++ /dev/null @@ -1,352 +0,0 @@ -import asyncio -import os -from pathlib import Path -from typing import List, Optional - -import torch -from grpc import aio -from grpc_reflection.v1alpha import reflection -from loguru import logger - -from lorax_server.adapters.utils import download_adapter -from lorax_server.cache import Cache -from lorax_server.interceptor import ExceptionInterceptor -from lorax_server.models import Model, get_model -from lorax_server.pb import generate_pb2, generate_pb2_grpc -from lorax_server.tracing import UDSOpenTelemetryAioServerInterceptor -from lorax_server.utils import HUB, LOCAL, PBASE, S3, map_pbase_model_id_to_s3 -from lorax_server.utils.adapter import BASE_MODEL_ADAPTER_ID, is_base_model -from lorax_server.utils.sgmv import has_sgmv -from lorax_server.utils.state import set_speculative_tokens - - -class LoraxService(generate_pb2_grpc.LoraxServiceServicer): - """ - Implementation of the LoraxService gRPC service. - - Args: - model (Model): The model used for inference. - cache (Cache): The cache used for storing and retrieving batches. - server_urls (List[str]): List of server URLs for service discovery. - """ - - def __init__(self, model: Model, cache: Cache, server_urls: List[str]): - self.cache = cache - self.model = model - self.server_urls = server_urls - # For some reason, inference_mode does not work well with GLOO which we use on CPU - if model.device.type == "cuda": - # Force inference mode for the lifetime of LoraxService - self._inference_mode_raii_guard = torch._C._InferenceMode(True) - - async def Info(self, request, context): - return self.model.info - - async def Health(self, request, context): - if self.model.device.type == "cuda": - torch.zeros((2, 2)).cuda() - return generate_pb2.HealthResponse() - - async def ServiceDiscovery(self, request, context): - return generate_pb2.ServiceDiscoveryResponse(urls=self.server_urls) - - async def ClearCache(self, request, context): - if request.HasField("id"): - self.cache.delete(request.id) - else: - self.cache.clear() - return generate_pb2.ClearCacheResponse() - - async def FilterBatch(self, request, context): - batch = self.cache.pop(request.batch_id) - if batch is None: - raise ValueError(f"Batch ID {request.batch_id} not found in cache.") - filtered_batch = batch.filter(request.request_ids) - self.cache.set(filtered_batch) - - return generate_pb2.FilterBatchResponse(batch=filtered_batch.to_pb()) - - async def Warmup(self, request: generate_pb2.WarmupRequest, context): - batch = self.model.batch_type.from_pb( - request.batch, - self.model.tokenizer, - self.model.tokenizers, - self.model.dtype, - self.model.device, - ) - max_supported_total_tokens = self.model.warmup(batch, request.max_new_tokens) - - return generate_pb2.WarmupResponse(max_supported_total_tokens=max_supported_total_tokens) - - async def Prefill(self, request: generate_pb2.PrefillRequest, context): - batch = self.model.batch_type.from_pb( - request.batch, - self.model.tokenizer, - self.model.tokenizers, - self.model.dtype, - self.model.device, - ) - - generations, next_batch = self.model.generate_token(batch) - self.cache.set(next_batch) - - return generate_pb2.PrefillResponse( - generations=[generation.to_pb() for generation in generations], - batch=next_batch.to_pb() if next_batch else None, - ) - - async def Embed(self, request: generate_pb2.EmbedRequest, context): - if not 
self.model.supports_embeddings: - raise ValueError("Model does not support embeddings") - - batch = self.model.batch_type.from_pb( - request.batch, - self.model.tokenizer, - self.model.tokenizers, - self.model.dtype, - self.model.device, - ) - embeddings = self.model.embed(batch) - embeddings_proto = [] - for i, embedding in enumerate(embeddings): - embeddings_proto.append(generate_pb2.Embedding(request_id=batch.request_ids[i], values=embedding)) - return generate_pb2.EmbedResponse(embeddings=embeddings_proto) - - async def Decode(self, request: generate_pb2.DecodeRequest, context): - if len(request.batches) == 0: - raise ValueError("Must provide at least one batch") - - batches = [] - for batch_pb in request.batches: - batch = self.cache.pop(batch_pb.id) - if batch is None: - raise ValueError(f"Batch ID {batch_pb.id} not found in cache.") - batches.append(batch) - - if len(batches) == 0: - raise ValueError("All batches are empty") - - if len(batches) > 1: - batch = self.model.batch_type.concatenate(batches) - else: - batch = batches[0] - - generations, next_batch = self.model.generate_token(batch) - self.cache.set(next_batch) - - return generate_pb2.DecodeResponse( - generations=[generation.to_pb() for generation in generations], - batch=next_batch.to_pb() if next_batch else None, - ) - - async def DownloadAdapter(self, request: generate_pb2.DownloadAdapterRequest, context): - adapter_parameters = request.adapter_parameters - if is_base_model(adapter_parameters): - logger.info("No adapter to download for base model. Skipping.") - return generate_pb2.DownloadAdapterResponse(downloaded=False) - - adapter_bytes = 0 - api_token = request.api_token - adapter_source = _adapter_source_enum_to_string(request.adapter_source) - for adapter_id in adapter_parameters.adapter_ids: - if adapter_id == BASE_MODEL_ADAPTER_ID: - logger.info("No adapter to download for base model. Skipping.") - continue - - adapter_bytes += download_adapter(adapter_id, adapter_source, api_token) - - adapter_memory_size = self.model.adapter_memory_size() - if adapter_memory_size > 0: - logger.info( - f"Downloaded adapter {adapter_id} memory size: {adapter_bytes} bytes " - f"(reservation: {adapter_memory_size} bytes)" - ) - adapter_memory_fraction = adapter_bytes / adapter_memory_size - if adapter_memory_fraction > 1: - raise ValueError( - f"Adapter {adapter_id} is larger than adapter memory reservation: " - f"{adapter_bytes} / {adapter_memory_size} bytes" - ) - else: - # Assume 0.0 memory fraction if adapter memory size is not set - logger.info( - f"Downloaded adapter {adapter_id} memory size: {adapter_bytes} bytes " f"(no reservation limit)" - ) - adapter_memory_fraction = 0.0 - - return generate_pb2.DownloadAdapterResponse(downloaded=True, memory_fraction=adapter_memory_fraction) - - async def LoadAdapter(self, request: generate_pb2.LoadAdapterRequest, context): - adapter_parameters = request.adapter_parameters - if is_base_model(adapter_parameters): - logger.info("No adapter to load for base model. 
Skipping.") - return generate_pb2.LoadAdapterResponse(loaded=False) - - try: - adapter_source = _adapter_source_enum_to_string(request.adapter_source) - adapter_index = request.adapter_index - api_token = request.api_token - - if adapter_source == PBASE: - for i in range(len(adapter_parameters.adapter_ids)): - adapter_id = adapter_parameters.adapter_ids[i] - adapter_id = map_pbase_model_id_to_s3(adapter_id, api_token) - adapter_parameters.adapter_ids[i] = adapter_id - adapter_source = S3 - - self.model.load_adapter(adapter_parameters, adapter_source, adapter_index, api_token) - - return generate_pb2.LoadAdapterResponse(loaded=True) - except Exception: - logger.exception("Error when loading adapter") - raise - - async def OffloadAdapter(self, request: generate_pb2.OffloadAdapterRequest, context): - adapter_parameters = request.adapter_parameters - if is_base_model(adapter_parameters): - logger.info("No adapter to offload for base model. Skipping.") - return generate_pb2.OffloadAdapterResponse(offloaded=False) - - try: - adapter_idx = request.adapter_index - adapter_source = _adapter_source_enum_to_string(request.adapter_source) - adapter_index = request.adapter_index - self.model.offload_adapter(adapter_idx, adapter_source, adapter_index) - - # Ensure there is enough memory for the next adapter - torch.cuda.empty_cache() - torch.cuda.synchronize(self.model.device) - - return generate_pb2.OffloadAdapterResponse(offloaded=True) - except Exception: - logger.exception("Error when offloading adapter") - raise - - -def serve( - model_id: str, - adapter_id: str, - revision: Optional[str], - sharded: bool, - quantize: Optional[str], - compile: bool, - dtype: Optional[str], - trust_remote_code: bool, - uds_path: Path, - source: str, - adapter_source: str, - speculative_tokens: int, -): - async def serve_inner( - model_id: str, - adapter_id: str, - revision: Optional[str], - sharded: bool, - quantize: Optional[str], - compile: bool, - dtype: Optional[str], - trust_remote_code: bool, - speculative_tokens: int, - ): - unix_socket_template = "unix://{}-{}" - if sharded: - server_urls = [unix_socket_template.format(uds_path, rank) for rank in range(int(os.environ["WORLD_SIZE"]))] - local_url = server_urls[int(os.environ["RANK"])] - else: - local_url = unix_socket_template.format(uds_path, 0) - server_urls = [local_url] - - try: - model = get_model( - model_id, - adapter_id, - revision, - sharded, - quantize, - compile, - dtype, - trust_remote_code, - source, - adapter_source, - ) - except Exception: - logger.exception("Error when initializing model") - raise - - if quantize == "gptq": - try: - # When using GPTQ, Exllama kernels need some global kernels - # For which we have the finale shapes only after the model has loaded - # This will allocate those buffers. 
- from lorax_server.utils.gptq.exllamav2 import ( - create_exllama_buffers, - set_device, - ) - - set_device(model.device) - create_exllama_buffers() - except ImportError: - pass - - # set speculative decoding tokens - speculative_tokens = max(model.max_speculative_tokens, speculative_tokens) - if speculative_tokens > 0: - set_speculative_tokens(speculative_tokens) - - server = aio.server( - interceptors=[ - ExceptionInterceptor(), - UDSOpenTelemetryAioServerInterceptor(), - ] - ) - generate_pb2_grpc.add_LoraxServiceServicer_to_server(LoraxService(model, Cache(), server_urls), server) - SERVICE_NAMES = ( - generate_pb2.DESCRIPTOR.services_by_name["LoraxService"].full_name, - reflection.SERVICE_NAME, - ) - reflection.enable_server_reflection(SERVICE_NAMES, server) - server.add_insecure_port(local_url) - - await server.start() - - # Log SGMV kernel status - if has_sgmv(): - logger.info("SGMV kernel is enabled, multi-LoRA inference will be fast!") - else: - logger.info("SGMV kernel is disabled, multi-LoRA inference may be slow") - - logger.info("Server started at {}".format(local_url)) - - try: - await server.wait_for_termination() - except KeyboardInterrupt: - logger.info("Signal received. Shutting down") - await server.stop(0) - - asyncio.run( - serve_inner( - model_id, - adapter_id, - revision, - sharded, - quantize, - compile, - dtype, - trust_remote_code, - speculative_tokens, - ) - ) - - -def _adapter_source_enum_to_string(adapter_source: int) -> str: - # TODO(travis): refactor this to be less hacky - if adapter_source == generate_pb2.AdapterSource.HUB: - return HUB - elif adapter_source == generate_pb2.AdapterSource.S3: - return S3 - elif adapter_source == generate_pb2.AdapterSource.LOCAL: - return LOCAL - elif adapter_source == generate_pb2.AdapterSource.PBASE: - return PBASE - else: - raise ValueError(f"Unknown adapter source {adapter_source}") From 8c9d3f1664618631433acb46ed2927a57a90d5db Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 10:58:10 -0700 Subject: [PATCH 04/22] test --- .github/workflows/build.yaml | 97 ++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 116120764..966d10e6d 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -152,3 +152,100 @@ jobs: # Delete the SHA image(s) from containerd store sudo ctr i rm $(sudo ctr i ls -q) +#### new build test + - name: Docker meta + id: meta1 + uses: docker/metadata-action@v5 + with: + images: | + ghcr.io/predibase/lorax-runpod + tags: | + type=semver,pattern={{version}} + type=semver,pattern={{major}}.{{minor}} + type=sha,prefix=,suffix=,format=short + type=raw,value=main,enable=${{ github.ref == 'refs/heads/main' }} + + - name: Create a hash from tags + env: + tags: ${{ steps.meta1.outputs.tags }} + id: vars1 + run: | + tag_hash=$(echo -n "$tags" | md5sum | awk '{print $1}') + echo "tag_hash=$tag_hash" >> $GITHUB_OUTPUT + echo "cache_dir=/runner/build/images/cache" >> $GITHUB_OUTPUT + echo "image_dir=/runner/build/images" >> $GITHUB_OUTPUT + echo "image_path=/runner/build/images/lorax" >> $GITHUB_OUTPUT + + - name: Create and update image/cache directory + env: + image_dir: ${{ steps.vars1.outputs.image_dir }} + cache_dir: ${{ steps.vars1.outputs.cache_dir }} + run: | + sudo mkdir -p $image_dir + sudo chown ubuntu:ubuntu $image_dir + + sudo mkdir -p $cache_dir + sudo chown ubuntu:ubuntu $cache_dir + + - name: Export Docker image as OCI + uses: docker/build-push-action@v5 + with: + 
context: . + file: ./runpod/Dockerfile # Path to your Dockerfile + push: false + tags: ${{ steps.meta1.outputs.tags }} + outputs: type=oci,compression=gzip,dest=${{ steps.vars1.outputs.image_path }}-${{ steps.vars1.outputs.tag_hash }}.tar.gz + cache-from: type=local,src=${{ steps.vars1.outputs.cache_dir }} + cache-to: type=local,mode=max,image-manifest=true,oci-mediatypes=true,dest=${{ steps.vars1.outputs.cache_dir }} + + - name: Import image in containerd + env: + tag_hash: ${{ steps.vars1.outputs.tag_hash }} + image_path: ${{ steps.vars1.outputs.image_path }} + run: | + echo "Importing $image_path-$tag_hash to Containerd" + sudo ctr i import --no-unpack --all-platforms --digests $image_path-$tag_hash.tar.gz + + - name: Log in to GitHub Container Registry + uses: docker/login-action@v1 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GHCR_PAT }} + + - name: Push image with containerd + env: + tags: ${{ steps.meta1.outputs.tags }} + run: | + for tag in $tags + do + echo "Pushing $tag to GHCR" + sudo ctr i push --user "${{ github.repository_owner }}:${{ secrets.GHCR_PAT }}" $tag + done + + - name: Create and push soci index + env: + tags: ${{ steps.meta1.outputs.tags }} + run: | + export SOCI_PATH=$HOME/.soci/soci + for tag in $tags + do + echo "Creating soci index for $tag" + sudo $SOCI_PATH create $tag + echo "Pushing soci index for $tag" + sudo $SOCI_PATH push --user ${{ github.repository_owner }}:${{ secrets.GHCR_PAT }} $tag + done + + - name: Prune older images + env: + tag_hash: ${{ steps.vars1.outputs.tag_hash }} + image_path: ${{ steps.vars1.outputs.image_path }} + run: | + # Delete images older than a day from docker store + docker image prune -a -f --filter "until=24h" + + # Delete the on disk copy + rm -rf "$image_path-$tag_hash.tar.gz" + + # Delete the SHA image(s) from containerd store + sudo ctr i rm $(sudo ctr i ls -q) From b6df46679783d53513618c81ef5d63cae47d93f9 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 11:14:41 -0700 Subject: [PATCH 05/22] fix context --- .github/workflows/build.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 966d10e6d..0055a1d3e 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -190,7 +190,7 @@ jobs: - name: Export Docker image as OCI uses: docker/build-push-action@v5 with: - context: . 
+ context: ./runpod file: ./runpod/Dockerfile # Path to your Dockerfile push: false tags: ${{ steps.meta1.outputs.tags }} From 82b7f5d19efef6f7186d640a30071ddf010b00d8 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 11:34:02 -0700 Subject: [PATCH 06/22] updates --- runpod/Dockerfile | 2 +- runpod/src/handler.py | 39 ++++++++++----------------------------- 2 files changed, 11 insertions(+), 30 deletions(-) diff --git a/runpod/Dockerfile b/runpod/Dockerfile index 583782a98..f662516cc 100644 --- a/runpod/Dockerfile +++ b/runpod/Dockerfile @@ -1,4 +1,5 @@ # Base image +# TODO change the lorax base image FROM ghcr.io/predibase/lorax:0.10.0 ENV DEBIAN_FRONTEND=noninteractive @@ -68,7 +69,6 @@ RUN if [ "$DOWNLOAD_MODEL" = "1" ]; then \ # Quick temporary updates RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 -RUN python3.10 -m pip install git+https://github.com/runpod/runpod-python@a1#egg=runpod --compile RUN python3.10 -m pip install lorax-client ENTRYPOINT ["./entrypoint.sh"] diff --git a/runpod/src/handler.py b/runpod/src/handler.py index 576f029e2..e17d20cd0 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -31,8 +31,8 @@ time.sleep(5) def concurrency_controller() -> bool: - # Handle at most 100 jobs at a time. - return len(JOBS) > 20 + # Handle at most 1024 jobs at a time. + return len(JOBS) > 1024 async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None]: ''' @@ -41,38 +41,19 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] # Get job input job_input = job['input'] - # Prompts - prompt = job_input['prompt'] - - # Validate the inputs - sampling_params = job_input.get('sampling_params', {}) - + # TODO get stream yes/no and call the client based on that...? + # TODO get the auth token or whatever + # TODO figure out how to do auth here - maybe we start it with a secret + # and in istio-land we inject the correct secret in requests + # if the user is auth'ed properly for the resource? + # TODO handle key timeouts # Add job to the set. JOBS.add(job['id']) - # Include metrics in the highest level for the job output for aggregrate. - def aggregate_function(streamed_outputs): - aggregate_output = "" - for stream in streamed_outputs: - aggregate_output += stream['text'] - - # Aggregate metrics to expose to the user - # input_tokens = -1 # TBD - # output_tokens = -1 # TBD - - return { - "text": aggregate_output, - # "input_tokens": input_tokens, - # "output_tokens": output_tokens, - } - # Streaming case - for response in client.generate_stream(prompt, **sampling_params): + for response in client.generate_stream(**job_input): if not response.token.special: - text_outputs = response.token.text - ret = {"text": text_outputs} - - yield ret + yield response # Remove job from the set. 
JOBS.remove(job['id']) From bfb3e76bf8844487a4b876244fdcefb3b2e0c304 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 11:56:53 -0700 Subject: [PATCH 07/22] test runpod specific tag --- .github/workflows/build.yaml | 192 +++++++++++++++++------------------ 1 file changed, 96 insertions(+), 96 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 0055a1d3e..21285a158 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -55,102 +55,102 @@ jobs: # persistent data location root = "/runner/build/containerd" - - name: Docker meta - id: meta - uses: docker/metadata-action@v5 - with: - images: | - ghcr.io/predibase/lorax - tags: | - type=semver,pattern={{version}} - type=semver,pattern={{major}}.{{minor}} - type=sha,prefix=,suffix=,format=short - type=raw,value=main,enable=${{ github.ref == 'refs/heads/main' }} + # - name: Docker meta + # id: meta + # uses: docker/metadata-action@v5 + # with: + # images: | + # ghcr.io/predibase/lorax + # tags: | + # type=semver,pattern={{version}} + # type=semver,pattern={{major}}.{{minor}} + # type=sha,prefix=,suffix=,format=short + # type=raw,value=main,enable=${{ github.ref == 'refs/heads/main' }} - - name: Create a hash from tags - env: - tags: ${{ steps.meta.outputs.tags }} - id: vars - run: | - tag_hash=$(echo -n "$tags" | md5sum | awk '{print $1}') - echo "tag_hash=$tag_hash" >> $GITHUB_OUTPUT - echo "cache_dir=/runner/build/images/cache" >> $GITHUB_OUTPUT - echo "image_dir=/runner/build/images" >> $GITHUB_OUTPUT - echo "image_path=/runner/build/images/lorax" >> $GITHUB_OUTPUT - - - name: Create and update image/cache directory - env: - image_dir: ${{ steps.vars.outputs.image_dir }} - cache_dir: ${{ steps.vars.outputs.cache_dir }} - run: | - sudo mkdir -p $image_dir - sudo chown ubuntu:ubuntu $image_dir - - sudo mkdir -p $cache_dir - sudo chown ubuntu:ubuntu $cache_dir - - - name: Export Docker image as OCI - uses: docker/build-push-action@v5 - with: - context: . 
- file: ./Dockerfile # Path to your Dockerfile - push: false - tags: ${{ steps.meta.outputs.tags }} - outputs: type=oci,compression=gzip,dest=${{ steps.vars.outputs.image_path }}-${{ steps.vars.outputs.tag_hash }}.tar.gz - cache-from: type=local,src=${{ steps.vars.outputs.cache_dir }} - cache-to: type=local,mode=max,image-manifest=true,oci-mediatypes=true,dest=${{ steps.vars.outputs.cache_dir }} - - - name: Import image in containerd - env: - tag_hash: ${{ steps.vars.outputs.tag_hash }} - image_path: ${{ steps.vars.outputs.image_path }} - run: | - echo "Importing $image_path-$tag_hash to Containerd" - sudo ctr i import --no-unpack --all-platforms --digests $image_path-$tag_hash.tar.gz - - - name: Log in to GitHub Container Registry - uses: docker/login-action@v1 - with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GHCR_PAT }} - - - name: Push image with containerd - env: - tags: ${{ steps.meta.outputs.tags }} - run: | - for tag in $tags - do - echo "Pushing $tag to GHCR" - sudo ctr i push --user "${{ github.repository_owner }}:${{ secrets.GHCR_PAT }}" $tag - done + # - name: Create a hash from tags + # env: + # tags: ${{ steps.meta.outputs.tags }} + # id: vars + # run: | + # tag_hash=$(echo -n "$tags" | md5sum | awk '{print $1}') + # echo "tag_hash=$tag_hash" >> $GITHUB_OUTPUT + # echo "cache_dir=/runner/build/images/cache" >> $GITHUB_OUTPUT + # echo "image_dir=/runner/build/images" >> $GITHUB_OUTPUT + # echo "image_path=/runner/build/images/lorax" >> $GITHUB_OUTPUT + + # - name: Create and update image/cache directory + # env: + # image_dir: ${{ steps.vars.outputs.image_dir }} + # cache_dir: ${{ steps.vars.outputs.cache_dir }} + # run: | + # sudo mkdir -p $image_dir + # sudo chown ubuntu:ubuntu $image_dir + + # sudo mkdir -p $cache_dir + # sudo chown ubuntu:ubuntu $cache_dir + + # - name: Export Docker image as OCI + # uses: docker/build-push-action@v5 + # with: + # context: . 
+ # file: ./Dockerfile # Path to your Dockerfile + # push: false + # tags: ${{ steps.meta.outputs.tags }} + # outputs: type=oci,compression=gzip,dest=${{ steps.vars.outputs.image_path }}-${{ steps.vars.outputs.tag_hash }}.tar.gz + # cache-from: type=local,src=${{ steps.vars.outputs.cache_dir }} + # cache-to: type=local,mode=max,image-manifest=true,oci-mediatypes=true,dest=${{ steps.vars.outputs.cache_dir }} + + # - name: Import image in containerd + # env: + # tag_hash: ${{ steps.vars.outputs.tag_hash }} + # image_path: ${{ steps.vars.outputs.image_path }} + # run: | + # echo "Importing $image_path-$tag_hash to Containerd" + # sudo ctr i import --no-unpack --all-platforms --digests $image_path-$tag_hash.tar.gz + + # - name: Log in to GitHub Container Registry + # uses: docker/login-action@v1 + # with: + # registry: ghcr.io + # username: ${{ github.repository_owner }} + # password: ${{ secrets.GHCR_PAT }} + + # - name: Push image with containerd + # env: + # tags: ${{ steps.meta.outputs.tags }} + # run: | + # for tag in $tags + # do + # echo "Pushing $tag to GHCR" + # sudo ctr i push --user "${{ github.repository_owner }}:${{ secrets.GHCR_PAT }}" $tag + # done - - name: Create and push soci index - env: - tags: ${{ steps.meta.outputs.tags }} - run: | - export SOCI_PATH=$HOME/.soci/soci - for tag in $tags - do - echo "Creating soci index for $tag" - sudo $SOCI_PATH create $tag - echo "Pushing soci index for $tag" - sudo $SOCI_PATH push --user ${{ github.repository_owner }}:${{ secrets.GHCR_PAT }} $tag - done - - - name: Prune older images - env: - tag_hash: ${{ steps.vars.outputs.tag_hash }} - image_path: ${{ steps.vars.outputs.image_path }} - run: | - # Delete images older than a day from docker store - docker image prune -a -f --filter "until=24h" - - # Delete the on disk copy - rm -rf "$image_path-$tag_hash.tar.gz" - - # Delete the SHA image(s) from containerd store - sudo ctr i rm $(sudo ctr i ls -q) + # - name: Create and push soci index + # env: + # tags: ${{ steps.meta.outputs.tags }} + # run: | + # export SOCI_PATH=$HOME/.soci/soci + # for tag in $tags + # do + # echo "Creating soci index for $tag" + # sudo $SOCI_PATH create $tag + # echo "Pushing soci index for $tag" + # sudo $SOCI_PATH push --user ${{ github.repository_owner }}:${{ secrets.GHCR_PAT }} $tag + # done + + # - name: Prune older images + # env: + # tag_hash: ${{ steps.vars.outputs.tag_hash }} + # image_path: ${{ steps.vars.outputs.image_path }} + # run: | + # # Delete images older than a day from docker store + # docker image prune -a -f --filter "until=24h" + + # # Delete the on disk copy + # rm -rf "$image_path-$tag_hash.tar.gz" + + # # Delete the SHA image(s) from containerd store + # sudo ctr i rm $(sudo ctr i ls -q) #### new build test - name: Docker meta @@ -158,7 +158,7 @@ jobs: uses: docker/metadata-action@v5 with: images: | - ghcr.io/predibase/lorax-runpod + ghcr.io/predibase/lorax tags: | type=semver,pattern={{version}} type=semver,pattern={{major}}.{{minor}} @@ -170,7 +170,7 @@ jobs: tags: ${{ steps.meta1.outputs.tags }} id: vars1 run: | - tag_hash=$(echo -n "$tags" | md5sum | awk '{print $1}') + tag_hash=runpod-$(echo -n "$tags" | md5sum | awk '{print $1}') echo "tag_hash=$tag_hash" >> $GITHUB_OUTPUT echo "cache_dir=/runner/build/images/cache" >> $GITHUB_OUTPUT echo "image_dir=/runner/build/images" >> $GITHUB_OUTPUT From cb4b39a8fc71f90d64ff99ebfc13d5c4b22dad3c Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 12:05:00 -0700 Subject: [PATCH 08/22] add prefix --- .github/workflows/build.yaml 
| 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 21285a158..3d3c89102 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -162,7 +162,7 @@ jobs: tags: | type=semver,pattern={{version}} type=semver,pattern={{major}}.{{minor}} - type=sha,prefix=,suffix=,format=short + type=sha,prefix=runpod,suffix=,format=short type=raw,value=main,enable=${{ github.ref == 'refs/heads/main' }} - name: Create a hash from tags @@ -170,7 +170,7 @@ jobs: tags: ${{ steps.meta1.outputs.tags }} id: vars1 run: | - tag_hash=runpod-$(echo -n "$tags" | md5sum | awk '{print $1}') + tag_hash=$(echo -n "$tags" | md5sum | awk '{print $1}') echo "tag_hash=$tag_hash" >> $GITHUB_OUTPUT echo "cache_dir=/runner/build/images/cache" >> $GITHUB_OUTPUT echo "image_dir=/runner/build/images" >> $GITHUB_OUTPUT From 680378910d053934e69bed76a0b5080069cd4837 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 12:20:01 -0700 Subject: [PATCH 09/22] chmod entrypoint --- runpod/src/entrypoint.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 runpod/src/entrypoint.sh diff --git a/runpod/src/entrypoint.sh b/runpod/src/entrypoint.sh old mode 100644 new mode 100755 From 2f72eaafecceb43a5452ea7fb3cc3784462d8bd8 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 14:30:16 -0700 Subject: [PATCH 10/22] dump the output into a serializable dict --- .github/workflows/build.yaml | 2 +- runpod/src/handler.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 3d3c89102..8364dd829 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -162,7 +162,7 @@ jobs: tags: | type=semver,pattern={{version}} type=semver,pattern={{major}}.{{minor}} - type=sha,prefix=runpod,suffix=,format=short + type=sha,prefix=runpod-,suffix=,format=short type=raw,value=main,enable=${{ github.ref == 'refs/heads/main' }} - name: Create a hash from tags diff --git a/runpod/src/handler.py b/runpod/src/handler.py index e17d20cd0..652e1c775 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -53,7 +53,8 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] # Streaming case for response in client.generate_stream(**job_input): if not response.token.special: - yield response + # Dump the repsonse into a dictionary + yield response.model_dump() # Remove job from the set. JOBS.remove(job['id']) From dd6c80ea87a9e999d652df6216beac011d64a8b0 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 15:54:55 -0700 Subject: [PATCH 11/22] no stream by default? 
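The handler now branches on a `_stream` flag in the job input rather than always streaming. As a rough sketch of the two request shapes this expects (assuming the standard RunPod serverless payload wrapper already used in handler.py, where everything under "input" is forwarded as lorax Client kwargs once `_stream` is stripped):

    # Illustrative payloads only; fields besides `_stream` are just
    # lorax Client.generate() / generate_stream() kwargs.
    job_stream = {"id": "job-1", "input": {"prompt": "Why is the sky blue?",
                                           "max_new_tokens": 64, "_stream": True}}
    # Non-streaming path: the handler still deletes `_stream`, then calls
    # client.generate(**job_input) and yields a single dict.
    job_sync = {"id": "job-2", "input": {"prompt": "Why is the sky blue?",
                                         "max_new_tokens": 64, "_stream": False}}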
--- runpod/src/entrypoint.sh | 2 +- runpod/src/handler.py | 20 +++++++++++++++----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/runpod/src/entrypoint.sh b/runpod/src/entrypoint.sh index 09e69a875..caadce279 100755 --- a/runpod/src/entrypoint.sh +++ b/runpod/src/entrypoint.sh @@ -39,7 +39,7 @@ if [[ -n "${HF_MAX_BATCH_PREFILL_TOKENS}" ]]; then fi # Start the text generation server -nohup lorax-launcher --port 8080 --model-id predibase/Mistral-7B-v0.1-dequantized --adapter-source hub --default-adapter-source pbase --max-batch-prefill-tokens 32768 --max-total-tokens 8192 --max-input-length 8191 --max-concurrent-requests 1024 & +nohup lorax-launcher --port 8080 --model-id predibase/Meta-Llama-3-8B-Instruct-dequantized --adapter-source hub --default-adapter-source pbase --max-batch-prefill-tokens 32768 --max-total-tokens 8192 --max-input-length 8191 --max-concurrent-requests 1024 & # Start the handler using python 3.10 python3.10 -u /handler.py diff --git a/runpod/src/handler.py b/runpod/src/handler.py index 652e1c775..c74886abc 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -40,7 +40,11 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] ''' # Get job input job_input = job['input'] + + # When we are called with a streaming endpoint, then we should have the field + # _stream = True + # TODO handle the two openAI compatable endpoints as well...! # TODO get stream yes/no and call the client based on that...? # TODO get the auth token or whatever # TODO figure out how to do auth here - maybe we start it with a secret @@ -49,12 +53,18 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] # TODO handle key timeouts # Add job to the set. JOBS.add(job['id']) + if job_input.get('_stream', False): + del job_input['_stream'] + # Streaming case + for response in client.generate_stream(**job_input): + if not response.token.special: + # Dump the repsonse into a dictionary + yield response.model_dump() + else: + del job_input['_stream'] + response = client.generate(**job_input) + yield response.model_dump() - # Streaming case - for response in client.generate_stream(**job_input): - if not response.token.special: - # Dump the repsonse into a dictionary - yield response.model_dump() # Remove job from the set. JOBS.remove(job['id']) From c670df885f703eb2ac2b645bb565ab1f692d9967 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 19:29:51 -0700 Subject: [PATCH 12/22] try out openai --- runpod/Dockerfile | 1 + runpod/src/handler.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/runpod/Dockerfile b/runpod/Dockerfile index f662516cc..c037aca40 100644 --- a/runpod/Dockerfile +++ b/runpod/Dockerfile @@ -70,5 +70,6 @@ RUN if [ "$DOWNLOAD_MODEL" = "1" ]; then \ # Quick temporary updates RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 RUN python3.10 -m pip install lorax-client +RUN python3.10 -m pip install openai ENTRYPOINT ["./entrypoint.sh"] diff --git a/runpod/src/handler.py b/runpod/src/handler.py index c74886abc..534682659 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -9,12 +9,16 @@ # For download the weights from lorax import Client +import openai + # Prepare global variables JOBS = set() TGI_LOCAL_PORT = int(os.environ.get('TGI_LOCAL_PORT', 8080)) url = "http://127.0.0.1:{}".format(TGI_LOCAL_PORT) # Create the client client = Client(url) + + print(url) # Wait for the hugging face TGI worker to start running. 
while True: @@ -40,6 +44,19 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] ''' # Get job input job_input = job['input'] + # TODO do different things based on the openai_route. Right now, just assume we are calling the openai + # chat completions.generate method! + use_openai = 'openai_route' in job_input + + # Create a new client and pass the token for every handler call + openai_client = openai.Openai( + base_url=f"{url}/v1", + ) + + if use_openai: + if job_input['stream'] == False: + yield openai_client.chat.completions.create(**job_input) + # When we are called with a streaming endpoint, then we should have the field # _stream = True From 57c794d6436fe0cdc5a837b503b1c64ba5d86ee7 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 19:35:15 -0700 Subject: [PATCH 13/22] fix model dump thing --- runpod/src/handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runpod/src/handler.py b/runpod/src/handler.py index 534682659..581fc3d03 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -55,7 +55,7 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] if use_openai: if job_input['stream'] == False: - yield openai_client.chat.completions.create(**job_input) + yield openai_client.chat.completions.create(**job_input).model_dump() # When we are called with a streaming endpoint, then we should have the field From 743c956c3795170b1098a3078a9b084cdfc7e0b8 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 19:50:16 -0700 Subject: [PATCH 14/22] WIP --- runpod/src/handler.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/runpod/src/handler.py b/runpod/src/handler.py index 581fc3d03..d418b00a5 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -46,16 +46,19 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] job_input = job['input'] # TODO do different things based on the openai_route. Right now, just assume we are calling the openai # chat completions.generate method! 
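+    # (lorax exposes an OpenAI-compatible REST API under /v1 on the same local port,
+    # which is presumably why the stock `openai` client can be pointed at
+    # base_url=f"{url}/v1" below and handed job_input["openai_input"] unchanged.)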
+ print(job_input) + print("first print :P") use_openai = 'openai_route' in job_input # Create a new client and pass the token for every handler call openai_client = openai.Openai( base_url=f"{url}/v1", ) - + print(use_openai) if use_openai: - if job_input['stream'] == False: - yield openai_client.chat.completions.create(**job_input).model_dump() + # if job_input['stream'] == False: + print(job_input) + yield openai_client.chat.completions.create(**job_input).model_dump() # When we are called with a streaming endpoint, then we should have the field From 8f5ea6fbd519607c80c1cd9580849cf3eb5d128b Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 20:01:13 -0700 Subject: [PATCH 15/22] fix --- runpod/src/handler.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runpod/src/handler.py b/runpod/src/handler.py index d418b00a5..616bc5305 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -51,14 +51,14 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] use_openai = 'openai_route' in job_input # Create a new client and pass the token for every handler call - openai_client = openai.Openai( + openai_client = openai.OpenAI( base_url=f"{url}/v1", ) print(use_openai) if use_openai: # if job_input['stream'] == False: print(job_input) - yield openai_client.chat.completions.create(**job_input).model_dump() + yield openai_client.chat.completions.create(**job_input["openai_input"]).model_dump() # When we are called with a streaming endpoint, then we should have the field From b196a975c71744906cb8d39eaa9a08ca9359797b Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Fri, 14 Jun 2024 20:36:40 -0700 Subject: [PATCH 16/22] add fake api key? --- runpod/src/handler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runpod/src/handler.py b/runpod/src/handler.py index 616bc5305..230577d87 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -53,6 +53,7 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] # Create a new client and pass the token for every handler call openai_client = openai.OpenAI( base_url=f"{url}/v1", + api_key="fake" ) print(use_openai) if use_openai: @@ -60,7 +61,6 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] print(job_input) yield openai_client.chat.completions.create(**job_input["openai_input"]).model_dump() - # When we are called with a streaming endpoint, then we should have the field # _stream = True From 14a312aae65abb99e2366bba5607ffb8e8b7221a Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Mon, 17 Jun 2024 10:20:57 -0700 Subject: [PATCH 17/22] fix logik --- runpod/src/handler.py | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/runpod/src/handler.py b/runpod/src/handler.py index 230577d87..111929b62 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -55,12 +55,26 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] base_url=f"{url}/v1", api_key="fake" ) + JOBS.add(job['id']) + print(use_openai) if use_openai: # if job_input['stream'] == False: print(job_input) - yield openai_client.chat.completions.create(**job_input["openai_input"]).model_dump() - + result = openai_client.chat.completions.create(**job_input["openai_input"]).model_dump() + yield result + else: + if job_input.get('_stream', False): + del job_input['_stream'] + # Streaming case + for response in client.generate_stream(**job_input): + if not 
response.token.special: + # Dump the repsonse into a dictionary + yield response.model_dump() + else: + del job_input['_stream'] + response = client.generate(**job_input) + yield response.model_dump() # When we are called with a streaming endpoint, then we should have the field # _stream = True @@ -72,20 +86,7 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] # if the user is auth'ed properly for the resource? # TODO handle key timeouts # Add job to the set. - JOBS.add(job['id']) - if job_input.get('_stream', False): - del job_input['_stream'] - # Streaming case - for response in client.generate_stream(**job_input): - if not response.token.special: - # Dump the repsonse into a dictionary - yield response.model_dump() - else: - del job_input['_stream'] - response = client.generate(**job_input) - yield response.model_dump() - - + # Remove job from the set. JOBS.remove(job['id']) From bd4312b42ad3ba4d9e747f6afe6f95746170cea4 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Mon, 17 Jun 2024 10:45:49 -0700 Subject: [PATCH 18/22] add in predibase API token as env var --- runpod/src/handler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/runpod/src/handler.py b/runpod/src/handler.py index 111929b62..4b9c9b7d4 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -17,6 +17,7 @@ url = "http://127.0.0.1:{}".format(TGI_LOCAL_PORT) # Create the client client = Client(url) +api_key = os.environ.get("PREDIBASE_API_KEY", "fake") print(url) @@ -53,7 +54,7 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] # Create a new client and pass the token for every handler call openai_client = openai.OpenAI( base_url=f"{url}/v1", - api_key="fake" + api_key=api_key ) JOBS.add(job['id']) From d3952d3fe1af633253292d40d0a9ca05f8a1f6d8 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Tue, 18 Jun 2024 13:01:02 -0700 Subject: [PATCH 19/22] try s3 crt? --- server/lorax_server/utils/sources/s3.py | 5 +- server/poetry.lock | 87 ++++++++++++++++++++----- server/pyproject.toml | 3 +- 3 files changed, 77 insertions(+), 18 deletions(-) diff --git a/server/lorax_server/utils/sources/s3.py b/server/lorax_server/utils/sources/s3.py index 085fbaeef..f6558f30d 100644 --- a/server/lorax_server/utils/sources/s3.py +++ b/server/lorax_server/utils/sources/s3.py @@ -13,6 +13,7 @@ LocalEntryNotFoundError, ) from loguru import logger +from boto3.s3.transfer import S3Transfer from .source import BaseModelSource, try_to_load_from_cache @@ -112,7 +113,9 @@ def download_file(filename): model_id_path = Path(model_id) bucket_file_name = model_id_path / filename logger.info(f"Downloading file {bucket_file_name} to {local_file_path}") - bucket.download_file(str(bucket_file_name), str(local_file_path)) + # use CRT? TODO change this? 
+ transfer = S3Transfer(boto3.client('s3', region_name="us-west-2")) + transfer.download_file(bucket.name, str(bucket_file_name), str(local_file_path)) # TODO: add support for revision logger.info(f"Downloaded {local_file_path} in {timedelta(seconds=int(time.time() - start_time))}.") if not local_file_path.is_file(): diff --git a/server/poetry.lock b/server/poetry.lock index 68649649a..9bcafe7ba 100644 --- a/server/poetry.lock +++ b/server/poetry.lock @@ -179,6 +179,57 @@ docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib- tests = ["attrs[tests-no-zope]", "zope-interface"] tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +[[package]] +name = "awscrt" +version = "0.20.11" +description = "A common runtime for AWS Python projects" +optional = false +python-versions = ">=3.7" +files = [ + {file = "awscrt-0.20.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3b50f70d85ecc2069029573bad8e5e06b9aabad283dd933bee6eb9dd694b9511"}, + {file = "awscrt-0.20.11-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3b4ec0b471cf7d6a7a0950553ddf97d58a0caf4a8350da9ca12250c7df6add94"}, + {file = "awscrt-0.20.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6b91cac82abf9718657e0694f90334e4ef4b2ef32061938ff0ceed67e302469"}, + {file = "awscrt-0.20.11-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0261ef47f5000d5ce069dec05edf9d803a3ff89c02bd574ec0585e2e4447aca6"}, + {file = "awscrt-0.20.11-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:81167651ccd45af55fa659a09b415eba881a9892415e465b6432a4f336311711"}, + {file = "awscrt-0.20.11-cp310-cp310-win32.whl", hash = "sha256:fb316c27110a19917a45dc7b678349bc329c98ac1b95d5bd872f0ad37300e725"}, + {file = "awscrt-0.20.11-cp310-cp310-win_amd64.whl", hash = "sha256:ae4910e1f534e0d5bb8bade0ce2b1908bfd36007115ac0a700b9cda5c5655f0c"}, + {file = "awscrt-0.20.11-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:86554f8042dea649b7d63a2e4de593864753aad736a7ca592e72b2f8a94535bb"}, + {file = "awscrt-0.20.11-cp311-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45db07c2f0f7c83d8a4cb91a51869b22f1f44c1053db7266486733aca2d2ac41"}, + {file = "awscrt-0.20.11-cp311-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c94917cce1df62fc40f53e19f5dcfbd036acfbdb1a88cba217ad6caaeab0d57"}, + {file = "awscrt-0.20.11-cp311-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:624322e103e62bffecf97731691e05ef0d7a50970d8e3b1872433dcf00c5595a"}, + {file = "awscrt-0.20.11-cp311-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:f10af50b747c2b237836ab1ed57dc1be0c2553e0fb485374f0d3be470a861e4a"}, + {file = "awscrt-0.20.11-cp311-abi3-win32.whl", hash = "sha256:fc7a8eecfc51503afd24764033a2061a5f39017ed6e825b6594490e04fd56297"}, + {file = "awscrt-0.20.11-cp311-abi3-win_amd64.whl", hash = "sha256:106ff16bce775917d4e9a8c93649b4f272c32a91336ae6ca97596dcb2faf2d44"}, + {file = "awscrt-0.20.11-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:5a7ba2227546522d5767308ff49876fbc0abd1771376710ce2cf4dd8b317b2b9"}, + {file = "awscrt-0.20.11-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82acde62286c7a1d7991b5bf92f192603ea9b3752b3bf28dae75300c05de1119"}, + {file = "awscrt-0.20.11-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:583bff89758f0d2cd9540c2c9b301836df21b71548f0fabfdff7fb484c960bf0"}, + {file = 
"awscrt-0.20.11-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6afdee4b204592eba1c75797407be976e9097682d27de6b0ec0c696ec0851758"}, + {file = "awscrt-0.20.11-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:0101be8b6b40e252eaead36eb1c4c87d53f6f0cd54d40e1ef571f984a36efb79"}, + {file = "awscrt-0.20.11-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d6d984eea9687555ca9d269ebbba8f090e1b7feab6f61d1b046548cd469cb2ab"}, + {file = "awscrt-0.20.11-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:bb634f7fbf02b5aee95d619ec3066c7f7a4d7cd6a156203115bdf8cbc715c4f3"}, + {file = "awscrt-0.20.11-cp37-cp37m-win32.whl", hash = "sha256:8fbae85b2d5106dd470b349314b3bcceb8812904675c98a1dbd2fe1efe92eb35"}, + {file = "awscrt-0.20.11-cp37-cp37m-win_amd64.whl", hash = "sha256:56cc06725038d625365f9bdebd4b9e3c9f876ead1a26473cb124c6dfa4b39fd1"}, + {file = "awscrt-0.20.11-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:58c4616343b9f4d6fc454816fb3459ac86489a242ade3c8126ec9d1aa8208ec0"}, + {file = "awscrt-0.20.11-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e323a779b8db858b3412a727c90dc9c058898fd7eb5e0f454ca94623007aa078"}, + {file = "awscrt-0.20.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e28303eb399d4fa3f5df79cca277d4ae434112590c4c9b60a21c397c6ad9da6"}, + {file = "awscrt-0.20.11-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:73a8a255d60cdb96bd6a93bf606055c918dc88cd9cca57be860efc113cb256b0"}, + {file = "awscrt-0.20.11-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:07592b2f9dffcd12745da06c10bc1de1e9f38ebc8996b98689bfaa860d600382"}, + {file = "awscrt-0.20.11-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b022a21e8bda0d3403e6115ff15652f65dc7250b0d0c1b3125c2c3e095647940"}, + {file = "awscrt-0.20.11-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:a28f8d8e6b95a0007d80d07763f21dc2cea73b35586559d5b0635c2d06347efc"}, + {file = "awscrt-0.20.11-cp38-cp38-win32.whl", hash = "sha256:2427d727494d48253e70c9e6a2135d91546524cf56e13b0a7c5c0713994281b0"}, + {file = "awscrt-0.20.11-cp38-cp38-win_amd64.whl", hash = "sha256:8f1bf72ba5a3a38215b7487c5fb9421dece5b5f63b22ade8f63315acbf1c3842"}, + {file = "awscrt-0.20.11-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:981df6f011086d30ee76e9476bf33b381bf3064cb3d02be1ea1aa46fec79110a"}, + {file = "awscrt-0.20.11-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfead1d73754718f7c6bb03add095750535237ea14c8226cd36d6b88fe7b5342"}, + {file = "awscrt-0.20.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8eb6d8c72485b0b14eb430ca9b1f280629277cd0a9d5d064ee02afffe787caa8"}, + {file = "awscrt-0.20.11-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f3fb9cbc3403032f57006806229d2c11ab8c425cee1f47f05d83ca87c1b94b32"}, + {file = "awscrt-0.20.11-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a2c4c1a577d55b98ae93af82bb96795a62661585d4560674b1daa034f41e6fb1"}, + {file = "awscrt-0.20.11-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7f68ce98ee54b634fbe48689aa4610965e3af0e5e1a11da82a791057c741fb99"}, + {file = "awscrt-0.20.11-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4404f6e517a5f0871943463fe59cec657586b2d0d1e6e2efbe9dbb0b42f5b3e8"}, + {file = "awscrt-0.20.11-cp39-cp39-win32.whl", hash = "sha256:8e95ce32b03006097f833b539d1bc3ea503379d880751ddbfd7bb0440e93c0c4"}, + {file = "awscrt-0.20.11-cp39-cp39-win_amd64.whl", hash = 
"sha256:20b00d68a90575121cf04250c93aa4874f7d1f7d2d81f37511c12a157be7421c"}, + {file = "awscrt-0.20.11.tar.gz", hash = "sha256:c3dbfb7f1909457952e645373e72b69f90c50c465ee6a46d9bbdc12acb79803c"}, +] + [[package]] name = "backoff" version = "2.2.1" @@ -203,44 +254,48 @@ files = [ [[package]] name = "boto3" -version = "1.34.4" +version = "1.34.129" description = "The AWS SDK for Python" optional = false -python-versions = ">= 3.8" +python-versions = ">=3.8" files = [ - {file = "boto3-1.34.4-py3-none-any.whl", hash = "sha256:1e836fe33da2684db29317911d9958389094ca5098cc253dbaed8e4aa146b153"}, - {file = "boto3-1.34.4.tar.gz", hash = "sha256:a866277fc38b121ac5dab0eec38b6ae6e3a59bbf6f67ed9a9822332d9e5e785f"}, + {file = "boto3-1.34.129-py3-none-any.whl", hash = "sha256:cc73de1c9d953b1f9da6ee2404af717e93d888f790f3e0291b22d1b8489eb401"}, + {file = "boto3-1.34.129.tar.gz", hash = "sha256:a7a696fd3e7f5f43a81450b441f3eb6c5a89d28efe867cd97d8fc73ea5d8c139"}, ] [package.dependencies] -botocore = ">=1.34.4,<1.35.0" +botocore = [ + {version = ">=1.34.129,<1.35.0"}, + {version = ">=1.21.0,<2.0a0", extras = ["crt"], optional = true, markers = "extra == \"crt\""}, +] jmespath = ">=0.7.1,<2.0.0" -s3transfer = ">=0.9.0,<0.10.0" +s3transfer = ">=0.10.0,<0.11.0" [package.extras] crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.34.4" +version = "1.34.129" description = "Low-level, data-driven core of boto 3." optional = false -python-versions = ">= 3.8" +python-versions = ">=3.8" files = [ - {file = "botocore-1.34.4-py3-none-any.whl", hash = "sha256:2026d89a46dfcb96d439db17a277de11b808428cba881deb50a5960b134e3a84"}, - {file = "botocore-1.34.4.tar.gz", hash = "sha256:5dcd63329cb3e65c533a72a68c99b7d07c99a29936ea07d0998120172c10b4f5"}, + {file = "botocore-1.34.129-py3-none-any.whl", hash = "sha256:86d3dd30996aa459e9c3321edac12aebe47c73cb4acc7556941f9b4c39726088"}, + {file = "botocore-1.34.129.tar.gz", hash = "sha256:7c56e25af6112d69c5d14a15b42f76ba7687687abc463a96ac5edca19c0a9c2d"}, ] [package.dependencies] +awscrt = {version = "0.20.11", optional = true, markers = "extra == \"crt\""} jmespath = ">=0.7.1,<2.0.0" python-dateutil = ">=2.1,<3.0.0" urllib3 = [ {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""}, - {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""}, + {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""}, ] [package.extras] -crt = ["awscrt (==0.19.17)"] +crt = ["awscrt (==0.20.11)"] [[package]] name = "certifi" @@ -2496,13 +2551,13 @@ files = [ [[package]] name = "s3transfer" -version = "0.9.0" +version = "0.10.1" description = "An Amazon S3 Transfer Manager" optional = false python-versions = ">= 3.8" files = [ - {file = "s3transfer-0.9.0-py3-none-any.whl", hash = "sha256:01d4d2c35a016db8cb14f9a4d5e84c1f8c96e7ffc211422555eed45c11fa7eb1"}, - {file = "s3transfer-0.9.0.tar.gz", hash = "sha256:9e1b186ec8bb5907a1e82b51237091889a9973a2bb799a924bcd9f301ff79d3d"}, + {file = "s3transfer-0.10.1-py3-none-any.whl", hash = "sha256:ceb252b11bcf87080fb7850a224fb6e05c8a776bab8f2b64b7f25b969464839d"}, + {file = "s3transfer-0.10.1.tar.gz", hash = "sha256:5683916b4c724f799e600f41dd9e10a9ff19871bf87623cc8f491cb4f5fa0a19"}, ] [package.dependencies] @@ -3577,4 +3632,4 @@ torch = ["torch"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "e52881fa075d917070103ac7f96cae7c648502cb82b4b48d54ec81d47e1b0ef9" +content-hash = "31b8e738ad9e0b578b35633680f09a1d2433fcc15501023bb2513b39b9d4c0df" diff --git 
a/server/pyproject.toml b/server/pyproject.toml index 1cfee5a7e..0437b28a8 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -34,13 +34,14 @@ texttable = { version = "^1.6.7", optional = true } datasets = { version = "^2.14.0", optional = true } torch = { version = "2.3.0", optional = true } peft = { version = "0.4.0", optional = true } -boto3 = "^1.28.34" +boto3 = {extras = ["crt"], version = "^1.34.129"} urllib3 = "<=1.26.18" hqq = { version = "^0.1.7", optional = true } stanford-stk = { version = "^0.7.0", markers = "sys_platform == 'linux'" } outlines = { version = "^0.0.40", optional = true } prometheus-client = "^0.20.0" py-cpuinfo = "^9.0.0" +s3transfer = "0.10.1" [tool.poetry.extras] torch = ["torch"] From f677da17b66d0ad8f8518f39ce72b75b50a531d8 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Tue, 18 Jun 2024 16:10:23 -0700 Subject: [PATCH 20/22] fix runpod sdk --- runpod/src/handler.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/runpod/src/handler.py b/runpod/src/handler.py index 4b9c9b7d4..4a20aec42 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -73,7 +73,8 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] # Dump the repsonse into a dictionary yield response.model_dump() else: - del job_input['_stream'] + if '_stream' in job_input: + del job_input['_stream'] response = client.generate(**job_input) yield response.model_dump() # When we are called with a streaming endpoint, then we should have the field From 8b4eca47e47d462220e1a3e8ce0e7618ab22dbb6 Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Tue, 18 Jun 2024 17:28:37 -0700 Subject: [PATCH 21/22] fix --- runpod/src/handler.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/runpod/src/handler.py b/runpod/src/handler.py index 4a20aec42..ed2d8d4fc 100644 --- a/runpod/src/handler.py +++ b/runpod/src/handler.py @@ -65,17 +65,18 @@ async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None] result = openai_client.chat.completions.create(**job_input["openai_input"]).model_dump() yield result else: + inputs = str(job_input.get('inputs')) if job_input.get('_stream', False): del job_input['_stream'] # Streaming case - for response in client.generate_stream(**job_input): + for response in client.generate_stream(inputs, **job_input.get('parameters', {})): if not response.token.special: # Dump the repsonse into a dictionary yield response.model_dump() else: if '_stream' in job_input: del job_input['_stream'] - response = client.generate(**job_input) + response = client.generate(inputs, **job_input.get('parameters', {})) yield response.model_dump() # When we are called with a streaming endpoint, then we should have the field # _stream = True From d79c04886915bafae677291f5b1be1cee7fc4d4c Mon Sep 17 00:00:00 2001 From: Noah Yoshida Date: Thu, 20 Jun 2024 11:55:33 -0700 Subject: [PATCH 22/22] Revert "try s3 crt?" This reverts commit d3952d3fe1af633253292d40d0a9ca05f8a1f6d8. 
--- server/lorax_server/utils/sources/s3.py | 5 +- server/poetry.lock | 87 +++++-------------------- server/pyproject.toml | 3 +- 3 files changed, 18 insertions(+), 77 deletions(-) diff --git a/server/lorax_server/utils/sources/s3.py b/server/lorax_server/utils/sources/s3.py index f6558f30d..085fbaeef 100644 --- a/server/lorax_server/utils/sources/s3.py +++ b/server/lorax_server/utils/sources/s3.py @@ -13,7 +13,6 @@ LocalEntryNotFoundError, ) from loguru import logger -from boto3.s3.transfer import S3Transfer from .source import BaseModelSource, try_to_load_from_cache @@ -113,9 +112,7 @@ def download_file(filename): model_id_path = Path(model_id) bucket_file_name = model_id_path / filename logger.info(f"Downloading file {bucket_file_name} to {local_file_path}") - # use CRT? TODO change this? - transfer = S3Transfer(boto3.client('s3', region_name="us-west-2")) - transfer.download_file(bucket.name, str(bucket_file_name), str(local_file_path)) + bucket.download_file(str(bucket_file_name), str(local_file_path)) # TODO: add support for revision logger.info(f"Downloaded {local_file_path} in {timedelta(seconds=int(time.time() - start_time))}.") if not local_file_path.is_file(): diff --git a/server/poetry.lock b/server/poetry.lock index 9bcafe7ba..68649649a 100644 --- a/server/poetry.lock +++ b/server/poetry.lock @@ -179,57 +179,6 @@ docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib- tests = ["attrs[tests-no-zope]", "zope-interface"] tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -[[package]] -name = "awscrt" -version = "0.20.11" -description = "A common runtime for AWS Python projects" -optional = false -python-versions = ">=3.7" -files = [ - {file = "awscrt-0.20.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:3b50f70d85ecc2069029573bad8e5e06b9aabad283dd933bee6eb9dd694b9511"}, - {file = "awscrt-0.20.11-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3b4ec0b471cf7d6a7a0950553ddf97d58a0caf4a8350da9ca12250c7df6add94"}, - {file = "awscrt-0.20.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a6b91cac82abf9718657e0694f90334e4ef4b2ef32061938ff0ceed67e302469"}, - {file = "awscrt-0.20.11-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:0261ef47f5000d5ce069dec05edf9d803a3ff89c02bd574ec0585e2e4447aca6"}, - {file = "awscrt-0.20.11-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:81167651ccd45af55fa659a09b415eba881a9892415e465b6432a4f336311711"}, - {file = "awscrt-0.20.11-cp310-cp310-win32.whl", hash = "sha256:fb316c27110a19917a45dc7b678349bc329c98ac1b95d5bd872f0ad37300e725"}, - {file = "awscrt-0.20.11-cp310-cp310-win_amd64.whl", hash = "sha256:ae4910e1f534e0d5bb8bade0ce2b1908bfd36007115ac0a700b9cda5c5655f0c"}, - {file = "awscrt-0.20.11-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:86554f8042dea649b7d63a2e4de593864753aad736a7ca592e72b2f8a94535bb"}, - {file = "awscrt-0.20.11-cp311-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45db07c2f0f7c83d8a4cb91a51869b22f1f44c1053db7266486733aca2d2ac41"}, - {file = "awscrt-0.20.11-cp311-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c94917cce1df62fc40f53e19f5dcfbd036acfbdb1a88cba217ad6caaeab0d57"}, - {file = "awscrt-0.20.11-cp311-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:624322e103e62bffecf97731691e05ef0d7a50970d8e3b1872433dcf00c5595a"}, - {file = "awscrt-0.20.11-cp311-abi3-musllinux_1_1_x86_64.whl", hash = 
"sha256:f10af50b747c2b237836ab1ed57dc1be0c2553e0fb485374f0d3be470a861e4a"}, - {file = "awscrt-0.20.11-cp311-abi3-win32.whl", hash = "sha256:fc7a8eecfc51503afd24764033a2061a5f39017ed6e825b6594490e04fd56297"}, - {file = "awscrt-0.20.11-cp311-abi3-win_amd64.whl", hash = "sha256:106ff16bce775917d4e9a8c93649b4f272c32a91336ae6ca97596dcb2faf2d44"}, - {file = "awscrt-0.20.11-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:5a7ba2227546522d5767308ff49876fbc0abd1771376710ce2cf4dd8b317b2b9"}, - {file = "awscrt-0.20.11-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:82acde62286c7a1d7991b5bf92f192603ea9b3752b3bf28dae75300c05de1119"}, - {file = "awscrt-0.20.11-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:583bff89758f0d2cd9540c2c9b301836df21b71548f0fabfdff7fb484c960bf0"}, - {file = "awscrt-0.20.11-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6afdee4b204592eba1c75797407be976e9097682d27de6b0ec0c696ec0851758"}, - {file = "awscrt-0.20.11-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:0101be8b6b40e252eaead36eb1c4c87d53f6f0cd54d40e1ef571f984a36efb79"}, - {file = "awscrt-0.20.11-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d6d984eea9687555ca9d269ebbba8f090e1b7feab6f61d1b046548cd469cb2ab"}, - {file = "awscrt-0.20.11-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:bb634f7fbf02b5aee95d619ec3066c7f7a4d7cd6a156203115bdf8cbc715c4f3"}, - {file = "awscrt-0.20.11-cp37-cp37m-win32.whl", hash = "sha256:8fbae85b2d5106dd470b349314b3bcceb8812904675c98a1dbd2fe1efe92eb35"}, - {file = "awscrt-0.20.11-cp37-cp37m-win_amd64.whl", hash = "sha256:56cc06725038d625365f9bdebd4b9e3c9f876ead1a26473cb124c6dfa4b39fd1"}, - {file = "awscrt-0.20.11-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:58c4616343b9f4d6fc454816fb3459ac86489a242ade3c8126ec9d1aa8208ec0"}, - {file = "awscrt-0.20.11-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e323a779b8db858b3412a727c90dc9c058898fd7eb5e0f454ca94623007aa078"}, - {file = "awscrt-0.20.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e28303eb399d4fa3f5df79cca277d4ae434112590c4c9b60a21c397c6ad9da6"}, - {file = "awscrt-0.20.11-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:73a8a255d60cdb96bd6a93bf606055c918dc88cd9cca57be860efc113cb256b0"}, - {file = "awscrt-0.20.11-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:07592b2f9dffcd12745da06c10bc1de1e9f38ebc8996b98689bfaa860d600382"}, - {file = "awscrt-0.20.11-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:b022a21e8bda0d3403e6115ff15652f65dc7250b0d0c1b3125c2c3e095647940"}, - {file = "awscrt-0.20.11-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:a28f8d8e6b95a0007d80d07763f21dc2cea73b35586559d5b0635c2d06347efc"}, - {file = "awscrt-0.20.11-cp38-cp38-win32.whl", hash = "sha256:2427d727494d48253e70c9e6a2135d91546524cf56e13b0a7c5c0713994281b0"}, - {file = "awscrt-0.20.11-cp38-cp38-win_amd64.whl", hash = "sha256:8f1bf72ba5a3a38215b7487c5fb9421dece5b5f63b22ade8f63315acbf1c3842"}, - {file = "awscrt-0.20.11-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:981df6f011086d30ee76e9476bf33b381bf3064cb3d02be1ea1aa46fec79110a"}, - {file = "awscrt-0.20.11-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dfead1d73754718f7c6bb03add095750535237ea14c8226cd36d6b88fe7b5342"}, - {file = "awscrt-0.20.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:8eb6d8c72485b0b14eb430ca9b1f280629277cd0a9d5d064ee02afffe787caa8"}, - {file = "awscrt-0.20.11-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f3fb9cbc3403032f57006806229d2c11ab8c425cee1f47f05d83ca87c1b94b32"}, - {file = "awscrt-0.20.11-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a2c4c1a577d55b98ae93af82bb96795a62661585d4560674b1daa034f41e6fb1"}, - {file = "awscrt-0.20.11-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:7f68ce98ee54b634fbe48689aa4610965e3af0e5e1a11da82a791057c741fb99"}, - {file = "awscrt-0.20.11-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:4404f6e517a5f0871943463fe59cec657586b2d0d1e6e2efbe9dbb0b42f5b3e8"}, - {file = "awscrt-0.20.11-cp39-cp39-win32.whl", hash = "sha256:8e95ce32b03006097f833b539d1bc3ea503379d880751ddbfd7bb0440e93c0c4"}, - {file = "awscrt-0.20.11-cp39-cp39-win_amd64.whl", hash = "sha256:20b00d68a90575121cf04250c93aa4874f7d1f7d2d81f37511c12a157be7421c"}, - {file = "awscrt-0.20.11.tar.gz", hash = "sha256:c3dbfb7f1909457952e645373e72b69f90c50c465ee6a46d9bbdc12acb79803c"}, -] - [[package]] name = "backoff" version = "2.2.1" @@ -254,48 +203,44 @@ files = [ [[package]] name = "boto3" -version = "1.34.129" +version = "1.34.4" description = "The AWS SDK for Python" optional = false -python-versions = ">=3.8" +python-versions = ">= 3.8" files = [ - {file = "boto3-1.34.129-py3-none-any.whl", hash = "sha256:cc73de1c9d953b1f9da6ee2404af717e93d888f790f3e0291b22d1b8489eb401"}, - {file = "boto3-1.34.129.tar.gz", hash = "sha256:a7a696fd3e7f5f43a81450b441f3eb6c5a89d28efe867cd97d8fc73ea5d8c139"}, + {file = "boto3-1.34.4-py3-none-any.whl", hash = "sha256:1e836fe33da2684db29317911d9958389094ca5098cc253dbaed8e4aa146b153"}, + {file = "boto3-1.34.4.tar.gz", hash = "sha256:a866277fc38b121ac5dab0eec38b6ae6e3a59bbf6f67ed9a9822332d9e5e785f"}, ] [package.dependencies] -botocore = [ - {version = ">=1.34.129,<1.35.0"}, - {version = ">=1.21.0,<2.0a0", extras = ["crt"], optional = true, markers = "extra == \"crt\""}, -] +botocore = ">=1.34.4,<1.35.0" jmespath = ">=0.7.1,<2.0.0" -s3transfer = ">=0.10.0,<0.11.0" +s3transfer = ">=0.9.0,<0.10.0" [package.extras] crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] [[package]] name = "botocore" -version = "1.34.129" +version = "1.34.4" description = "Low-level, data-driven core of boto 3." 
optional = false -python-versions = ">=3.8" +python-versions = ">= 3.8" files = [ - {file = "botocore-1.34.129-py3-none-any.whl", hash = "sha256:86d3dd30996aa459e9c3321edac12aebe47c73cb4acc7556941f9b4c39726088"}, - {file = "botocore-1.34.129.tar.gz", hash = "sha256:7c56e25af6112d69c5d14a15b42f76ba7687687abc463a96ac5edca19c0a9c2d"}, + {file = "botocore-1.34.4-py3-none-any.whl", hash = "sha256:2026d89a46dfcb96d439db17a277de11b808428cba881deb50a5960b134e3a84"}, + {file = "botocore-1.34.4.tar.gz", hash = "sha256:5dcd63329cb3e65c533a72a68c99b7d07c99a29936ea07d0998120172c10b4f5"}, ] [package.dependencies] -awscrt = {version = "0.20.11", optional = true, markers = "extra == \"crt\""} jmespath = ">=0.7.1,<2.0.0" python-dateutil = ">=2.1,<3.0.0" urllib3 = [ {version = ">=1.25.4,<1.27", markers = "python_version < \"3.10\""}, - {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""}, + {version = ">=1.25.4,<2.1", markers = "python_version >= \"3.10\""}, ] [package.extras] -crt = ["awscrt (==0.20.11)"] +crt = ["awscrt (==0.19.17)"] [[package]] name = "certifi" @@ -2551,13 +2496,13 @@ files = [ [[package]] name = "s3transfer" -version = "0.10.1" +version = "0.9.0" description = "An Amazon S3 Transfer Manager" optional = false python-versions = ">= 3.8" files = [ - {file = "s3transfer-0.10.1-py3-none-any.whl", hash = "sha256:ceb252b11bcf87080fb7850a224fb6e05c8a776bab8f2b64b7f25b969464839d"}, - {file = "s3transfer-0.10.1.tar.gz", hash = "sha256:5683916b4c724f799e600f41dd9e10a9ff19871bf87623cc8f491cb4f5fa0a19"}, + {file = "s3transfer-0.9.0-py3-none-any.whl", hash = "sha256:01d4d2c35a016db8cb14f9a4d5e84c1f8c96e7ffc211422555eed45c11fa7eb1"}, + {file = "s3transfer-0.9.0.tar.gz", hash = "sha256:9e1b186ec8bb5907a1e82b51237091889a9973a2bb799a924bcd9f301ff79d3d"}, ] [package.dependencies] @@ -3632,4 +3577,4 @@ torch = ["torch"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "31b8e738ad9e0b578b35633680f09a1d2433fcc15501023bb2513b39b9d4c0df" +content-hash = "e52881fa075d917070103ac7f96cae7c648502cb82b4b48d54ec81d47e1b0ef9" diff --git a/server/pyproject.toml b/server/pyproject.toml index 0437b28a8..1cfee5a7e 100644 --- a/server/pyproject.toml +++ b/server/pyproject.toml @@ -34,14 +34,13 @@ texttable = { version = "^1.6.7", optional = true } datasets = { version = "^2.14.0", optional = true } torch = { version = "2.3.0", optional = true } peft = { version = "0.4.0", optional = true } -boto3 = {extras = ["crt"], version = "^1.34.129"} +boto3 = "^1.28.34" urllib3 = "<=1.26.18" hqq = { version = "^0.1.7", optional = true } stanford-stk = { version = "^0.7.0", markers = "sys_platform == 'linux'" } outlines = { version = "^0.0.40", optional = true } prometheus-client = "^0.20.0" py-cpuinfo = "^9.0.0" -s3transfer = "0.10.1" [tool.poetry.extras] torch = ["torch"]
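
For reference, the handler logic that patches 13-21 converge on can be read as one piece. The sketch below is a condensed, non-authoritative reconstruction of runpod/src/handler.py after those patches: it assumes the module-level names visible in the hunk context (url, client, api_key, JOBS, TGI_LOCAL_PORT), hard-codes an illustrative port value, and drops the temporary print debugging.

import os

import openai
from lorax import Client

# Assumed module-level setup, mirroring the hunk context in the patches above;
# the port value is illustrative.
TGI_LOCAL_PORT = 8080
url = "http://127.0.0.1:{}".format(TGI_LOCAL_PORT)
client = Client(url)
api_key = os.environ.get("PREDIBASE_API_KEY", "fake")
JOBS = set()

async def handler_streaming(job: dict):
    # Route on the presence of 'openai_route' in the job input.
    job_input = job['input']
    use_openai = 'openai_route' in job_input

    # A fresh OpenAI-compatible client per call, pointed at the local lorax-launcher.
    openai_client = openai.OpenAI(base_url=f"{url}/v1", api_key=api_key)

    JOBS.add(job['id'])
    if use_openai:
        # OpenAI path: the chat-completions payload is carried under 'openai_input'.
        yield openai_client.chat.completions.create(**job_input["openai_input"]).model_dump()
    else:
        # LoRAX path: prompt under 'inputs', generation kwargs under 'parameters'.
        inputs = str(job_input.get('inputs'))
        if job_input.get('_stream', False):
            del job_input['_stream']
            # Streaming case: emit each non-special token as a dict.
            for response in client.generate_stream(inputs, **job_input.get('parameters', {})):
                if not response.token.special:
                    yield response.model_dump()
        else:
            if '_stream' in job_input:
                del job_input['_stream']
            response = client.generate(inputs, **job_input.get('parameters', {}))
            yield response.model_dump()
    JOBS.remove(job['id'])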
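
The handler distinguishes two job payload shapes. The values below are purely illustrative (model name, prompt, and parameter choices are not taken from this series); only the key names 'openai_route', 'openai_input', 'inputs', 'parameters', and '_stream' are what the handler actually inspects.

# OpenAI-compatible request: the presence of 'openai_route' selects the chat-completions branch.
openai_style_input = {
    "openai_route": "/v1/chat/completions",    # illustrative; only the key's presence is checked
    "openai_input": {
        "model": "my-adapter",                 # illustrative adapter/model name
        "messages": [{"role": "user", "content": "Hello!"}],
        "stream": False,
    },
}

# Plain LoRAX request: prompt under 'inputs', generation kwargs under 'parameters',
# and '_stream' choosing generate_stream() over generate().
lorax_style_input = {
    "inputs": "Hello!",
    "parameters": {"max_new_tokens": 64},      # illustrative generation parameter
    "_stream": True,
}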
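
Patch 19 swaps the per-file bucket.download_file call for an explicit boto3 S3Transfer object (alongside moving to boto3[crt] in pyproject/poetry), and patch 22 reverts the experiment. Stripped of the surrounding LoRAX source, the attempted change amounts to the following sketch; the bucket, key, and local path are placeholders.

import boto3
from boto3.s3.transfer import S3Transfer

# Explicit transfer object over a low-level S3 client, as tried (and later reverted) in this series.
transfer = S3Transfer(boto3.client("s3", region_name="us-west-2"))
transfer.download_file(
    "my-model-bucket",                       # placeholder bucket name
    "my-model-id/model.safetensors",         # placeholder object key
    "/data/my-model-id/model.safetensors",   # placeholder local path
)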