diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 116120764..8364dd829 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -55,8 +55,106 @@ jobs:
           # persistent data location
           root = "/runner/build/containerd"

+      # - name: Docker meta
+      #   id: meta
+      #   uses: docker/metadata-action@v5
+      #   with:
+      #     images: |
+      #       ghcr.io/predibase/lorax
+      #     tags: |
+      #       type=semver,pattern={{version}}
+      #       type=semver,pattern={{major}}.{{minor}}
+      #       type=sha,prefix=,suffix=,format=short
+      #       type=raw,value=main,enable=${{ github.ref == 'refs/heads/main' }}
+
+      # - name: Create a hash from tags
+      #   env:
+      #     tags: ${{ steps.meta.outputs.tags }}
+      #   id: vars
+      #   run: |
+      #     tag_hash=$(echo -n "$tags" | md5sum | awk '{print $1}')
+      #     echo "tag_hash=$tag_hash" >> $GITHUB_OUTPUT
+      #     echo "cache_dir=/runner/build/images/cache" >> $GITHUB_OUTPUT
+      #     echo "image_dir=/runner/build/images" >> $GITHUB_OUTPUT
+      #     echo "image_path=/runner/build/images/lorax" >> $GITHUB_OUTPUT
+
+      # - name: Create and update image/cache directory
+      #   env:
+      #     image_dir: ${{ steps.vars.outputs.image_dir }}
+      #     cache_dir: ${{ steps.vars.outputs.cache_dir }}
+      #   run: |
+      #     sudo mkdir -p $image_dir
+      #     sudo chown ubuntu:ubuntu $image_dir
+
+      #     sudo mkdir -p $cache_dir
+      #     sudo chown ubuntu:ubuntu $cache_dir
+
+      # - name: Export Docker image as OCI
+      #   uses: docker/build-push-action@v5
+      #   with:
+      #     context: .
+      #     file: ./Dockerfile # Path to your Dockerfile
+      #     push: false
+      #     tags: ${{ steps.meta.outputs.tags }}
+      #     outputs: type=oci,compression=gzip,dest=${{ steps.vars.outputs.image_path }}-${{ steps.vars.outputs.tag_hash }}.tar.gz
+      #     cache-from: type=local,src=${{ steps.vars.outputs.cache_dir }}
+      #     cache-to: type=local,mode=max,image-manifest=true,oci-mediatypes=true,dest=${{ steps.vars.outputs.cache_dir }}
+
+      # - name: Import image in containerd
+      #   env:
+      #     tag_hash: ${{ steps.vars.outputs.tag_hash }}
+      #     image_path: ${{ steps.vars.outputs.image_path }}
+      #   run: |
+      #     echo "Importing $image_path-$tag_hash to Containerd"
+      #     sudo ctr i import --no-unpack --all-platforms --digests $image_path-$tag_hash.tar.gz
+
+      # - name: Log in to GitHub Container Registry
+      #   uses: docker/login-action@v1
+      #   with:
+      #     registry: ghcr.io
+      #     username: ${{ github.repository_owner }}
+      #     password: ${{ secrets.GHCR_PAT }}
+
+      # - name: Push image with containerd
+      #   env:
+      #     tags: ${{ steps.meta.outputs.tags }}
+      #   run: |
+      #     for tag in $tags
+      #     do
+      #       echo "Pushing $tag to GHCR"
+      #       sudo ctr i push --user "${{ github.repository_owner }}:${{ secrets.GHCR_PAT }}" $tag
+      #     done
+
+      # - name: Create and push soci index
+      #   env:
+      #     tags: ${{ steps.meta.outputs.tags }}
+      #   run: |
+      #     export SOCI_PATH=$HOME/.soci/soci
+      #     for tag in $tags
+      #     do
+      #       echo "Creating soci index for $tag"
+      #       sudo $SOCI_PATH create $tag
+      #       echo "Pushing soci index for $tag"
+      #       sudo $SOCI_PATH push --user ${{ github.repository_owner }}:${{ secrets.GHCR_PAT }} $tag
+      #     done
+
+      # - name: Prune older images
+      #   env:
+      #     tag_hash: ${{ steps.vars.outputs.tag_hash }}
+      #     image_path: ${{ steps.vars.outputs.image_path }}
+      #   run: |
+      #     # Delete images older than a day from docker store
+      #     docker image prune -a -f --filter "until=24h"
+
+      #     # Delete the on disk copy
+      #     rm -rf "$image_path-$tag_hash.tar.gz"
+
+      #     # Delete the SHA image(s) from containerd store
+      #     sudo ctr i rm $(sudo ctr i ls -q)
+
+#### new build test
       - name: Docker meta
-        id: meta
+        id: meta1
         uses: docker/metadata-action@v5
         with:
           images: |
@@ -64,13 +162,13 @@ jobs:
           tags: |
             type=semver,pattern={{version}}
             type=semver,pattern={{major}}.{{minor}}
-            type=sha,prefix=,suffix=,format=short
+            type=sha,prefix=runpod-,suffix=,format=short
             type=raw,value=main,enable=${{ github.ref == 'refs/heads/main' }}
-
+
       - name: Create a hash from tags
         env:
-          tags: ${{ steps.meta.outputs.tags }}
-        id: vars
+          tags: ${{ steps.meta1.outputs.tags }}
+        id: vars1
         run: |
           tag_hash=$(echo -n "$tags" | md5sum | awk '{print $1}')
           echo "tag_hash=$tag_hash" >> $GITHUB_OUTPUT
@@ -80,8 +178,8 @@ jobs:

       - name: Create and update image/cache directory
         env:
-          image_dir: ${{ steps.vars.outputs.image_dir }}
-          cache_dir: ${{ steps.vars.outputs.cache_dir }}
+          image_dir: ${{ steps.vars1.outputs.image_dir }}
+          cache_dir: ${{ steps.vars1.outputs.cache_dir }}
         run: |
           sudo mkdir -p $image_dir
           sudo chown ubuntu:ubuntu $image_dir
@@ -92,18 +190,18 @@ jobs:
       - name: Export Docker image as OCI
         uses: docker/build-push-action@v5
         with:
-          context: .
-          file: ./Dockerfile # Path to your Dockerfile
+          context: ./runpod
+          file: ./runpod/Dockerfile # Path to your Dockerfile
           push: false
-          tags: ${{ steps.meta.outputs.tags }}
-          outputs: type=oci,compression=gzip,dest=${{ steps.vars.outputs.image_path }}-${{ steps.vars.outputs.tag_hash }}.tar.gz
-          cache-from: type=local,src=${{ steps.vars.outputs.cache_dir }}
-          cache-to: type=local,mode=max,image-manifest=true,oci-mediatypes=true,dest=${{ steps.vars.outputs.cache_dir }}
+          tags: ${{ steps.meta1.outputs.tags }}
+          outputs: type=oci,compression=gzip,dest=${{ steps.vars1.outputs.image_path }}-${{ steps.vars1.outputs.tag_hash }}.tar.gz
+          cache-from: type=local,src=${{ steps.vars1.outputs.cache_dir }}
+          cache-to: type=local,mode=max,image-manifest=true,oci-mediatypes=true,dest=${{ steps.vars1.outputs.cache_dir }}

       - name: Import image in containerd
         env:
-          tag_hash: ${{ steps.vars.outputs.tag_hash }}
-          image_path: ${{ steps.vars.outputs.image_path }}
+          tag_hash: ${{ steps.vars1.outputs.tag_hash }}
+          image_path: ${{ steps.vars1.outputs.image_path }}
         run: |
           echo "Importing $image_path-$tag_hash to Containerd"
           sudo ctr i import --no-unpack --all-platforms --digests $image_path-$tag_hash.tar.gz
@@ -117,17 +215,17 @@ jobs:

       - name: Push image with containerd
         env:
-          tags: ${{ steps.meta.outputs.tags }}
+          tags: ${{ steps.meta1.outputs.tags }}
         run: |
           for tag in $tags
           do
             echo "Pushing $tag to GHCR"
             sudo ctr i push --user "${{ github.repository_owner }}:${{ secrets.GHCR_PAT }}" $tag
           done
-
+
       - name: Create and push soci index
         env:
-          tags: ${{ steps.meta.outputs.tags }}
+          tags: ${{ steps.meta1.outputs.tags }}
         run: |
           export SOCI_PATH=$HOME/.soci/soci
           for tag in $tags
@@ -140,8 +238,8 @@ jobs:

       - name: Prune older images
         env:
-          tag_hash: ${{ steps.vars.outputs.tag_hash }}
-          image_path: ${{ steps.vars.outputs.image_path }}
+          tag_hash: ${{ steps.vars1.outputs.tag_hash }}
+          image_path: ${{ steps.vars1.outputs.image_path }}
         run: |
           # Delete images older than a day from docker store
           docker image prune -a -f --filter "until=24h"
@@ -151,4 +249,3 @@ jobs:

           # Delete the SHA image(s) from containerd store
           sudo ctr i rm $(sudo ctr i ls -q)
-
diff --git a/runpod/Dockerfile b/runpod/Dockerfile
new file mode 100644
index 000000000..c037aca40
--- /dev/null
+++ b/runpod/Dockerfile
@@ -0,0 +1,75 @@
+# Base image
+# TODO change the lorax base image
+FROM ghcr.io/predibase/lorax:0.10.0
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Set the working directory
+WORKDIR /
+
+# Update and upgrade the system packages (Worker Template)
+COPY builder/setup.sh /setup.sh
+RUN /bin/bash /setup.sh && \
+    rm /setup.sh
+
+# Install Python dependencies (Worker Template)
+# COPY builder/requirements.txt /requirements.txt
+RUN python3 -m pip install --upgrade pip && \
+    python3 -m pip install runpod
+
+# Add src files (Worker Template)
+ADD src .
+
+# Whether to download the model into /runpod-volume or not.
+ARG DOWNLOAD_MODEL=
+ENV DOWNLOAD_MODEL=$DOWNLOAD_MODEL
+
+# Set environment variables
+ARG HF_MODEL_ID=
+ENV HF_MODEL_ID=$HF_MODEL_ID
+
+ARG HF_MODEL_REVISION=
+ENV HF_MODEL_REVISION=$HF_MODEL_REVISION
+
+ARG SM_NUM_GPUS=
+ENV SM_NUM_GPUS=$SM_NUM_GPUS
+
+ARG HF_MODEL_QUANTIZE=
+ENV HF_MODEL_QUANTIZE=$HF_MODEL_QUANTIZE
+
+ARG HF_MODEL_TRUST_REMOTE_CODE=
+ENV HF_MODEL_TRUST_REMOTE_CODE=$HF_MODEL_TRUST_REMOTE_CODE
+
+ARG MODEL_BASE_PATH="/runpod-volume/"
+ENV MODEL_BASE_PATH=$MODEL_BASE_PATH
+
+ARG HUGGING_FACE_HUB_TOKEN=
+ENV HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN
+
+ARG HF_MAX_TOTAL_TOKENS=
+ENV HF_MAX_TOTAL_TOKENS=$HF_MAX_TOTAL_TOKENS
+
+ARG HF_MAX_INPUT_LENGTH=
+ENV HF_MAX_INPUT_LENGTH=$HF_MAX_INPUT_LENGTH
+
+ARG HF_MAX_BATCH_TOTAL_TOKENS=
+ENV HF_MAX_BATCH_TOTAL_TOKENS=$HF_MAX_BATCH_TOTAL_TOKENS
+
+ARG HF_MAX_BATCH_PREFILL_TOKENS=
+ENV HF_MAX_BATCH_PREFILL_TOKENS=$HF_MAX_BATCH_PREFILL_TOKENS
+
+# Prepare the hugging face directories for caching datasets, models, and more.
+ENV HF_DATASETS_CACHE="/runpod-volume/huggingface-cache/datasets"
+ENV HUGGINGFACE_HUB_CACHE="/runpod-volume/huggingface-cache/hub"
+ENV TRANSFORMERS_CACHE="/runpod-volume/huggingface-cache/hub"
+
+# Conditionally download the model weights based on DOWNLOAD_MODEL
+RUN if [ "$DOWNLOAD_MODEL" = "1" ]; then \
+    lorax-server download-weights $HF_MODEL_ID; \
+    fi
+
+# Quick temporary updates
+RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
+RUN python3.10 -m pip install lorax-client
+RUN python3.10 -m pip install openai
+
+ENTRYPOINT ["./entrypoint.sh"]
diff --git a/runpod/builder/setup.sh b/runpod/builder/setup.sh
new file mode 100644
index 000000000..2b9926ec8
--- /dev/null
+++ b/runpod/builder/setup.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+# Stop script on error
+set -e
+
+# Update System
+apt-get update && apt-get upgrade -y
+
+# Install System Dependencies
+# - openssh-server: for ssh access and web terminal
+apt-get install -y --no-install-recommends software-properties-common curl git openssh-server
+
+# Install Python 3.10
+add-apt-repository ppa:deadsnakes/ppa -y
+apt-get update && apt-get install -y --no-install-recommends python3.10 python3.10-dev python3.10-distutils
+update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
+
+# Install pip for Python 3.10
+curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
+python3 get-pip.py
+
+# Clean up
+apt-get autoremove -y && apt-get clean -y && rm -rf /var/lib/apt/lists/*
diff --git a/runpod/src/entrypoint.sh b/runpod/src/entrypoint.sh
new file mode 100755
index 000000000..caadce279
--- /dev/null
+++ b/runpod/src/entrypoint.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+# if [[ -z "${HF_MODEL_ID}" ]]; then
+#   echo "HF_MODEL_ID must be set"
+#   exit 1
+# fi
+# export MODEL_ID="${HF_MODEL_ID}"
+
+if [[ -n "${HF_MODEL_REVISION}" ]]; then
+  export REVISION="${HF_MODEL_REVISION}"
+fi
+
+if [[ -n "${SM_NUM_GPUS}" ]]; then
+  export NUM_SHARD="${SM_NUM_GPUS}"
+fi
+
+if [[ -n "${HF_MODEL_QUANTIZE}" ]]; then
+  export QUANTIZE="${HF_MODEL_QUANTIZE}"
+fi
+
+if [[ -n "${HF_MODEL_TRUST_REMOTE_CODE}" ]]; then
+  export TRUST_REMOTE_CODE="${HF_MODEL_TRUST_REMOTE_CODE}"
+fi
+
+if [[ -n "${HF_MAX_TOTAL_TOKENS}" ]]; then
+  export MAX_TOTAL_TOKENS="${HF_MAX_TOTAL_TOKENS}"
+fi
+
-n "${HF_MAX_INPUT_LENGTH}" ]]; then + export MAX_INPUT_LENGTH="${HF_MAX_INPUT_LENGTH}" +fi + +if [[ -n "${HF_MAX_BATCH_TOTAL_TOKENS}" ]]; then + export MAX_BATCH_TOTAL_TOKENS="${HF_MAX_BATCH_TOTAL_TOKENS}" +fi + +if [[ -n "${HF_MAX_BATCH_PREFILL_TOKENS}" ]]; then + export MAX_BATCH_PREFILL_TOKENS="${HF_MAX_BATCH_PREFILL_TOKENS}" +fi + +# Start the text generation server +nohup lorax-launcher --port 8080 --model-id predibase/Meta-Llama-3-8B-Instruct-dequantized --adapter-source hub --default-adapter-source pbase --max-batch-prefill-tokens 32768 --max-total-tokens 8192 --max-input-length 8191 --max-concurrent-requests 1024 & + +# Start the handler using python 3.10 +python3.10 -u /handler.py diff --git a/runpod/src/handler.py b/runpod/src/handler.py new file mode 100644 index 000000000..ed2d8d4fc --- /dev/null +++ b/runpod/src/handler.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python +''' Contains the handler function that will be called by the serverless. ''' + +from typing import Generator +import runpod +import os +import time + +# For download the weights +from lorax import Client + +import openai + +# Prepare global variables +JOBS = set() +TGI_LOCAL_PORT = int(os.environ.get('TGI_LOCAL_PORT', 8080)) +url = "http://127.0.0.1:{}".format(TGI_LOCAL_PORT) +# Create the client +client = Client(url) +api_key = os.environ.get("PREDIBASE_API_KEY", "fake") + + +print(url) +# Wait for the hugging face TGI worker to start running. +while True: + try: + client.generate("Why is the sky blue?", max_new_tokens=1).generated_text + print("Successfully cold booted the hugging face text generation inference server!") + + # Break from the while loop + break + + except Exception as e: + print(e) + print("The hugging face text generation inference server is still cold booting...") + time.sleep(5) + +def concurrency_controller() -> bool: + # Handle at most 1024 jobs at a time. + return len(JOBS) > 1024 + +async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None]: + ''' + This is the handler function that will be called by the serverless. + ''' + # Get job input + job_input = job['input'] + # TODO do different things based on the openai_route. Right now, just assume we are calling the openai + # chat completions.generate method! + print(job_input) + print("first print :P") + use_openai = 'openai_route' in job_input + + # Create a new client and pass the token for every handler call + openai_client = openai.OpenAI( + base_url=f"{url}/v1", + api_key=api_key + ) + JOBS.add(job['id']) + + print(use_openai) + if use_openai: + # if job_input['stream'] == False: + print(job_input) + result = openai_client.chat.completions.create(**job_input["openai_input"]).model_dump() + yield result + else: + inputs = str(job_input.get('inputs')) + if job_input.get('_stream', False): + del job_input['_stream'] + # Streaming case + for response in client.generate_stream(inputs, **job_input.get('parameters', {})): + if not response.token.special: + # Dump the repsonse into a dictionary + yield response.model_dump() + else: + if '_stream' in job_input: + del job_input['_stream'] + response = client.generate(inputs, **job_input.get('parameters', {})) + yield response.model_dump() + # When we are called with a streaming endpoint, then we should have the field + # _stream = True + + # TODO handle the two openAI compatable endpoints as well...! + # TODO get stream yes/no and call the client based on that...? 
+    # TODO get the auth token or whatever
+    # TODO figure out how to do auth here - maybe we start it with a secret
+    # and in istio-land we inject the correct secret in requests
+    # if the user is auth'ed properly for the resource?
+    # TODO handle key timeouts
+    # Add job to the set.
+
+    # Remove job from the set.
+    JOBS.remove(job['id'])
+
+# Start the serverless worker with appropriate settings
+print("Starting the TGI serverless worker with streaming enabled.")
+runpod.serverless.start({
+    "handler": handler_streaming,
+    "concurrency_controller": concurrency_controller,
+    "return_aggregate_stream": True
+})