diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 116120764..8364dd829 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -55,8 +55,106 @@ jobs:
           # persistent data location
           root = "/runner/build/containerd"

+      # - name: Docker meta
+      #   id: meta
+      #   uses: docker/metadata-action@v5
+      #   with:
+      #     images: |
+      #       ghcr.io/predibase/lorax
+      #     tags: |
+      #       type=semver,pattern={{version}}
+      #       type=semver,pattern={{major}}.{{minor}}
+      #       type=sha,prefix=,suffix=,format=short
+      #       type=raw,value=main,enable=${{ github.ref == 'refs/heads/main' }}
+
+      # - name: Create a hash from tags
+      #   env:
+      #     tags: ${{ steps.meta.outputs.tags }}
+      #   id: vars
+      #   run: |
+      #     tag_hash=$(echo -n "$tags" | md5sum | awk '{print $1}')
+      #     echo "tag_hash=$tag_hash" >> $GITHUB_OUTPUT
+      #     echo "cache_dir=/runner/build/images/cache" >> $GITHUB_OUTPUT
+      #     echo "image_dir=/runner/build/images" >> $GITHUB_OUTPUT
+      #     echo "image_path=/runner/build/images/lorax" >> $GITHUB_OUTPUT
+
+      # - name: Create and update image/cache directory
+      #   env:
+      #     image_dir: ${{ steps.vars.outputs.image_dir }}
+      #     cache_dir: ${{ steps.vars.outputs.cache_dir }}
+      #   run: |
+      #     sudo mkdir -p $image_dir
+      #     sudo chown ubuntu:ubuntu $image_dir
+
+      #     sudo mkdir -p $cache_dir
+      #     sudo chown ubuntu:ubuntu $cache_dir
+
+      # - name: Export Docker image as OCI
+      #   uses: docker/build-push-action@v5
+      #   with:
+      #     context: .
+      #     file: ./Dockerfile # Path to your Dockerfile
+      #     push: false
+      #     tags: ${{ steps.meta.outputs.tags }}
+      #     outputs: type=oci,compression=gzip,dest=${{ steps.vars.outputs.image_path }}-${{ steps.vars.outputs.tag_hash }}.tar.gz
+      #     cache-from: type=local,src=${{ steps.vars.outputs.cache_dir }}
+      #     cache-to: type=local,mode=max,image-manifest=true,oci-mediatypes=true,dest=${{ steps.vars.outputs.cache_dir }}
+
+      # - name: Import image in containerd
+      #   env:
+      #     tag_hash: ${{ steps.vars.outputs.tag_hash }}
+      #     image_path: ${{ steps.vars.outputs.image_path }}
+      #   run: |
+      #     echo "Importing $image_path-$tag_hash to Containerd"
+      #     sudo ctr i import --no-unpack --all-platforms --digests $image_path-$tag_hash.tar.gz
+
+      # - name: Log in to GitHub Container Registry
+      #   uses: docker/login-action@v1
+      #   with:
+      #     registry: ghcr.io
+      #     username: ${{ github.repository_owner }}
+      #     password: ${{ secrets.GHCR_PAT }}
+
+      # - name: Push image with containerd
+      #   env:
+      #     tags: ${{ steps.meta.outputs.tags }}
+      #   run: |
+      #     for tag in $tags
+      #     do
+      #       echo "Pushing $tag to GHCR"
+      #       sudo ctr i push --user "${{ github.repository_owner }}:${{ secrets.GHCR_PAT }}" $tag
+      #     done
+
+      # - name: Create and push soci index
+      #   env:
+      #     tags: ${{ steps.meta.outputs.tags }}
+      #   run: |
+      #     export SOCI_PATH=$HOME/.soci/soci
+      #     for tag in $tags
+      #     do
+      #       echo "Creating soci index for $tag"
+      #       sudo $SOCI_PATH create $tag
+      #       echo "Pushing soci index for $tag"
+      #       sudo $SOCI_PATH push --user ${{ github.repository_owner }}:${{ secrets.GHCR_PAT }} $tag
+      #     done
+
+      # - name: Prune older images
+      #   env:
+      #     tag_hash: ${{ steps.vars.outputs.tag_hash }}
+      #     image_path: ${{ steps.vars.outputs.image_path }}
+      #   run: |
+      #     # Delete images older than a day from docker store
+      #     docker image prune -a -f --filter "until=24h"
+
+      #     # Delete the on disk copy
+      #     rm -rf "$image_path-$tag_hash.tar.gz"
+
+      #     # Delete the SHA image(s) from containerd store
+      #     sudo ctr i rm $(sudo ctr i ls -q)
+
+#### new build test
       - name: Docker meta
-        id: meta
+        id: meta1
         uses: docker/metadata-action@v5
         with:
           images: |
@@ -64,13 +162,13 @@ jobs:
           tags: |
             type=semver,pattern={{version}}
             type=semver,pattern={{major}}.{{minor}}
-            type=sha,prefix=,suffix=,format=short
+            type=sha,prefix=runpod-,suffix=,format=short
             type=raw,value=main,enable=${{ github.ref == 'refs/heads/main' }}
-
+
       - name: Create a hash from tags
         env:
-          tags: ${{ steps.meta.outputs.tags }}
-        id: vars
+          tags: ${{ steps.meta1.outputs.tags }}
+        id: vars1
         run: |
           tag_hash=$(echo -n "$tags" | md5sum | awk '{print $1}')
           echo "tag_hash=$tag_hash" >> $GITHUB_OUTPUT
@@ -80,8 +178,8 @@ jobs:

       - name: Create and update image/cache directory
         env:
-          image_dir: ${{ steps.vars.outputs.image_dir }}
-          cache_dir: ${{ steps.vars.outputs.cache_dir }}
+          image_dir: ${{ steps.vars1.outputs.image_dir }}
+          cache_dir: ${{ steps.vars1.outputs.cache_dir }}
         run: |
           sudo mkdir -p $image_dir
           sudo chown ubuntu:ubuntu $image_dir
@@ -92,18 +190,18 @@ jobs:
       - name: Export Docker image as OCI
         uses: docker/build-push-action@v5
         with:
-          context: .
-          file: ./Dockerfile # Path to your Dockerfile
+          context: ./runpod
+          file: ./runpod/Dockerfile # Path to your Dockerfile
           push: false
-          tags: ${{ steps.meta.outputs.tags }}
-          outputs: type=oci,compression=gzip,dest=${{ steps.vars.outputs.image_path }}-${{ steps.vars.outputs.tag_hash }}.tar.gz
-          cache-from: type=local,src=${{ steps.vars.outputs.cache_dir }}
-          cache-to: type=local,mode=max,image-manifest=true,oci-mediatypes=true,dest=${{ steps.vars.outputs.cache_dir }}
+          tags: ${{ steps.meta1.outputs.tags }}
+          outputs: type=oci,compression=gzip,dest=${{ steps.vars1.outputs.image_path }}-${{ steps.vars1.outputs.tag_hash }}.tar.gz
+          cache-from: type=local,src=${{ steps.vars1.outputs.cache_dir }}
+          cache-to: type=local,mode=max,image-manifest=true,oci-mediatypes=true,dest=${{ steps.vars1.outputs.cache_dir }}

       - name: Import image in containerd
         env:
-          tag_hash: ${{ steps.vars.outputs.tag_hash }}
-          image_path: ${{ steps.vars.outputs.image_path }}
+          tag_hash: ${{ steps.vars1.outputs.tag_hash }}
+          image_path: ${{ steps.vars1.outputs.image_path }}
         run: |
           echo "Importing $image_path-$tag_hash to Containerd"
           sudo ctr i import --no-unpack --all-platforms --digests $image_path-$tag_hash.tar.gz
@@ -117,17 +215,17 @@ jobs:

       - name: Push image with containerd
         env:
-          tags: ${{ steps.meta.outputs.tags }}
+          tags: ${{ steps.meta1.outputs.tags }}
         run: |
           for tag in $tags
           do
             echo "Pushing $tag to GHCR"
             sudo ctr i push --user "${{ github.repository_owner }}:${{ secrets.GHCR_PAT }}" $tag
           done
-
+
       - name: Create and push soci index
         env:
-          tags: ${{ steps.meta.outputs.tags }}
+          tags: ${{ steps.meta1.outputs.tags }}
         run: |
           export SOCI_PATH=$HOME/.soci/soci
           for tag in $tags
@@ -140,8 +238,8 @@ jobs:

       - name: Prune older images
         env:
-          tag_hash: ${{ steps.vars.outputs.tag_hash }}
-          image_path: ${{ steps.vars.outputs.image_path }}
+          tag_hash: ${{ steps.vars1.outputs.tag_hash }}
+          image_path: ${{ steps.vars1.outputs.image_path }}
         run: |
           # Delete images older than a day from docker store
           docker image prune -a -f --filter "until=24h"
@@ -151,4 +249,3 @@ jobs:

           # Delete the SHA image(s) from containerd store
           sudo ctr i rm $(sudo ctr i ls -q)
-
diff --git a/runpod/Dockerfile b/runpod/Dockerfile
new file mode 100644
index 000000000..c037aca40
--- /dev/null
+++ b/runpod/Dockerfile
@@ -0,0 +1,75 @@
+# Base image
+# TODO change the lorax base image
+FROM ghcr.io/predibase/lorax:0.10.0
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Set the working directory
+WORKDIR /
+
+# Update and upgrade the system packages (Worker Template)
+COPY builder/setup.sh /setup.sh
+RUN /bin/bash /setup.sh && \
+    rm /setup.sh
+
+# Install Python dependencies (Worker Template)
+# COPY builder/requirements.txt /requirements.txt
+RUN python3 -m pip install --upgrade pip && \
+    python3 -m pip install runpod
+
+# Add src files (Worker Template)
+ADD src .
+
+# Whether to download the model into /runpod-volume or not.
+ARG DOWNLOAD_MODEL=
+ENV DOWNLOAD_MODEL=$DOWNLOAD_MODEL
+
+# Set environment variables
+ARG HF_MODEL_ID=
+ENV HF_MODEL_ID=$HF_MODEL_ID
+
+ARG HF_MODEL_REVISION=
+ENV HF_MODEL_REVISION=$HF_MODEL_REVISION
+
+ARG SM_NUM_GPUS=
+ENV SM_NUM_GPUS=$SM_NUM_GPUS
+
+ARG HF_MODEL_QUANTIZE=
+ENV HF_MODEL_QUANTIZE=$HF_MODEL_QUANTIZE
+
+ARG HF_MODEL_TRUST_REMOTE_CODE=
+ENV HF_MODEL_TRUST_REMOTE_CODE=$HF_MODEL_TRUST_REMOTE_CODE
+
+ARG MODEL_BASE_PATH="/runpod-volume/"
+ENV MODEL_BASE_PATH=$MODEL_BASE_PATH
+
+ARG HUGGING_FACE_HUB_TOKEN=
+ENV HUGGING_FACE_HUB_TOKEN=$HUGGING_FACE_HUB_TOKEN
+
+ARG HF_MAX_TOTAL_TOKENS=
+ENV HF_MAX_TOTAL_TOKENS=$HF_MAX_TOTAL_TOKENS
+
+ARG HF_MAX_INPUT_LENGTH=
+ENV HF_MAX_INPUT_LENGTH=$HF_MAX_INPUT_LENGTH
+
+ARG HF_MAX_BATCH_TOTAL_TOKENS=
+ENV HF_MAX_BATCH_TOTAL_TOKENS=$HF_MAX_BATCH_TOTAL_TOKENS
+
+ARG HF_MAX_BATCH_PREFILL_TOKENS=
+ENV HF_MAX_BATCH_PREFILL_TOKENS=$HF_MAX_BATCH_PREFILL_TOKENS
+
+# Prepare the hugging face directories for caching datasets, models, and more.
+ENV HF_DATASETS_CACHE="/runpod-volume/huggingface-cache/datasets"
+ENV HUGGINGFACE_HUB_CACHE="/runpod-volume/huggingface-cache/hub"
+ENV TRANSFORMERS_CACHE="/runpod-volume/huggingface-cache/hub"
+
+# Conditionally download the model weights based on DOWNLOAD_MODEL
+RUN if [ "$DOWNLOAD_MODEL" = "1" ]; then \
+    lorax-server download-weights $HF_MODEL_ID; \
+    fi
+
+# Quick temporary updates
+RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
+RUN python3.10 -m pip install lorax-client
+RUN python3.10 -m pip install openai
+
+ENTRYPOINT ["./entrypoint.sh"]
diff --git a/runpod/builder/setup.sh b/runpod/builder/setup.sh
new file mode 100644
index 000000000..2b9926ec8
--- /dev/null
+++ b/runpod/builder/setup.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+# Stop script on error
+set -e
+
+# Update System
+apt-get update && apt-get upgrade -y
+
+# Install System Dependencies
+# - openssh-server: for ssh access and web terminal
+apt-get install -y --no-install-recommends software-properties-common curl git openssh-server
+
+# Install Python 3.10
+add-apt-repository ppa:deadsnakes/ppa -y
+apt-get update && apt-get install -y --no-install-recommends python3.10 python3.10-dev python3.10-distutils
+update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
+
+# Install pip for Python 3.10
+curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
+python3 get-pip.py
+
+# Clean up
+apt-get autoremove -y && apt-get clean -y && rm -rf /var/lib/apt/lists/*
diff --git a/runpod/src/entrypoint.sh b/runpod/src/entrypoint.sh
new file mode 100755
index 000000000..caadce279
--- /dev/null
+++ b/runpod/src/entrypoint.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+# if [[ -z "${HF_MODEL_ID}" ]]; then
+#   echo "HF_MODEL_ID must be set"
+#   exit 1
+# fi
+# export MODEL_ID="${HF_MODEL_ID}"
+
+if [[ -n "${HF_MODEL_REVISION}" ]]; then
+  export REVISION="${HF_MODEL_REVISION}"
+fi
+
+if [[ -n "${SM_NUM_GPUS}" ]]; then
+  export NUM_SHARD="${SM_NUM_GPUS}"
+fi
+
+if [[ -n "${HF_MODEL_QUANTIZE}" ]]; then
+  export QUANTIZE="${HF_MODEL_QUANTIZE}"
+fi
+
+if [[ -n "${HF_MODEL_TRUST_REMOTE_CODE}" ]]; then
+  export TRUST_REMOTE_CODE="${HF_MODEL_TRUST_REMOTE_CODE}"
+fi
+
+if [[ -n "${HF_MAX_TOTAL_TOKENS}" ]]; then
+  export MAX_TOTAL_TOKENS="${HF_MAX_TOTAL_TOKENS}"
+fi
+
-n "${HF_MAX_INPUT_LENGTH}" ]]; then + export MAX_INPUT_LENGTH="${HF_MAX_INPUT_LENGTH}" +fi + +if [[ -n "${HF_MAX_BATCH_TOTAL_TOKENS}" ]]; then + export MAX_BATCH_TOTAL_TOKENS="${HF_MAX_BATCH_TOTAL_TOKENS}" +fi + +if [[ -n "${HF_MAX_BATCH_PREFILL_TOKENS}" ]]; then + export MAX_BATCH_PREFILL_TOKENS="${HF_MAX_BATCH_PREFILL_TOKENS}" +fi + +# Start the text generation server +nohup lorax-launcher --port 8080 --model-id predibase/Meta-Llama-3-8B-Instruct-dequantized --adapter-source hub --default-adapter-source pbase --max-batch-prefill-tokens 32768 --max-total-tokens 8192 --max-input-length 8191 --max-concurrent-requests 1024 & + +# Start the handler using python 3.10 +python3.10 -u /handler.py diff --git a/runpod/src/handler.py b/runpod/src/handler.py new file mode 100644 index 000000000..ed2d8d4fc --- /dev/null +++ b/runpod/src/handler.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python +''' Contains the handler function that will be called by the serverless. ''' + +from typing import Generator +import runpod +import os +import time + +# For download the weights +from lorax import Client + +import openai + +# Prepare global variables +JOBS = set() +TGI_LOCAL_PORT = int(os.environ.get('TGI_LOCAL_PORT', 8080)) +url = "http://127.0.0.1:{}".format(TGI_LOCAL_PORT) +# Create the client +client = Client(url) +api_key = os.environ.get("PREDIBASE_API_KEY", "fake") + + +print(url) +# Wait for the hugging face TGI worker to start running. +while True: + try: + client.generate("Why is the sky blue?", max_new_tokens=1).generated_text + print("Successfully cold booted the hugging face text generation inference server!") + + # Break from the while loop + break + + except Exception as e: + print(e) + print("The hugging face text generation inference server is still cold booting...") + time.sleep(5) + +def concurrency_controller() -> bool: + # Handle at most 1024 jobs at a time. + return len(JOBS) > 1024 + +async def handler_streaming(job: dict) -> Generator[dict[str, list], None, None]: + ''' + This is the handler function that will be called by the serverless. + ''' + # Get job input + job_input = job['input'] + # TODO do different things based on the openai_route. Right now, just assume we are calling the openai + # chat completions.generate method! + print(job_input) + print("first print :P") + use_openai = 'openai_route' in job_input + + # Create a new client and pass the token for every handler call + openai_client = openai.OpenAI( + base_url=f"{url}/v1", + api_key=api_key + ) + JOBS.add(job['id']) + + print(use_openai) + if use_openai: + # if job_input['stream'] == False: + print(job_input) + result = openai_client.chat.completions.create(**job_input["openai_input"]).model_dump() + yield result + else: + inputs = str(job_input.get('inputs')) + if job_input.get('_stream', False): + del job_input['_stream'] + # Streaming case + for response in client.generate_stream(inputs, **job_input.get('parameters', {})): + if not response.token.special: + # Dump the repsonse into a dictionary + yield response.model_dump() + else: + if '_stream' in job_input: + del job_input['_stream'] + response = client.generate(inputs, **job_input.get('parameters', {})) + yield response.model_dump() + # When we are called with a streaming endpoint, then we should have the field + # _stream = True + + # TODO handle the two openAI compatable endpoints as well...! + # TODO get stream yes/no and call the client based on that...? 
+    # TODO get the auth token or whatever
+    # TODO figure out how to do auth here - maybe we start it with a secret
+    # and in istio-land we inject the correct secret in requests
+    # if the user is auth'ed properly for the resource?
+    # TODO handle key timeouts
+    # Add job to the set.
+
+    # Remove job from the set.
+    JOBS.remove(job['id'])
+
+# Start the serverless worker with appropriate settings
+print("Starting the TGI serverless worker with streaming enabled.")
+runpod.serverless.start({
+    "handler": handler_streaming,
+    "concurrency_controller": concurrency_controller,
+    "return_aggregate_stream": True
+})