model server build (#127)

* first commit to have model_server not be dependent on Docker * making changes to fix the docker-compose file for archgw to set DNS_V4 and minor fixes with the build * additional fixes for model server to be separated out in the build * additional fixes for model server to be separated out in the build * fix to get model_server to be built as a separate python process. TODO: fix the embeddings logs after cli completes * fixing init to pull tempfile using the tempfile python package --------- Co-authored-by: Salman Paracha <[email protected]>
katanemo · Oct 7, 2024 · b60ceb9 · b60ceb9
1 parent 7d21359
commit b60ceb9
Show file tree

Hide file tree

Showing 21 changed files with 3,390 additions and 154 deletions.
diff --git a/.gitignore b/.gitignore
@@ -24,4 +24,8 @@ demos/network_copilot/ollama/models/
 arch_log/
 arch/tools/*.egg-info
 arch/tools/config
+arch/tools/build
+model_server/model_server.egg-info
 model_server/venv_model_server
+model_server/build
+model_server/dist
diff --git a/arch/docker-compose.yaml b/arch/docker-compose.yaml
@@ -7,24 +7,5 @@ services:
     volumes:
       - ${ARCH_CONFIG_FILE:-./demos/function_calling/arch_confg.yaml}:/config/arch_config.yaml
       - /etc/ssl/cert.pem:/etc/ssl/cert.pem
-    depends_on:
-      model_server:
-        condition: service_healthy
     env_file:
       - stage.env
-
-  model_server:
-    image: model_server:latest
-    ports:
-      - "18081:80"
-    healthcheck:
-        test: ["CMD", "curl" ,"http://localhost/healthz"]
-        interval: 5s
-        retries: 20
-    volumes:
-      - ~/.cache/huggingface:/root/.cache/huggingface
-    environment:
-      - OLLAMA_ENDPOINT=${OLLAMA_ENDPOINT:-host.docker.internal}
-      - OLLAMA_MODEL=Arch-Function-Calling-3B-Q4_K_M
-      - MODE=${MODE:-cloud}
-      - FC_URL=${FC_URL:-https://arch-fc-free-trial-4mzywewe.uc.gateway.dev/v1}
diff --git a/arch/envoy.template.yaml b/arch/envoy.template.yaml
@@ -123,8 +123,8 @@ static_resources:
               - endpoint:
                   address:
                     socket_address:
-                      address: model_server
-                      port_value: 80
+                      address: host.docker.internal
+                      port_value: 51000
                   hostname: "model_server"
     - name: mistral_7b_instruct
       connect_timeout: 5s
@@ -153,8 +153,8 @@ static_resources:
               - endpoint:
                   address:
                     socket_address:
-                      address: model_server
-                      port_value: 80
+                      address: host.docker.internal
+                      port_value: 51000
                   hostname: "arch_fc"
 {% for _, cluster in arch_clusters.items() %}
     - name: {{ cluster.name }}

diff --git a/arch/tools/cli.py b/arch/tools/cli.py
@@ -5,7 +5,7 @@
 import pkg_resources
 import sys
 import subprocess
-from core import start_arch, stop_arch
+from core import start_arch_modelserver, stop_arch_modelserver, start_arch, stop_arch
 from utils import get_llm_provider_access_keys, load_env_file_to_dict
 
 logo = r"""
@@ -26,7 +26,7 @@ def main(ctx):
 
 # Command to build archgw and model_server Docker images
 ARCHGW_DOCKERFILE = "./arch/Dockerfile"
-MODEL_SERVER_DOCKERFILE = "./model_server/Dockerfile"
+MODEL_SERVER_BUILD_FILE = "./model_server/pyproject.toml"
 
 @click.command()
 def build():
@@ -44,21 +44,22 @@ def build():
         click.echo("Error: Dockerfile not found in /arch")
         sys.exit(1)
 
-    # Check if /model_server/Dockerfile exists
-    if os.path.exists(MODEL_SERVER_DOCKERFILE):
-        click.echo("Building model_server image...")
+    click.echo("All images built successfully.")
+
+    """Install the model server dependencies using Poetry."""
+    # Check if pyproject.toml exists
+    if os.path.exists(MODEL_SERVER_BUILD_FILE):
+        click.echo("Installing model server dependencies with Poetry...")
         try:
-            subprocess.run(["docker", "build", "-f", MODEL_SERVER_DOCKERFILE, "-t", "model_server:latest", "./model_server"], check=True)
-            click.echo("model_server image built successfully.")
+            subprocess.run(["poetry", "install", "--no-cache"], cwd=os.path.dirname(MODEL_SERVER_BUILD_FILE), check=True)
+            click.echo("Model server dependencies installed successfully.")
         except subprocess.CalledProcessError as e:
-            click.echo(f"Error building model_server image: {e}")
+            click.echo(f"Error installing model server dependencies: {e}")
             sys.exit(1)
     else:
-        click.echo("Error: Dockerfile not found in /model_server")
+        click.echo(f"Error: pyproject.toml not found in {MODEL_SERVER_BUILD_FILE}")
         sys.exit(1)
 
-    click.echo("All images built successfully.")
-
 @click.command()
 @click.argument('file', required=False)  # Optional file argument
 @click.option('-path', default='.', help='Path to the directory containing arch_config.yml')
@@ -120,11 +121,14 @@ def up(file, path):
     env = os.environ.copy()
     env.update(env_stage)
     env['ARCH_CONFIG_FILE'] = arch_config_file
+
+    start_arch_modelserver()
     start_arch(arch_config_file, env)
 
 @click.command()
 def down():
     """Stops Arch."""
+    # Stop the model server first; stop_arch_modelserver() exits the process
+    # on failure, in which case stop_arch() below is not reached.
+    stop_arch_modelserver()
     stop_arch()
 
 @click.command()

diff --git a/arch/tools/core.py b/arch/tools/core.py
@@ -5,14 +5,13 @@
 import select
 from utils import run_docker_compose_ps, print_service_status, check_services_state
 
-def start_arch(arch_config_file, env, log_timeout=120, check_interval=1):
+def start_arch(arch_config_file, env, log_timeout=120):
     """
     Start Docker Compose in detached mode and stream logs until services are healthy.
 
     Args:
         path (str): The path where the prompt_confi.yml file is located.
         log_timeout (int): Time in seconds to show logs before checking for healthy state.
-        check_interval (int): Time in seconds between health status checks.
     """
 
     compose_file = pkg_resources.resource_filename(__name__, 'config/docker-compose.yaml')
@@ -96,3 +95,33 @@ def stop_arch():
 
     except subprocess.CalledProcessError as e:
         print(f"Failed to shut down services: {str(e)}")
+
+def start_arch_modelserver():
+    """
+    Start the model server by running ``archgw_modelserver restart``.
+
+    This assumes that the archgw_modelserver package is installed locally.
+    Exits the process with status 1 when the command cannot be found or
+    when it returns a non-zero exit code.
+    """
+    try:
+        subprocess.run(
+            ['archgw_modelserver', 'restart'],
+            check=True,
+        )
+        print("Successfull run the archgw model_server")
+    except FileNotFoundError:
+        # The executable is not on PATH, so report it instead of a traceback.
+        print("archgw_modelserver command not found. Please install the model_server package")
+        sys.exit(1)
+    except subprocess.CalledProcessError:
+        print("Failed to start model_server. Please check archgw_modelserver logs")
+        sys.exit(1)
+
+def stop_arch_modelserver():
+    """
+    Stop the model server by running ``archgw_modelserver stop``.
+
+    This assumes that the archgw_modelserver package is installed locally.
+    Exits the process with status 1 when the command cannot be found or
+    when it returns a non-zero exit code.
+    """
+    try:
+        subprocess.run(
+            ['archgw_modelserver', 'stop'],
+            check=True,
+        )
+        print("Successfull stopped the archgw model_server")
+    except FileNotFoundError:
+        # The executable is not on PATH, so report it instead of a traceback.
+        print("archgw_modelserver command not found. Please install the model_server package")
+        sys.exit(1)
+    except subprocess.CalledProcessError:
+        # This is the stop path, so the message must not claim a failed start.
+        print("Failed to stop model_server. Please check archgw_modelserver logs")
+        sys.exit(1)
diff --git a/model_server/Dockerfile b/model_server/Dockerfile
@@ -18,8 +18,8 @@ WORKDIR /src
 ENV MODELS="BAAI/bge-large-en-v1.5"
 
 COPY ./app ./app
-COPY ./guard_model_config.yaml .
-COPY ./openai_params.yaml .
+COPY ./app/guard_model_config.yaml .
+COPY ./app/openai_params.yaml .
 
 # comment it out for now as we don't want to download the model every time we build the image
 # we will mount host cache to docker image to avoid downloading the model every time

diff --git a/model_server/Dockerfile.gpu b/model_server/Dockerfile.gpu
@@ -44,14 +44,8 @@ RUN if command -v nvcc >/dev/null 2>&1; then \
 
 COPY . /src
 
-#
-# output
-#
-
-
 # Specify list of models that will go into the image as a comma separated list
 ENV MODELS="BAAI/bge-large-en-v1.5"
-ENV NER_MODELS="urchade/gliner_large-v2.1"
 ENV DEBIAN_FRONTEND=noninteractive
 
 COPY /app /app

diff --git a/model_server/README.md b/model_server/README.md
@@ -0,0 +1 @@
+## Model Server Package ##
diff --git a/model_server/__init__.py b/model_server/__init__.py
diff --git a/model_server/app/__init__.py b/model_server/app/__init__.py
@@ -0,0 +1,99 @@
+import sys
+import subprocess
+import os
+import signal
+import time
+import requests
+import psutil
+import tempfile
+
+# Path to the file where the server process ID will be stored
+PID_FILE = os.path.join(tempfile.gettempdir(), "model_server.pid")
+
+def run_server():
+    """Dispatch to start, stop, or restart based on the first CLI argument.
+
+    With no argument the action defaults to "start". An unrecognized action
+    prints an error and exits with status 1.
+    """
+    action = sys.argv[1] if len(sys.argv) > 1 else "start"
+
+    handlers = {
+        "start": start_server,
+        "stop": stop_server,
+        "restart": restart_server,
+    }
+    handler = handlers.get(action)
+    if handler is None:
+        print(f"Unknown action: {action}")
+        sys.exit(1)
+
+    handler()
+
+
+def start_server():
+    """Start the Uvicorn server and save the process ID.
+
+    Launches ``uvicorn app.main:app`` on port 51000 and waits for its health
+    check to pass before recording the PID in PID_FILE. Exits with status 1
+    when the PID file already exists or when the server does not become
+    healthy in time, so callers that check the exit code see the failure.
+    """
+    if os.path.exists(PID_FILE):
+        print("Server is already running. Use 'model_server restart' to restart it.")
+        sys.exit(1)
+
+    print("Starting Archgw Model Server")
+    process = subprocess.Popen(
+        ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "51000"],
+    )
+
+    if wait_for_health_check("http://0.0.0.0:51000/healthz"):
+        # Write the process ID to the PID file
+        with open(PID_FILE, "w") as f:
+            f.write(str(process.pid))
+        print(f"ARCH GW Model Server started with PID {process.pid}")
+        return
+
+    # TODO: add model_server boot-up logs
+    print("ARCH GW Model Server - Didn't Sart In Time. Shutting Down")
+    process.terminate()
+    try:
+        # Give the child a chance to exit; force-kill it if it does not.
+        process.wait(timeout=10)
+    except subprocess.TimeoutExpired:
+        process.kill()
+    # Report the failed start through the exit code instead of returning 0.
+    sys.exit(1)
+
+def wait_for_health_check(url, timeout=180):
+    """Poll *url* until it answers with HTTP 200 or *timeout* seconds elapse.
+
+    Args:
+        url: Health-check endpoint to request.
+        timeout: Maximum number of seconds to keep polling.
+
+    Returns:
+        True once a request returns status 200, False if the deadline passes.
+    """
+    deadline = time.time() + timeout
+    while time.time() < deadline:
+        try:
+            # Bound each request so a hung connection cannot stall the loop.
+            response = requests.get(url, timeout=5)
+            if response.status_code == 200:
+                return True
+        except requests.RequestException:
+            # Server is not accepting connections yet, or the request timed out.
+            pass
+        # Pause between attempts, including after a non-200 reply, so the
+        # loop never spins at full speed against the server.
+        time.sleep(1)
+    print("Timed out waiting for ARCH GW Model Server to respond.")
+    return False
+
+
+def stop_server():
+    """Stop the running Uvicorn server.
+
+    Reads the PID from PID_FILE, asks the process to terminate and waits up
+    to 10 seconds before force-killing it. The PID file is removed in every
+    handled outcome, including when its contents are not a valid integer.
+    """
+    if not os.path.exists(PID_FILE):
+        print("Status: Archgw Model Server not running")
+        return
+
+    # Read the process ID from the PID file
+    with open(PID_FILE, "r") as f:
+        raw_pid = f.read()
+
+    try:
+        pid = int(raw_pid)
+    except ValueError:
+        # An empty or corrupt PID file would otherwise make every later
+        # start/stop/restart fail, so remove it and return.
+        print("PID file does not contain a valid PID. Cleaning up PID file.")
+        os.remove(PID_FILE)
+        return
+
+    try:
+        # Get process by PID
+        process = psutil.Process(pid)
+
+        # Gracefully terminate the process
+        process.terminate()  # Sends SIGTERM by default
+        process.wait(timeout=10)  # Wait for up to 10 seconds for the process to exit
+
+        print(f"Server with PID {pid} stopped.")
+    except psutil.NoSuchProcess:
+        print(f"Process with PID {pid} not found. Cleaning up PID file.")
+    except psutil.TimeoutExpired:
+        print(f"Process with PID {pid} did not terminate in time. Forcing shutdown.")
+        process.kill()  # Forcefully kill the process
+
+    # Reached only on the handled outcomes above; other errors propagate and
+    # leave the PID file in place, as before.
+    os.remove(PID_FILE)
+
+def restart_server():
+    """Restart the Uvicorn server by stopping it and then starting it again."""
+    print("Check: Is Archgw Model Server running?")
+    for step in (stop_server, start_server):
+        step()
diff --git a/model_server/app/arch_fc/arch_fc.py b/model_server/app/arch_fc/arch_fc.py
@@ -1,9 +1,10 @@
 import json
 import random
 from fastapi import FastAPI, Response
-from app.arch_fc.arch_handler import ArchHandler
-from app.arch_fc.bolt_handler import BoltHandler
-from app.arch_fc.common import ChatMessage, Message
+from .common import ChatMessage, Message
+from .arch_handler import ArchHandler
+from .bolt_handler import BoltHandler
+from app.utils import load_yaml_config
 import logging
 import yaml
 from openai import OpenAI
@@ -14,17 +15,14 @@
     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
 )
 logger = logging.getLogger(__name__)
-
-with open("openai_params.yaml") as f:
-    params = yaml.safe_load(f)
-
+params = load_yaml_config("openai_params.yaml")
 ollama_endpoint = os.getenv("OLLAMA_ENDPOINT", "localhost")
 ollama_model = os.getenv("OLLAMA_MODEL", "Arch-Function-Calling-1.5B-Q4_K_M")
-fc_url = os.getenv("FC_URL", ollama_endpoint)
+fc_url = os.getenv("FC_URL", "https://arch-fc-free-trial-4mzywewe.uc.gateway.dev/v1")
+
 mode = os.getenv("MODE", "cloud")
 if mode not in ["cloud", "local-gpu", "local-cpu"]:
     raise ValueError(f"Invalid mode: {mode}")
-arch_api_key = os.getenv("ARCH_API_KEY", "vllm")
 
 handler = None
 if ollama_model.startswith("Arch"):