Commit fe0629c
nvidia authored and committed on Mar 15, 2024
1 parent: 1188725
Showing 8 changed files with 359 additions and 0 deletions.
jobs/deploy_to_nvidia_nemo_inference_microservice/Dockerfile.wandb (22 additions, 0 deletions)
@@ -0,0 +1,22 @@
# syntax=docker/dockerfile:1.4

# NOTE: You will need to have access to the private repo
FROM nvcr.io/ohlfw0olaadg/ea-participants/nemollm-inference-ms:24.01

WORKDIR /launch
COPY --link requirements.txt ./

USER root
RUN apt update
RUN apt install -y python3-pip python3-setuptools
RUN python3 -m pip install --upgrade pip setuptools wheel
RUN pip3 install -r requirements.txt

# Where the llama configs live
COPY --link trt_llm_configs/ trt_llm_configs/

# Example configs for different deployments
COPY --link job.py configs/ ./

USER nemo
ENTRYPOINT ["python3", "job.py"]
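
A minimal sketch of building and publishing this image so a Launch agent can pull it. The registry path is taken from the `wandb launch -d` example in the README below; the local tag is an assumption:

```bash
# Build from the job directory, where Dockerfile.wandb, job.py, and configs/ live.
docker build -f Dockerfile.wandb -t deploy-to-nemo:latest .

# Hypothetical: tag and push to the registry referenced in the README.
docker tag deploy-to-nemo:latest gcr.io/playground-111/deploy-to-nemo:latest
docker push gcr.io/playground-111/deploy-to-nemo:latest
```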
jobs/deploy_to_nvidia_nemo_inference_microservice/README.md (46 additions, 0 deletions)
@@ -0,0 +1,46 @@
# NVIDIA NeMo Inference Microservice Deploy Job

Deploy a model from W&B Artifacts to the NVIDIA NeMo Inference Microservice.

This job accepts a compatible model artifact from W&B and deploys it to a running NIM/Triton server. It converts supported models to the `.nemo` format.

Deployment time varies by model and machine type. The base Llama2-7b config takes about 1 minute on GCP's `a2-ultragpu-1g`.

## Compatible model types

1. Llama2
2. StarCoder
3. NV-GPT (coming soon)

## User Quickstart

1. Create a queue if you don't have one already, and launch an agent:
   ```bash
   wandb launch-agent -e $ENTITY -p $PROJECT -q $QUEUE
   ```
2. Submit the deployment job with your desired configs from the [Launch UI](https://wandb.ai/launch). See `configs/` for examples.
   1. You can also submit via the CLI (a sketch of a JSON config follows this list):
      ```bash
      wandb launch -d gcr.io/playground-111/deploy-to-nemo:latest \
        -e $ENTITY \
        -p $PROJECT \
        -q $QUEUE \
        -c $CONFIG_JSON_FNAME
      ```
3. You can track the deployment process in the Launch UI. Once complete, you can immediately curl the endpoint to test the model. The model name is always `ensemble`.
   ```bash
   #!/bin/bash
   curl -X POST "http://0.0.0.0:9999/v1/completions" \
     -H "accept: application/json" \
     -H "Content-Type: application/json" \
     -d '{
       "model": "ensemble",
       "prompt": "Tell me a joke",
       "max_tokens": 256,
       "temperature": 0.5,
       "n": 1,
       "stream": false,
       "stop": "string",
       "frequency_penalty": 0.0
     }'
   ```
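
For reference, a sketch of a JSON config to pass via `-c`, mirroring `configs/llama2_7b_chat.yaml` below. The exact nesting `wandb launch -c` expects is an assumption here; adjust to match your Launch setup.

```bash
# Hypothetical JSON translation of configs/llama2_7b_chat.yaml.
cat > config.json <<'EOF'
{
  "run_name": "Deploy Llama 2 7b Chat Model",
  "config": {
    "artifact": "wandb-artifact://megatruong/public-models/llama:v0",
    "artifact_model_type": "llama",
    "bucket_name": "andrew-nemo-bucket"
  }
}
EOF
```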
jobs/deploy_to_nvidia_nemo_inference_microservice/configs/llama2_7b_chat.yaml (5 additions, 0 deletions)
@@ -0,0 +1,5 @@
run_name: Deploy Llama 2 7b Chat Model
config:
  artifact: "wandb-artifact://megatruong/public-models/llama:v0"
  artifact_model_type: "llama"
  bucket_name: "andrew-nemo-bucket"
jobs/deploy_to_nvidia_nemo_inference_microservice/configs/starcoder.yaml (5 additions, 0 deletions)
@@ -0,0 +1,5 @@
run_name: Deploy StarCoder Model
config:
  artifact: "wandb-artifact://megatruong/public-models/starcoder:v0"
  artifact_model_type: "starcoder"
  bucket_name: "andrew-nemo-bucket"
jobs/deploy_to_nvidia_nemo_inference_microservice/job.py (201 additions, 0 deletions)
@@ -0,0 +1,201 @@
import logging
import os
import shutil
import subprocess
import sys
from pathlib import Path
from typing import Literal, Optional

import boto3
import wandb
import yaml
from pydantic import BaseModel
from rich.logging import RichHandler

logging.basicConfig(
    level="INFO",
    format="%(message)s",
    datefmt="[%X]",
    handlers=[
        RichHandler(
            rich_tracebacks=True,
            tracebacks_show_locals=True,
        )
    ],
)
logger = logging.getLogger(__name__)


model_config_mapping = {
    "llama": "llama_template.yaml",
    "starcoder": "starcoder_template.yaml",
}

def run_cmd(cmd: list[str], error_msg: Optional[str] = None, shell: bool = False):
    command = " ".join(cmd) if shell else cmd
    logger.debug(f"Running {command=}")
    with subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
        universal_newlines=True,
        shell=shell,
    ) as proc:
        # Stream the child's output into our logger as it arrives.
        for line in proc.stdout:
            logger.info(line.strip())

        # wait() rather than poll(): the process may not have exited yet when
        # stdout hits EOF, and poll() would return None in that case.
        if proc.wait() != 0:
            logger.error(error_msg)
            sys.exit(1)

class Config(BaseModel):
    artifact: str
    artifact_model_type: Literal["llama", "starcoder"]

    bucket_name: str
    nim_model_store_path: str = "/model-store/"
    s3_model_repo_path: str = "models"

    openai_port: int = 9999
    nemo_port: int = 9998

    deploy_option: Literal[
        "local-nim",
        # "remote-nim",  # in a future release, NIM will have an option to point to an external model store
    ] = "local-nim"

    download_artifact: bool = True
    generate_model: bool = True
    update_repo_names: Literal[
        False
    ] = False  # TODO: Add this option back when Nvidia officially supports alt repos
    push_to_s3: bool = False

    log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = "INFO"

logger.info("Starting deploy to Nvidia Nemo Inference Microservice...") | ||
run = wandb.init() | ||
config = Config(**run.config) | ||
|
||
logger.setLevel(config.log_level) | ||
logger.debug(f"{config=}") | ||
|
||
artifact_name_cleaned = config.artifact.replace("/", "__").replace(":", "__") | ||
triton_model_name = "ensemble" | ||
triton_trt_model_name = "trt_llm" | ||
|
||
base_trt_config_fname = model_config_mapping.get(config.artifact_model_type) | ||
if base_trt_config_fname is None: | ||
logger.error(f"Unsupported model type {config.artifact_model_type=}, exiting.") | ||
sys.exit(1) | ||
|
||
base_trt_config_path = f"./trt_llm_configs/{base_trt_config_fname}" | ||
with open(base_trt_config_path) as f: | ||
trt_config = yaml.safe_load(f) | ||
|
||
|
||
if config.download_artifact: | ||
logger.info("Downloading model artifact...") | ||
try: | ||
art = run.use_artifact(config.artifact) | ||
artifact_path = art.download() | ||
except Exception as e: | ||
logger.error(f"Error downloading artifact, exiting. {e=}") | ||
sys.exit(1) | ||
|
||
|
||
if config.update_repo_names: | ||
triton_model_name = f"{artifact_name_cleaned}__ensemble" | ||
triton_trt_model_name = f"{artifact_name_cleaned}__trt_llm" | ||
|
||
trt_config["base_model_id"] = triton_model_name | ||
trt_config["trt_llm"]["model_name"] = triton_trt_model_name | ||
trt_config["pipeline"]["model_name"] = triton_model_name | ||
|
||
|
||
if config.generate_model:
    logger.info("Generating TRT-LLM config from template...")
    # NOTE: assumes download_artifact=True above, so artifact_path is defined.
    trt_config["trt_llm"]["model_path"] = artifact_path

    trt_config_fname = "trt_config.yaml"
    with open(trt_config_fname, "w") as f:
        yaml.dump(trt_config, f)

    logger.info("Running model_repo_generator...")
    cmd = [
        "model_repo_generator",
        "llm",
        "--verbose",
        f"--yaml_config_file={trt_config_fname}",
    ]
    run_cmd(cmd, shell=False)
    logger.info(f"Generated model repos at {config.nim_model_store_path=}")

if config.update_repo_names:
    logger.info("Updating repo to match wandb.Artifact versions...")
    # NOTE: Triton starts at v1, but we start at v0 so we'll be off-by-1.
    # Not sure if this is the best option...
    if config.download_artifact:
        _, ver = art.name.split("v", 1)
        ver = int(ver) + 1
        ver = str(ver)
    else:
        ver = "1"

    base_path = Path(config.nim_model_store_path)
    for model in ["ensemble", "trt_llm"]:
        path = base_path / f"{artifact_name_cleaned}__{model}"
        if path.exists():
            max_ver = max(
                [int(p.name) for p in path.iterdir() if p.is_dir()], default=1
            )
            new_ver = str(max_ver + 1)
            new_path = path / new_ver

            logger.info(f"Adding new model as {new_ver=}")
            src_dir = path / "1"
            shutil.copytree(src_dir, new_path)

# Optional: Push to S3 (in future release, we can load models from here)
if config.push_to_s3:
    logger.info(f"Pushing models to S3 {config.bucket_name=}")
    s3_client = boto3.client("s3")
    for root, _, files in os.walk(config.nim_model_store_path):
        for f in files:
            full_path = os.path.join(root, f)
            rel_path = os.path.relpath(full_path, config.nim_model_store_path)
            remote_obj_path = os.path.join(config.s3_model_repo_path, rel_path)
            logger.info(f"Uploading {rel_path} to {remote_obj_path}")
            s3_client.upload_file(full_path, config.bucket_name, remote_obj_path)

if config.deploy_option == "local-nim":
    logger.info("Loading NIM with models locally...")
elif config.deploy_option == "s3-nim":
    ...
    # triton_model_repository = f"s3://{config.bucket_name}/{config.s3_model_repo_path}"

num_gpus = trt_config["trt_llm"]["num_gpus"]
model_name = triton_model_name
openai_port = config.openai_port
nemo_port = config.nemo_port

logger.info("Running inference service...")
# The f-string "=" specifier expands each entry to e.g. --model_name='ensemble';
# run_cmd joins the list and runs it through the shell, which strips the quotes.
cmd = [
    "nemollm_inference_ms",
    f"--{model_name=}",
    f"--{num_gpus=}",
    f"--{openai_port=}",
    f"--{nemo_port=}",
    f"--{triton_model_name=}",
    # f"--{triton_model_repository=}",
]
run_cmd(cmd, shell=True)

run.finish()
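
With the defaults above (and the llama template's `num_gpus: 1`), the joined command comes out roughly as below. This is a sketch derived from the f-string `=` expansion, not captured output; the shell removes the repr quotes when it runs:

```bash
nemollm_inference_ms --model_name='ensemble' --num_gpus=1 --openai_port=9999 --nemo_port=9998 --triton_model_name='ensemble'
```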
jobs/deploy_to_nvidia_nemo_inference_microservice/requirements.txt (5 additions, 0 deletions)
@@ -0,0 +1,5 @@
boto3
wandb
tritonclient[http]
pydantic
rich
jobs/deploy_to_nvidia_nemo_inference_microservice/trt_llm_configs/llama_template.yaml (48 additions, 0 deletions)
@@ -0,0 +1,48 @@
model_repo_path: "/model-store/"
use_ensemble: false
model_type: "LLAMA"
backend: "trt_llm"
base_model_id: "ensemble"
prompt_timer: 60
gateway_ip: "gateway-api"
server_port_internal: 9009
customization_cache_capacity: 10000
logging_level: "INFO"
enable_chat: true
preprocessor:
  chat_cfg:
    roles:
      system:
        prefix: "[INST] <<SYS>>\n"
        suffix: "\n<</SYS>>\n\n"
      user:
        prefix: ""
        suffix: " [/INST] "
      assistant:
        prefix: ""
        suffix: " </s><s>[INST] "
    stop_words: ["</s>"]
    rstrip_turn: true
    turn_suffix: "\n"
pipeline:
  model_name: "ensemble"
  num_instances: 1
trt_llm:
  use: true
  ckpt_type: "hf"
  model_name: "trt_llm"
  backend: "python"
  num_gpus: 1
  model_path: /engine_dir
  max_queue_delay_microseconds: 10000
  model_type: "llama"
  max_batch_size: 1
  max_input_len: 256
  max_output_len: 256
  max_beam_width: 1
  tensor_para_size: 1
  pipeline_para_size: 1
  data_type: "float16"
  int8_mode: 0
  enable_custom_all_reduce: 0
  per_column_scaling: false
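
For illustration, a single-turn request rendered with the role prefixes and suffixes above would look roughly like this (the system message is a made-up placeholder; whitespace follows the `\n` escapes in the config):

```
[INST] <<SYS>>
You are a helpful assistant.
<</SYS>>

Tell me a joke [/INST]
```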
jobs/deploy_to_nvidia_nemo_inference_microservice/trt_llm_configs/starcoder_template.yaml (27 additions, 0 deletions)
@@ -0,0 +1,27 @@
model_repo_path: "/model-store/"
use_ensemble: false
model_type: "STARCODER"
backend: "trt_llm"
base_model_id: "ensemble"
prompt_timer: 60
gateway_ip: "gateway-api"
server_port_internal: 9009
customization_cache_capacity: 10000
logging_level: "INFO"
pipeline:
  model_name: "ensemble"
  num_instances: 1
trt_llm:
  use: true
  model_name: "trt_llm"
  model_type: "starcoder"
  use_model_path: "/engine_dir"
  data_type: "float16"
  num_gpus: 1
  tensor_para_size: 1
  pipeline_para_size: 1
  max_batch_size: 128
  max_input_len: 8192
  max_output_len: 8192
  max_num_tokens: 40000
  max_beam_width: 1