Commit fe0629c
nvidia authored and committed on Mar 15, 2024
1 parent: 1188725
Showing 8 changed files with 359 additions and 0 deletions.
jobs/deploy_to_nvidia_nemo_inference_microservice/Dockerfile.wandb (22 additions, 0 deletions)
@@ -0,0 +1,22 @@
# syntax=docker/dockerfile:1.4

# NOTE: You will need to have access to the private repo
FROM nvcr.io/ohlfw0olaadg/ea-participants/nemollm-inference-ms:24.01

WORKDIR /launch
COPY --link requirements.txt ./

USER root
RUN apt update
RUN apt install -y python3-pip python3-setuptools
RUN python3 -m pip install --upgrade pip setuptools wheel
RUN pip3 install -r requirements.txt

# Where the llama configs live
COPY --link trt_llm_configs/ trt_llm_configs/

# Example configs for different deployments
COPY --link job.py configs/ ./

USER nemo
ENTRYPOINT ["python3", "job.py"]
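
A minimal sketch of building and publishing this image so a Launch agent can pull it. The registry path is taken from the `wandb launch -d` example in the README below; the local tag is an assumption:

```bash
# Build from the job directory, where Dockerfile.wandb, job.py, and configs/ live.
docker build -f Dockerfile.wandb -t deploy-to-nemo:latest .

# Hypothetical: tag and push to the registry referenced in the README.
docker tag deploy-to-nemo:latest gcr.io/playground-111/deploy-to-nemo:latest
docker push gcr.io/playground-111/deploy-to-nemo:latest
```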
jobs/deploy_to_nvidia_nemo_inference_microservice/README.md (46 additions, 0 deletions)
@@ -0,0 +1,46 @@
# NVIDIA NeMo Inference Microservice Deploy Job

Deploy a model from W&B Artifacts to the NVIDIA NeMo Inference Microservice.

This job accepts a compatible model artifact from W&B and deploys it to a running NIM/Triton server. It converts supported models to the `.nemo` format.

Deployment time varies by model and machine type. The base Llama2-7b config takes about 1 minute on GCP's `a2-ultragpu-1g`.

## Compatible model types

1. Llama2
2. StarCoder
3. NV-GPT (coming soon)

## User Quickstart

1. Create a queue if you don't have one already, and launch an agent:
   ```bash
   wandb launch-agent -e $ENTITY -p $PROJECT -q $QUEUE
   ```
2. Submit the deployment job with your desired configs from the [Launch UI](https://wandb.ai/launch). See `configs/` for examples.
   1. You can also submit via the CLI (a sketch of a JSON config follows this list):
      ```bash
      wandb launch -d gcr.io/playground-111/deploy-to-nemo:latest \
        -e $ENTITY \
        -p $PROJECT \
        -q $QUEUE \
        -c $CONFIG_JSON_FNAME
      ```
3. You can track the deployment process in the Launch UI. Once complete, you can immediately curl the endpoint to test the model. The model name is always `ensemble`.
   ```bash
   #!/bin/bash
   curl -X POST "http://0.0.0.0:9999/v1/completions" \
     -H "accept: application/json" \
     -H "Content-Type: application/json" \
     -d '{
       "model": "ensemble",
       "prompt": "Tell me a joke",
       "max_tokens": 256,
       "temperature": 0.5,
       "n": 1,
       "stream": false,
       "stop": "string",
       "frequency_penalty": 0.0
     }'
   ```
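
For reference, a sketch of a JSON config to pass via `-c`, mirroring `configs/llama2_7b_chat.yaml` below. The exact nesting `wandb launch -c` expects is an assumption here; adjust to match your Launch setup.

```bash
# Hypothetical JSON translation of configs/llama2_7b_chat.yaml.
cat > config.json <<'EOF'
{
  "run_name": "Deploy Llama 2 7b Chat Model",
  "config": {
    "artifact": "wandb-artifact://megatruong/public-models/llama:v0",
    "artifact_model_type": "llama",
    "bucket_name": "andrew-nemo-bucket"
  }
}
EOF
```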
jobs/deploy_to_nvidia_nemo_inference_microservice/configs/llama2_7b_chat.yaml (5 additions, 0 deletions)
@@ -0,0 +1,5 @@
run_name: Deploy Llama 2 7b Chat Model
config:
  artifact: "wandb-artifact://megatruong/public-models/llama:v0"
  artifact_model_type: "llama"
  bucket_name: "andrew-nemo-bucket"
jobs/deploy_to_nvidia_nemo_inference_microservice/configs/starcoder.yaml (5 additions, 0 deletions)
@@ -0,0 +1,5 @@
run_name: Deploy StarCoder Model
config:
  artifact: "wandb-artifact://megatruong/public-models/starcoder:v0"
  artifact_model_type: "starcoder"
  bucket_name: "andrew-nemo-bucket"
jobs/deploy_to_nvidia_nemo_inference_microservice/job.py (201 additions, 0 deletions)
@@ -0,0 +1,201 @@
import logging
import os
import shutil
import subprocess
import sys
from pathlib import Path
from typing import Literal, Optional

import boto3
import wandb
import yaml
from pydantic import BaseModel
from rich.logging import RichHandler

logging.basicConfig(
    level="INFO",
    format="%(message)s",
    datefmt="[%X]",
    handlers=[
        RichHandler(
            rich_tracebacks=True,
            tracebacks_show_locals=True,
        )
    ],
)
logger = logging.getLogger(__name__)


model_config_mapping = {
    "llama": "llama_template.yaml",
    "starcoder": "starcoder_template.yaml",
}

def run_cmd(cmd: list[str], error_msg: Optional[str] = None, shell: bool = False):
    command = " ".join(cmd) if shell else cmd
    logger.debug(f"Running {command=}")
    with subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,
        universal_newlines=True,
        shell=shell,
    ) as proc:
        # Stream the child's output into our logger as it arrives.
        for line in proc.stdout:
            logger.info(line.strip())

        # wait() rather than poll(): the process may not have exited yet when
        # stdout hits EOF, and poll() would return None in that case.
        if proc.wait() != 0:
            logger.error(error_msg)
            sys.exit(1)

class Config(BaseModel):
    artifact: str
    artifact_model_type: Literal["llama", "starcoder"]

    bucket_name: str
    nim_model_store_path: str = "/model-store/"
    s3_model_repo_path: str = "models"

    openai_port: int = 9999
    nemo_port: int = 9998

    deploy_option: Literal[
        "local-nim",
        # "remote-nim",  # in a future release, NIM will have an option to point to an external model store
    ] = "local-nim"

    download_artifact: bool = True
    generate_model: bool = True
    update_repo_names: Literal[
        False
    ] = False  # TODO: Add this option back when Nvidia officially supports alt repos
    push_to_s3: bool = False

    log_level: Literal["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"] = "INFO"

logger.info("Starting deploy to Nvidia Nemo Inference Microservice...") | ||
run = wandb.init() | ||
config = Config(**run.config) | ||
|
||
logger.setLevel(config.log_level) | ||
logger.debug(f"{config=}") | ||
|
||
artifact_name_cleaned = config.artifact.replace("/", "__").replace(":", "__") | ||
triton_model_name = "ensemble" | ||
triton_trt_model_name = "trt_llm" | ||
|
||
base_trt_config_fname = model_config_mapping.get(config.artifact_model_type) | ||
if base_trt_config_fname is None: | ||
logger.error(f"Unsupported model type {config.artifact_model_type=}, exiting.") | ||
sys.exit(1) | ||
|
||
base_trt_config_path = f"./trt_llm_configs/{base_trt_config_fname}" | ||
with open(base_trt_config_path) as f: | ||
trt_config = yaml.safe_load(f) | ||
|
||
|
||
if config.download_artifact: | ||
logger.info("Downloading model artifact...") | ||
try: | ||
art = run.use_artifact(config.artifact) | ||
artifact_path = art.download() | ||
except Exception as e: | ||
logger.error(f"Error downloading artifact, exiting. {e=}") | ||
sys.exit(1) | ||
|
||
|
||
if config.update_repo_names: | ||
triton_model_name = f"{artifact_name_cleaned}__ensemble" | ||
triton_trt_model_name = f"{artifact_name_cleaned}__trt_llm" | ||
|
||
trt_config["base_model_id"] = triton_model_name | ||
trt_config["trt_llm"]["model_name"] = triton_trt_model_name | ||
trt_config["pipeline"]["model_name"] = triton_model_name | ||
|
||
|
||
if config.generate_model:
    logger.info("Generating TRT-LLM config from template...")
    # NOTE: assumes download_artifact=True above, so artifact_path is defined.
    trt_config["trt_llm"]["model_path"] = artifact_path

    trt_config_fname = "trt_config.yaml"
    with open(trt_config_fname, "w") as f:
        yaml.dump(trt_config, f)

    logger.info("Running model_repo_generator...")
    cmd = [
        "model_repo_generator",
        "llm",
        "--verbose",
        f"--yaml_config_file={trt_config_fname}",
    ]
    run_cmd(cmd, shell=False)
    logger.info(f"Generated model repos at {config.nim_model_store_path=}")

if config.update_repo_names:
    logger.info("Updating repo to match wandb.Artifact versions...")
    # NOTE: Triton starts at v1, but we start at v0 so we'll be off-by-1.
    # Not sure if this is the best option...
    if config.download_artifact:
        _, ver = art.name.split("v", 1)
        ver = int(ver) + 1
        ver = str(ver)
    else:
        ver = "1"

    base_path = Path(config.nim_model_store_path)
    for model in ["ensemble", "trt_llm"]:
        path = base_path / f"{artifact_name_cleaned}__{model}"
        if path.exists():
            max_ver = max(
                [int(p.name) for p in path.iterdir() if p.is_dir()], default=1
            )
            new_ver = str(max_ver + 1)
            new_path = path / new_ver

            logger.info(f"Adding new model as {new_ver=}")
            src_dir = path / "1"
            shutil.copytree(src_dir, new_path)

# Optional: Push to S3 (in future release, we can load models from here)
if config.push_to_s3:
    logger.info(f"Pushing models to S3 {config.bucket_name=}")
    s3_client = boto3.client("s3")
    for root, _, files in os.walk(config.nim_model_store_path):
        for f in files:
            full_path = os.path.join(root, f)
            rel_path = os.path.relpath(full_path, config.nim_model_store_path)
            remote_obj_path = os.path.join(config.s3_model_repo_path, rel_path)
            logger.info(f"Uploading {rel_path} to {remote_obj_path}")
            s3_client.upload_file(full_path, config.bucket_name, remote_obj_path)

if config.deploy_option == "local-nim":
    logger.info("Loading NIM with models locally...")
elif config.deploy_option == "s3-nim":
    ...
    # triton_model_repository = f"s3://{config.bucket_name}/{config.s3_model_repo_path}"

num_gpus = trt_config["trt_llm"]["num_gpus"]
model_name = triton_model_name
openai_port = config.openai_port
nemo_port = config.nemo_port

logger.info("Running inference service...")
# The f-string "=" specifier expands each entry to e.g. --model_name='ensemble';
# run_cmd joins the list and runs it through the shell, which strips the quotes.
cmd = [
    "nemollm_inference_ms",
    f"--{model_name=}",
    f"--{num_gpus=}",
    f"--{openai_port=}",
    f"--{nemo_port=}",
    f"--{triton_model_name=}",
    # f"--{triton_model_repository=}",
]
run_cmd(cmd, shell=True)

run.finish()
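
With the defaults above (and the llama template's `num_gpus: 1`), the joined command comes out roughly as below. This is a sketch derived from the f-string `=` expansion, not captured output; the shell removes the repr quotes when it runs:

```bash
nemollm_inference_ms --model_name='ensemble' --num_gpus=1 --openai_port=9999 --nemo_port=9998 --triton_model_name='ensemble'
```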
jobs/deploy_to_nvidia_nemo_inference_microservice/requirements.txt (5 additions, 0 deletions)
@@ -0,0 +1,5 @@
boto3
wandb
tritonclient[http]
pydantic
rich
jobs/deploy_to_nvidia_nemo_inference_microservice/trt_llm_configs/llama_template.yaml (48 additions, 0 deletions)
@@ -0,0 +1,48 @@
model_repo_path: "/model-store/"
use_ensemble: false
model_type: "LLAMA"
backend: "trt_llm"
base_model_id: "ensemble"
prompt_timer: 60
gateway_ip: "gateway-api"
server_port_internal: 9009
customization_cache_capacity: 10000
logging_level: "INFO"
enable_chat: true
preprocessor:
  chat_cfg:
    roles:
      system:
        prefix: "[INST] <<SYS>>\n"
        suffix: "\n<</SYS>>\n\n"
      user:
        prefix: ""
        suffix: " [/INST] "
      assistant:
        prefix: ""
        suffix: " </s><s>[INST] "
    stop_words: ["</s>"]
    rstrip_turn: true
    turn_suffix: "\n"
pipeline:
  model_name: "ensemble"
  num_instances: 1
trt_llm:
  use: true
  ckpt_type: "hf"
  model_name: "trt_llm"
  backend: "python"
  num_gpus: 1
  model_path: /engine_dir
  max_queue_delay_microseconds: 10000
  model_type: "llama"
  max_batch_size: 1
  max_input_len: 256
  max_output_len: 256
  max_beam_width: 1
  tensor_para_size: 1
  pipeline_para_size: 1
  data_type: "float16"
  int8_mode: 0
  enable_custom_all_reduce: 0
  per_column_scaling: false
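
For illustration, a single-turn request rendered with the role prefixes and suffixes above would look roughly like this (the system message is a made-up placeholder; whitespace follows the `\n` escapes in the config):

```
[INST] <<SYS>>
You are a helpful assistant.
<</SYS>>

Tell me a joke [/INST]
```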
jobs/deploy_to_nvidia_nemo_inference_microservice/trt_llm_configs/starcoder_template.yaml (27 additions, 0 deletions)
@@ -0,0 +1,27 @@
model_repo_path: "/model-store/"
use_ensemble: false
model_type: "STARCODER"
backend: "trt_llm"
base_model_id: "ensemble"
prompt_timer: 60
gateway_ip: "gateway-api"
server_port_internal: 9009
customization_cache_capacity: 10000
logging_level: "INFO"
pipeline:
  model_name: "ensemble"
  num_instances: 1
trt_llm:
  use: true
  model_name: "trt_llm"
  model_type: "starcoder"
  use_model_path: "/engine_dir"
  data_type: "float16"
  num_gpus: 1
  tensor_para_size: 1
  pipeline_para_size: 1
  max_batch_size: 128
  max_input_len: 8192
  max_output_len: 8192
  max_num_tokens: 40000
  max_beam_width: 1