basetenlabs · dsingal0 · Jul 16, 2024 · Jul 2, 2024 · Jul 2, 2024 · Jul 7, 2024
diff --git a/gemma2/gemma2-27b-it-vllm/README.md b/gemma2/gemma2-27b-it-vllm/README.md
@@ -0,0 +1,55 @@
+# Gemma 2 27B 
+
+This is a [Truss](https://truss.baseten.co/) for Gemma 2 27B Instruct. This README will walk you through how to deploy this Truss on Baseten to get your own instance of Gemma 2 27B Instruct.
+
+## Gemma 2 27B Instruct Implementation
+
+This implementation of Gemma 2 uses [vLLM](https://github.com/vllm-project/vllm).
+
+Since Gemma 2 is a gated model, you will also need to provide your Huggingface access token after making sure you have access to [the model](https://huggingface.co/google/gemma-2-27b-it). Please use the [following guide](https://docs.baseten.co/deploy/guides/secrets) to add your Huggingface access token as a secret.
+
+## Deployment
+
+First, clone this repository:
+
+```sh
+git clone https://github.com/basetenlabs/truss-examples/
+cd truss-examples/gemma2/gemma2-27b-it-vllm
+```
+
+Before deployment:
+
+1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
+2. Install the latest version of Truss: `pip install --upgrade truss`
+
+With `gemma2/gemma2-27b-it-vllm` as your working directory, you can deploy the model with:
+
+```sh
+truss push --trusted
+```
+
+Paste your Baseten API key if prompted.
+
+For more information, see [Truss documentation](https://truss.baseten.co).
+
+## Gemma 2 27B Instruct API documentation
+
+This section provides an overview of the Gemma 2 27B Instruct API, its parameters, and how to use it. The API consists of a single route named `predict`, which you can invoke to generate text based on the provided prompt.
+
+### API route: `predict`
+
+The predict route is the primary method for generating text completions based on a given prompt. It takes several parameters:
+
+- __prompt__: The input text that you want the model to generate a response for.  
+- __max_tokens__: The maximum number of output tokens.  
+
+## Example usage
+
+You can also invoke your model via a REST API:
+
+```
+curl -X POST "https://app.baseten.co/model_versions/YOUR_MODEL_VERSION_ID/predict" \
+     -H "Content-Type: application/json" \
+     -H 'Authorization: Api-Key {YOUR_API_KEY}' \
+     -d '{"prompt": "what came before, the chicken or the egg?", "max_tokens": 64}'
+```
diff --git a/gemma2/gemma2-27b-it-vllm/config.yaml b/gemma2/gemma2-27b-it-vllm/config.yaml
@@ -0,0 +1,21 @@
+model_name: "Gemma 2 27B Instruct VLLM"
+python_version: py311
+model_metadata:
+  example_model_input: {"prompt": "what is the meaning of life"}
+  main_model: google/gemma-2-27b-it
+  tensor_parallel: 1
+  max_num_seqs: 16
+requirements:
+  - vllm>=0.5.1
+  - ray==2.31.0
+  - huggingface-hub==0.23.4
+  - https://github.com/flashinfer-ai/flashinfer/releases/download/v0.0.8/flashinfer-0.0.8+cu121torch2.3-cp311-cp311-linux_x86_64.whl
+resources:
+  accelerator: A100
+  use_gpu: true
+runtime:
+  predict_concurrency: 128
+system_packages:
+  - git
+secrets:
+  hf_access_token: null
diff --git a/gemma2/gemma2-27b-it-vllm/model/__init__.py b/gemma2/gemma2-27b-it-vllm/model/__init__.py
diff --git a/gemma2/gemma2-27b-it-vllm/model/model.py b/gemma2/gemma2-27b-it-vllm/model/model.py
@@ -0,0 +1,84 @@
+import logging
+import subprocess
+import uuid
+from vllm import SamplingParams
+from vllm.engine.arg_utils import AsyncEngineArgs
+from vllm.engine.async_llm_engine import AsyncLLMEngine
+import os
+import huggingface_hub
+
+# Select the FlashInfer attention backend for vLLM. Set at import time so the
+# variable is already in the environment when the engine is built in
+# Model.load() -- presumably vLLM reads it at engine creation; TODO confirm.
+os.environ["VLLM_ATTENTION_BACKEND"] = "FLASHINFER"
+# Module-level logger used by Model for config/engine diagnostics.
+logger = logging.getLogger(__name__)
+
+
+class Model:
+    def __init__(self, **kwargs):
+        self._config = kwargs["config"]
+        self.model = None
+        self.llm_engine = None
+        self.model_args = None
+        self.hf_secret_token = kwargs["secrets"]["hf_access_token"]
+        os.environ["HF_TOKEN"] = self.hf_secret_token
+        print(
+            "logging in with huggingface authentication token: ", self.hf_secret_token
+        )
+        huggingface_hub.login(token=self.hf_secret_token, add_to_git_credential=True)
+        num_gpus = self._config["model_metadata"]["tensor_parallel"]
+        logger.info(f"num GPUs ray: {num_gpus}")
+        command = f"ray start --head --num-gpus={num_gpus}"
+        subprocess.check_output(command, shell=True, text=True)
+
+    def load(self):
+        try:
+            result = subprocess.run(
+                ["nvidia-smi"], capture_output=True, text=True, check=True
+            )
+            print(result.stdout)
+        except subprocess.CalledProcessError as e:
+            print(f"Command failed with code {e.returncode}: {e.stderr}")
+        model_metadata = self._config["model_metadata"]
+        logger.info(f"main model: {model_metadata['main_model']}")
+        logger.info(f"tensor parallelism: {model_metadata['tensor_parallel']}")
+        logger.info(f"max num seqs: {model_metadata['max_num_seqs']}")
+
+        self.model_args = AsyncEngineArgs(
+            model=model_metadata["main_model"],
+            trust_remote_code=True,
+            tensor_parallel_size=model_metadata["tensor_parallel"],
+            max_num_seqs=model_metadata["max_num_seqs"],
+            dtype="auto",
+            use_v2_block_manager=True,
+            enforce_eager=True,
+        )
+        self.llm_engine = AsyncLLMEngine.from_engine_args(self.model_args)
+        try:
+            result = subprocess.run(
+                ["nvidia-smi"], capture_output=True, text=True, check=True
+            )
+            print(result.stdout)
+        except subprocess.CalledProcessError as e:
+            print(f"Command failed with code {e.returncode}: {e.stderr}")
+
+    async def predict(self, model_input):
+        prompt = model_input.pop("prompt")
+        stream = model_input.pop("stream", True)
+
+        sampling_params = SamplingParams(**model_input)
+        idx = str(uuid.uuid4().hex)
+        vllm_generator = self.llm_engine.generate(prompt, sampling_params, idx)
+
+        async def generator():
+            full_text = ""
+            async for output in vllm_generator:
+                text = output.outputs[0].text
+                delta = text[len(full_text) :]
+                full_text = text
+                yield delta
+
+        if stream:
+            return generator()
+        else:
+            full_text = ""
+            async for delta in generator():
+                full_text += delta
+            return {"text": full_text}
diff --git a/gemma2/gemma2-27b-it/README.md b/gemma2/gemma2-27b-it/README.md
@@ -0,0 +1,53 @@
+# Gemma 2 27B 
+
+This is a [Truss](https://truss.baseten.co/) for Gemma 2 27B Instruct. This README will walk you through how to deploy this Truss on Baseten to get your own instance of Gemma 2 27B Instruct.
+
+## Gemma 2 27B Instruct Implementation
+
+This implementation of Gemma 2 uses [local-gemma](https://github.com/huggingface/local-gemma) by Huggingface, which wraps tokenizers, accelerate, and bitsandbytes along with presets based on hardware. It defaults to the "auto" preset, which automatically finds the most performant preset for your hardware, trading off speed and memory. The provided truss config deploys the model on an A100. The "auto" preset is biased towards reduced memory consumption at the cost of tok/sec, so for slightly faster performance please use the "exact" preset.
+
+Since Gemma 2 is a gated model, you will also need to provide your Huggingface access token after making sure you have access to [the model](https://huggingface.co/google/gemma-2-27b-it). Please use the [following guide](https://docs.baseten.co/deploy/guides/secrets) to add your Huggingface access token as a secret.
+
+## Deployment
+
+First, clone this repository:
+
+```sh
+git clone https://github.com/basetenlabs/truss-examples/
+cd truss-examples/gemma2/gemma2-27b-it
+```
+
+Before deployment:
+
+1. Make sure you have a [Baseten account](https://app.baseten.co/signup) and [API key](https://app.baseten.co/settings/account/api_keys).
+2. Install the latest version of Truss: `pip install --upgrade truss`
+
+With `gemma2/gemma2-27b-it` as your working directory, you can deploy the model with:
+
+```sh
+truss push --trusted
+```
+
+Paste your Baseten API key if prompted.
+
+For more information, see [Truss documentation](https://truss.baseten.co).
+
+## Gemma 2 27B Instruct API documentation
+
+This section provides an overview of the Gemma 2 27B Instruct API, its parameters, and how to use it. The API consists of a single route named `predict`, which you can invoke to generate text based on the provided prompt.
+
+### API route: `predict`
+
+The predict route is the primary method for generating text completions based on a given prompt. It takes several parameters:
+
+- __prompt__: The input text that you want the model to generate a response for.
+## Example usage
+
+You can also invoke your model via a REST API:
+
+```
+curl -X POST "https://app.baseten.co/model_versions/YOUR_MODEL_VERSION_ID/predict" \
+     -H "Content-Type: application/json" \
+     -H 'Authorization: Api-Key {YOUR_API_KEY}' \
+     -d '{"prompt": "what came before, the chicken or the egg?"}'
+```
diff --git a/gemma2/gemma2-27b-it/config.yaml b/gemma2/gemma2-27b-it/config.yaml
@@ -0,0 +1,24 @@
+model_name: "Gemma 2 27B Instruct"
+model_metadata:
+    example_model_input: {"prompt": "what is the meaning of life"}
+requirements:
+- accelerate==0.31.0
+- bitsandbytes==0.43.1
+- click==8.1.7
+- einops==0.8.0
+- huggingface-hub==0.23.4
+- jinja2==3.1.4
+- local-gemma==0.1.0
+- peft==0.11.1
+- safetensors==0.4.3
+- sentencepiece==0.2.0
+- tokenizers==0.19.1
+- torch==2.3.1
+- transformers==4.42.3
+- triton==2.3.1
+secrets:
+  hf_access_token: null
+# Note that we need an A100 to run this model.
+resources:
+  use_gpu: true
+  accelerator: A100
diff --git a/gemma2/gemma2-27b-it/model/__init__.py b/gemma2/gemma2-27b-it/model/__init__.py
diff --git a/gemma2/gemma2-27b-it/model/model.py b/gemma2/gemma2-27b-it/model/model.py
@@ -0,0 +1,115 @@
+from threading import Thread
+from typing import Dict
+import subprocess
+
+import torch
+from local_gemma import LocalGemma2ForCausalLM
+from transformers import (
+    AutoTokenizer,
+    TextIteratorStreamer,
+)
+
+# Hugging Face Hub repo id used for both the tokenizer and the model weights.
+CHECKPOINT = "google/gemma-2-27b-it"
+
+
+class Model:
+    def __init__(self, **kwargs) -> None:
+        self.tokenizer = None
+        self.model = None
+        if "secrets" in kwargs:
+            self._secrets = kwargs["secrets"]
+        else:
+            raise ValueError("Missing secrets")
+
+    def load(self):
+        # print GPU memory
+        total_memory = torch.cuda.get_device_properties("cuda:0").total_memory
+        print(total_memory)
+        # use subprocess to run nvidia-smi
+        try:
+            result = subprocess.run(
+                ["nvidia-smi"], capture_output=True, text=True, check=True
+            )
+            print(result.stdout)
+        except subprocess.CalledProcessError as e:
+            print(f"Command failed with code {e.returncode}: {e.stderr}")
+
+        # make sure token exists
+        if not self._secrets["hf_access_token"]:
+            raise ValueError("Missing hf_access_token")
+        # huggingface auth is done in the local-gemma script, so we just need to call it
+        command = [
+            "local-gemma",
+            "--model",
+            "27b",
+            "--token",
+            self._secrets["hf_access_token"],
+            "What is the capital of France?",
+        ]
+
+        try:
+            result = subprocess.run(command, capture_output=True, text=True, check=True)
+            print(result.stdout)
+        except subprocess.CalledProcessError as e:
+            print(f"Command failed with code {e.returncode}: {e.stderr}")
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            CHECKPOINT, token=self._secrets["hf_access_token"]
+        )
+        print("LOADING MODEL")
+        self.model = LocalGemma2ForCausalLM.from_pretrained(
+            CHECKPOINT, preset="auto", token=self._secrets["hf_access_token"]
+        )
+        print("MODEL LOADED")
+        try:
+            result = subprocess.run(
+                ["nvidia-smi"], capture_output=True, text=True, check=True
+            )
+            print(result.stdout)
+        except subprocess.CalledProcessError as e:
+            print(f"Command failed with code {e.returncode}: {e.stderr}")
+
+    def predict(self, request: Dict) -> Dict:
+        prompt = request.pop("prompt")
+        stream = request.pop("stream", True)
+        if stream == "False":
+            stream = False
+        # Instantiate the Streamer object, which we'll later use for
+        # returning the output to users.
+        streamer = TextIteratorStreamer(self.tokenizer)
+        messages = [{"role": "user", "content": prompt}]
+        encodeds = self.tokenizer.apply_chat_template(messages, return_tensors="pt")
+        model_inputs = encodeds.to(self.model.device)
+        # When creating the generation parameters, ensure to pass the `streamer` object
+        # that we created previously.
+        with torch.no_grad():
+            if stream:
+                generation_kwargs = {
+                    "input_ids": model_inputs,
+                    "do_sample": True,
+                    "pad_token_id": self.tokenizer.eos_token_id,
+                    "max_new_tokens": 1000,
+                    "streamer": streamer,
+                }
+                # Spawn a thread to run the generation, so that it does not block the main
+                # thread.
+                thread = Thread(target=self.model.generate, kwargs=generation_kwargs)
+                thread.start()
+
+                def inner():
+                    for text in streamer:
+                        yield text
+                    thread.join()
+
+                return inner()
+            else:
+                generation_kwargs = {
+                    "input_ids": model_inputs,
+                    "do_sample": True,
+                    "pad_token_id": self.tokenizer.eos_token_id,
+                    "max_new_tokens": 1000,
+                }
+                outputs = self.model.generate(**generation_kwargs)
+                output_text = self.tokenizer.decode(
+                    outputs[0], skip_special_tokens=True
+                )
+                return {"output": output_text}