forked from skypilot-org/skypilot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
llama2.yaml
34 lines (26 loc) · 905 Bytes
/
llama2.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
service:
  # Endpoint path that is polled to determine whether the service is ready.
  readiness_probe: /health
  # Number of replicas to manage.
  replicas: 2
envs:
  # Model identifier; passed to the server's --model-path in the run section.
  MODEL_NAME: meta-llama/Llama-2-7b-chat-hf
  # TODO: Fill with your own huggingface token, or use --env to pass.
  # Left as an explicit null until one is supplied; the setup section uses it
  # to log in to huggingface_hub.
  HF_TOKEN: null
resources:
  # Accelerator candidates, each written as <type>:<count>.
  # Presumably any one of these satisfies the request — confirm against the
  # consumer's resources spec.
  # NOTE(review): there is no space after the colons, so a YAML parser reads
  # each entry (e.g. `L4:1`) as a single scalar key with a null value, not as
  # `L4` mapped to 1. Kept as-is on the assumption that the consumer expects
  # this form — confirm before reformatting.
  accelerators: {L4:1, A10G:1, A10:1, A100:1, A100-80GB:1}
  # Port to expose; matches the --port the server is launched with in `run`.
  ports:
    - 8000
setup: |
  # Reuse the sglang conda environment when it already exists; otherwise
  # create it (Python 3.10) and activate it.
  conda activate sglang
  if [ $? -ne 0 ]; then
    conda create -n sglang python=3.10 -y
    conda activate sglang
  fi
  # Install each package only when pip does not already list it.
  pip list | grep sglang || pip install "sglang[all]"
  pip list | grep transformers || pip install transformers==4.37.2
  # Log in to Hugging Face. The token is read from the environment inside
  # Python instead of being interpolated into the Python source by the shell,
  # so a token containing quotes or backslashes cannot break or alter the
  # code, and the secret does not appear in the process argument list.
  python -c "import os, huggingface_hub; huggingface_hub.login(os.environ['HF_TOKEN'])"
run: |
  # Launch the sglang server for $MODEL_NAME on all interfaces, port 8000.
  conda activate sglang
  echo 'Starting sglang openai api server...'
  # Append /sbin/ to PATH; presumably a tool the server needs lives there —
  # TODO confirm which one.
  export PATH="$PATH:/sbin/"
  # Expansions are quoted so the values are never word-split or glob-expanded.
  python -m sglang.launch_server --model-path "$MODEL_NAME" --host 0.0.0.0 --port 8000