# llm/sglang/llama2.yaml

service:
  # Number of replicas to manage.
  replicas: 2
  # Endpoint path that is checked to decide whether the service is ready.
  readiness_probe: /health

envs:
  # Model identifier handed to the server via `--model-path` in `run`.
  MODEL_NAME: meta-llama/Llama-2-7b-chat-hf
  # Hugging Face token used by `setup` to log in to the Hub.
  # TODO: Fill with your own huggingface token, or use --env to pass.
  # Written as an explicit `null` (same parsed value as leaving it empty) so
  # the missing value is visibly intentional rather than a forgotten field.
  HF_TOKEN: null

resources:
  # NOTE(review): there is no space after each `:` below, so a YAML parser
  # reads every entry as one plain key such as `L4:1` with no value, not as
  # `L4` mapped to 1 — presumably the consumer accepts this as a list of
  # alternative accelerator choices; confirm before reformatting.
  accelerators: {L4:1, A10G:1, A10:1, A100:1, A100-80GB:1}
  # Port 8000 is the one the server in `run` listens on (`--port 8000`).
  ports:
    - 8000

setup: |
  # Reuse the `sglang` conda env when it can be activated; otherwise create
  # it with Python 3.10 and activate the fresh env.
  if ! conda activate sglang; then
    conda create -n sglang python=3.10 -y
    conda activate sglang
  fi

  # Install each package only when `pip list` does not already show it.
  pip list | grep sglang || pip install "sglang[all]"
  pip list | grep transformers || pip install transformers==4.37.2

  # Log in to the Hugging Face Hub. The token is handed to Python through the
  # environment instead of being spliced into the `-c` source, so it does not
  # show up in the process arguments and a quote character in the token
  # cannot break the Python string literal.
  HF_TOKEN="${HF_TOKEN}" python -c \
    "import os, huggingface_hub; huggingface_hub.login(os.environ['HF_TOKEN'])"


run: |
  conda activate sglang
  echo 'Starting sglang openai api server...'
  # Append /sbin/ to PATH for the server process started below.
  export PATH="$PATH:/sbin/"
  # Listen on all interfaces on port 8000, the port listed under
  # `resources.ports`. Expansions are quoted so that a value containing
  # spaces or glob characters is passed as a single argument.
  python -m sglang.launch_server --model-path "$MODEL_NAME" --host 0.0.0.0 --port 8000