diff --git a/Llama-3-Groq-8B-Tool-Use/README.md b/Llama-3-Groq-8B-Tool-Use/README.md
new file mode 100644
index 00000000..aba25571
--- /dev/null
+++ b/Llama-3-Groq-8B-Tool-Use/README.md
@@ -0,0 +1,9 @@
+# Groq/Llama-3-Groq-8B-Tool-Use
+
+Meta developed and publicly released the Llama 3 family of large language models (LLMs), a collection of pretrained and fine-tuned generative text models ranging in scale from 8 billion to 70 billion parameters. Llama 3 is an auto-regressive language model that uses an optimized transformer architecture.
+
+In this deployment, the [Groq/Llama-3-Groq-8B-Tool-Use](https://huggingface.co/Groq/Llama-3-Groq-8B-Tool-Use) model is used, which generates a continuation of the incoming text. Note that this model is gated: you must be granted access by Meta, which you can request at https://huggingface.co/Groq/Llama-3-Groq-8B-Tool-Use.
+
+## Deploying
+
+Use [this SDL](deploy.yaml) to deploy the application on Akash. You will need to enter your Hugging Face access token in the "HF_TOKEN=" environment variable, and you can adjust the parameters passed to the "vllm serve" command according to your hardware cluster configuration (refer to the vLLM documentation for the various parameters). Lastly, you can add additional debug flags through environment variables (consult the vLLM and PyTorch documentation for these as well).
\ No newline at end of file
diff --git a/Llama-3-Groq-8B-Tool-Use/deploy.yaml b/Llama-3-Groq-8B-Tool-Use/deploy.yaml
new file mode 100644
index 00000000..1069e8b8
--- /dev/null
+++ b/Llama-3-Groq-8B-Tool-Use/deploy.yaml
@@ -0,0 +1,65 @@
+---
+version: "2.0"
+services:
+  vllm:
+    image: vllm/vllm-openai:v0.6.2@sha256:55a88146a4da0b6e193431b5b1d3492dfd7bebdc16919df4d031273e85a6157c
+    expose:
+      - port: 8000
+        as: 8000
+        to:
+          - global: true
+    command:
+      - bash
+      - "-c"
+    args: # see https://docs.vllm.ai/en/latest/models/engine_args.html for all available arguments
+      - >-
+        vllm serve Groq/Llama-3-Groq-8B-Tool-Use --gpu-memory-utilization 0.99
+    env:
+      - "HF_TOKEN=" # Hugging Face API token, required to download gated or private models
+      #- NCCL_DEBUG=INFO # Uncomment to enable NCCL debugging
+    params:
+      storage:
+        data:
+          mount: /root/.cache # Mount the data storage at the cache directory for persistent storage of model files
+          readOnly: false
+        shm:
+          mount: /dev/shm
+profiles:
+  compute:
+    vllm:
+      resources:
+        cpu:
+          units: 6
+        memory:
+          size: 16Gi
+        storage:
+          - size: 10Gi
+          - name: data
+            size: 50Gi
+            attributes:
+              persistent: true
+              class: beta3
+          - name: shm
+            size: 10Gi
+            attributes:
+              class: ram
+              persistent: false
+        gpu:
+          units: 1
+          attributes:
+            vendor:
+              nvidia:
+                # - model: a100
+                # - model: h100
+                # - model: rtx4090
+  placement:
+    dcloud:
+      pricing:
+        vllm:
+          denom: uakt
+          amount: 10000
+deployment:
+  vllm:
+    dcloud:
+      profile: vllm
+      count: 1
diff --git a/README.md b/README.md
index 93793c4d..67bca26f 100644
--- a/README.md
+++ b/README.md
@@ -95,6 +95,7 @@ Also, follow [@akashnet\_](https://twitter.com/akashnet_) to stay in the loop wi
 - [InvokeAI](invoke-ai-gpu)
 - [Llama-2-70B](Llama-2-70B)
 - [Llama-3-8B](Llama-3-8B)
+- [Llama-3-Groq-8B-Tool-Use](Llama-3-Groq-8B-Tool-Use)
 - [Llama-3-70B](Llama-3-70B)
 - [Llama-3.1-8B](Llama-3.1-8B)
 - [Llama-3.1-405B-AWQ-INT4](Llama-3.1-405B-AWQ-INT4)
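
A note on tuning the SDL above: the `args` block is where vLLM engine arguments go. As a minimal sketch, assuming a GPU with less headroom than the default `--gpu-memory-utilization 0.99` expects, you might lower the utilization and cap the context window with the standard `--max-model-len` engine argument (the values below are illustrative assumptions, not recommendations):

```yaml
args: # both values are assumptions -- adjust for your GPU
  - >-
    vllm serve Groq/Llama-3-Groq-8B-Tool-Use
    --gpu-memory-utilization 0.90
    --max-model-len 8192
```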
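
Once the lease is active, the container serves vLLM's OpenAI-compatible API on port 8000. A minimal smoke test, assuming `<your-deployment-uri>` stands in for whatever hostname Akash assigns to your deployment:

```sh
# <your-deployment-uri> is a placeholder -- use the URI shown for your Akash lease
curl http://<your-deployment-uri>:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "Groq/Llama-3-Groq-8B-Tool-Use",
    "messages": [{"role": "user", "content": "What tools would you need to check the weather?"}]
  }'
```

Since this checkpoint is fine-tuned for tool use, the same endpoint also accepts an OpenAI-style `tools` array; depending on the vLLM version, parsing the model's output into structured `tool_calls` responses may require enabling additional tool-calling options in `vllm serve` (see the vLLM documentation).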