From 5766c014d0e700f0ecbcd4c7c3602c0beac98964 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?L=C3=A9lio?=
Date: Wed, 3 Jan 2024 11:28:24 +0100
Subject: [PATCH] Add documentation

---
 .gitignore                           |   3 +
 docs/README.md                       |   1 +
 docs/intro.md                        |  60 ++++++++++++
 docs/models.md                       |  62 ++++++++++++
 docs/platform/01-overview.md         |   6 ++
 docs/platform/02-client.md           | 140 +++++++++++++++++++++++++++
 docs/platform/03-endpoints.md        |  57 +++++++++++
 docs/platform/04-guardrailing.md     |  56 +++++++++++
 docs/platform/05-pricing.md          |  25 +++++
 docs/platform/_category_.json        |   9 ++
 docs/self-deployment/01-overview.md  |  11 +++
 docs/self-deployment/02-trtllm.md    |  14 +++
 docs/self-deployment/03-vllm.md      |  92 ++++++++++++++++++
 docs/self-deployment/04-skypilot.md  |  86 ++++++++++++++++
 docs/self-deployment/_category_.json |   8 ++
 docusaurus.config.js                 |   1 +
 16 files changed, 631 insertions(+)
 create mode 100644 docs/README.md
 create mode 100644 docs/intro.md
 create mode 100644 docs/models.md
 create mode 100644 docs/platform/01-overview.md
 create mode 100644 docs/platform/02-client.md
 create mode 100644 docs/platform/03-endpoints.md
 create mode 100644 docs/platform/04-guardrailing.md
 create mode 100644 docs/platform/05-pricing.md
 create mode 100644 docs/platform/_category_.json
 create mode 100644 docs/self-deployment/01-overview.md
 create mode 100644 docs/self-deployment/02-trtllm.md
 create mode 100644 docs/self-deployment/03-vllm.md
 create mode 100644 docs/self-deployment/04-skypilot.md
 create mode 100644 docs/self-deployment/_category_.json

diff --git a/.gitignore b/.gitignore
index c6bba59..ebfa0c9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -128,3 +128,6 @@ dist
 .yarn/build-state.yml
 .yarn/install-state.gz
 .pnp.*
+
+# docusaurus build
+build/
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000..a07068e
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1 @@
+# mistral-docs
diff --git a/docs/intro.md b/docs/intro.md
new file mode 100644
index 0000000..f16678d
--- /dev/null
+++ b/docs/intro.md
@@ -0,0 +1,60 @@
+---
+sidebar_position: 1
+slug: /
+---
+
+# Introduction
+
+Mistral AI currently provides two types of access to Large Language Models:
+- An API providing pay-as-you-go access to our latest models,
+- Open-source models released under the [Apache 2.0](https://github.com/apache/.github/blob/main/LICENSE) License, available on [Hugging Face](https://huggingface.co/mistralai) or directly from [the documentation](/models).
+
+## Where to start?
+
+### API Access
+Our API is currently in beta while we ramp up the load and ensure a good quality of service. Access the [platform](https://console.mistral.ai/) to join the waitlist. Once your subscription is active, you can immediately use our `chat` endpoint:
+
+```bash
+curl --location "https://api.mistral.ai/v1/chat/completions" \
+     --header 'Content-Type: application/json' \
+     --header 'Accept: application/json' \
+     --header "Authorization: Bearer $MISTRAL_API_KEY" \
+     --data '{
+    "model": "mistral-tiny",
+    "messages": [{"role": "user", "content": "Who is the most renowned French painter?"}]
+  }'
+```
+
+Or our embeddings endpoint:
+
+```bash
+curl --location "https://api.mistral.ai/v1/embeddings" \
+     --header 'Content-Type: application/json' \
+     --header 'Accept: application/json' \
+     --header "Authorization: Bearer $MISTRAL_API_KEY" \
+     --data '{
+    "model": "mistral-embed",
+    "input": ["Embed this sentence.", "As well as this one."]
+  }'
+```
+
+For a full description of the models offered on the API, head on to the **[model docs](./models)**.
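+
+If you prefer to try the endpoint from Python before installing our client, the call above translates directly to an HTTP request. A minimal sketch using the third-party `requests` package (the model name and payload mirror the curl example above; the response parsing assumes the same response shape as in our client examples):
+
+```python
+import os
+
+import requests  # third-party HTTP library: `pip install requests`
+
+response = requests.post(
+    "https://api.mistral.ai/v1/chat/completions",
+    headers={"Authorization": f"Bearer {os.environ['MISTRAL_API_KEY']}"},
+    json={
+        "model": "mistral-tiny",
+        "messages": [{"role": "user", "content": "Who is the most renowned French painter?"}],
+    },
+)
+response.raise_for_status()
+# The generated answer lives in the first choice of the response.
+print(response.json()["choices"][0]["message"]["content"])
+```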
+
+For more examples on how to use our platform, head on to our **[platform docs](./platform/01-overview.md)**.
+
+### Raw model weights
+
+Raw model weights can be used in several ways:
+- For self-deployment, in the cloud or on-premises, using either [TensorRT-LLM](./self-deployment/trtllm) or [vLLM](./self-deployment/vllm), head on to **[Deployment](./self-deployment/skypilot)**,
+- For research, head on to our [reference implementation repository](https://github.com/mistralai/mistral-src),
+- For local deployment on consumer-grade hardware, check out the [llama.cpp](https://github.com/ggerganov/llama.cpp) project or [Ollama](https://ollama.ai/).
+
+
+## Get Help
+
+Join our [Discord community](https://discord.gg/mistralai) to discuss our models and talk to our engineers. Alternatively, reach out to our [business team](https://mistral.ai/contact/) if you have enterprise needs, want more information about our products, or would like us to add missing features.
+
+
+## Contributing
+
+Mistral AI is committed to open-source software development and welcomes external contributions. Please open a PR!
diff --git a/docs/models.md b/docs/models.md
new file mode 100644
index 0000000..63229c4
--- /dev/null
+++ b/docs/models.md
@@ -0,0 +1,62 @@
+---
+sidebar_position: 3
+slug: models
+---
+
+# Open-weight models
+
+We open-source both pre-trained models and fine-tuned models. These models are not tuned for safety, as we want to empower users to test and refine moderation for their own use cases. For safer models, follow our [guardrailing tutorial](./platform/04-guardrailing.md).
+
+## Mistral 7B
+
+Mistral 7B is the first dense model released by Mistral AI. At the time of its release, it matched the capabilities of models up to 30B parameters. Learn more on our [blog post](https://mistral.ai/news/announcing-mistral-7b/).
+
+## Mixtral 8X7B
+
+Mixtral 8X7B is a sparse mixture-of-experts model. As such, it leverages up to 45B parameters but only uses about 12B during inference, leading to better inference throughput at the cost of more vRAM. Learn more on the dedicated [blog post](https://mistral.ai/news/mixtral-of-experts/).
+
+## Downloading
+
+- Mistral-7B-v0.1: [Hugging Face](https://huggingface.co/mistralai/Mistral-7B-v0.1) // [raw_weights](https://files.mistral-7b-v0-1.mistral.ai/mistral-7B-v0.1.tar) (md5sum: `37dab53973db2d56b2da0a033a15307f`).
+- Mistral-7B-Instruct-v0.2: [Hugging Face](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) // [raw_weights](https://files.mistral-7b-v0-2.mistral.ai/Mistral-7B-v0.2-Instruct.tar) (md5sum: `fbae55bc038f12f010b4251326e73d39`).
+- Mixtral-8x7B-v0.1: [Hugging Face](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1).
+- Mixtral-8x7B-Instruct-v0.1: [Hugging Face](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) // [raw_weights](https://files.mixtral-8x7b-v0-1.mistral.ai/Mixtral-8x7B-v0.1-Instruct.tar) (md5sum: `8e2d3930145dc43d3084396f49d38a3f`).
+
+## Sizes
+
+| Name               | Number of parameters | Number of active parameters | Min. GPU RAM for inference (GB) |
+|--------------------|:--------------------:|:---------------------------:|:-------------------------------:|
+| Mistral-7B-v0.2    | 7.3B                 | 7.3B                        | 16                              |
+| Mixtral-8X7B-v0.1  | 46.7B                | 12.9B                       | 100                             |
+
+## Chat template
+
+The template used to build a prompt for the Instruct model is defined as follows:
+```
+<s>[INST] Instruction [/INST] Model answer</s>[INST] Follow-up instruction [/INST]
+```
+
+Note that `<s>` and `</s>` are special tokens for beginning of string (BOS) and end of string (EOS), while `[INST]` and `[/INST]` are regular strings.
+
+:::note
+
+This format must be strictly respected. Otherwise, the model will generate sub-optimal outputs.
+
+:::
+
+As a reference, here is the format used to tokenize instructions during fine-tuning:
+
+```
+[START_SYMBOL_ID] +
+tok("[INST]") + tok(USER_MESSAGE_1) + tok("[/INST]") +
+tok(BOT_MESSAGE_1) + [END_SYMBOL_ID] +
+…
+tok("[INST]") + tok(USER_MESSAGE_N) + tok("[/INST]") +
+tok(BOT_MESSAGE_N) + [END_SYMBOL_ID]
+```
+
+:::note
+
+The function `tok` should never generate the EOS token. However, FastChat (used in vLLM) sends the full prompt as a string, which might lead to incorrect tokenization of the EOS token and prompt injection. Users are encouraged to send tokens instead, as described above.
+
+:::
\ No newline at end of file
diff --git a/docs/platform/01-overview.md b/docs/platform/01-overview.md
new file mode 100644
index 0000000..3fe06c5
--- /dev/null
+++ b/docs/platform/01-overview.md
@@ -0,0 +1,6 @@
+# Platform
+
+We provide chat generation endpoints for both our [open-weight models](../models.md) and our optimized models.
+Our endpoints can be used with our [client packages](../client) or accessed directly through our [API](../../api).
+See our [endpoints page](../endpoints) for a detailed description of endpoint performance. We also explain how to moderate
+our endpoints in [guardrailing](../guardrailing) and list their [prices](../pricing).
\ No newline at end of file
diff --git a/docs/platform/02-client.md b/docs/platform/02-client.md
new file mode 100644
index 0000000..50828b8
--- /dev/null
+++ b/docs/platform/02-client.md
@@ -0,0 +1,140 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# Client code
+
+We provide client libraries in both Python and Javascript.
+
+## Installation
+
+Follow the installation instructions in the repository for our [Python Client](https://github.com/mistralai/client-python) or [Javascript Client](https://github.com/mistralai/client-js).
+
+## Chat Completion
+
+The chat completion API allows you to chat with a model fine-tuned to follow instructions.
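+
+The tabs below show a single-turn request. For multi-turn conversations, send the whole message history on every call, including previous assistant replies. A minimal Python sketch using the same client as below (the assistant content shown here is illustrative):
+
+```python
+import os
+
+from mistralai.client import MistralClient
+from mistralai.models.chat_completion import ChatMessage
+
+client = MistralClient(api_key=os.environ["MISTRAL_API_KEY"])
+
+# The history is just a list of ChatMessage objects, oldest first.
+messages = [
+    ChatMessage(role="user", content="What is the best French cheese?"),
+    ChatMessage(role="assistant", content="Comté is a strong contender."),  # previous model reply
+    ChatMessage(role="user", content="How should I serve it?"),
+]
+
+chat_response = client.chat(model="mistral-tiny", messages=messages)
+print(chat_response.choices[0].message.content)
+```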
+
+<Tabs>
+<TabItem value="python" label="python" default>
+
+```python
+import os
+
+from mistralai.client import MistralClient
+from mistralai.models.chat_completion import ChatMessage
+
+api_key = os.environ["MISTRAL_API_KEY"]
+model = "mistral-tiny"
+
+client = MistralClient(api_key=api_key)
+
+messages = [
+    ChatMessage(role="user", content="What is the best French cheese?")
+]
+
+# No streaming
+chat_response = client.chat(
+    model=model,
+    messages=messages,
+)
+
+# With streaming
+for chunk in client.chat_stream(model=model, messages=messages):
+    print(chunk)
+```
+
+</TabItem>
+<TabItem value="javascript" label="javascript">
+
+```javascript
+import MistralClient from '@mistralai/mistralai';
+
+const apiKey = process.env.MISTRAL_API_KEY;
+
+const client = new MistralClient(apiKey);
+
+const chatResponse = await client.chat({
+  model: 'mistral-tiny',
+  messages: [{role: 'user', content: 'What is the best French cheese?'}],
+});
+
+console.log('Chat:', chatResponse.choices[0].message.content);
+```
+
+</TabItem>
+<TabItem value="curl" label="curl">
+
+```bash
+curl --location "https://api.mistral.ai/v1/chat/completions" \
+     --header 'Content-Type: application/json' \
+     --header 'Accept: application/json' \
+     --header "Authorization: Bearer $MISTRAL_API_KEY" \
+     --data '{
+    "model": "mistral-tiny",
+    "messages": [
+      {
+        "role": "user",
+        "content": "What is the best French cheese?"
+      }
+    ]
+  }'
+```
+
+</TabItem>
+</Tabs>
+
+We allow users to provide a custom system prompt (see [API reference](../../api)). A convenient `safe_mode` flag allows you to force chat completions to be moderated against sensitive content (see [Guardrailing](../guardrailing)).
+
+## Embeddings
+
+The embeddings API allows you to embed sentences.
+
+<Tabs>
+<TabItem value="python" label="python" default>
+
+```python
+import os
+
+from mistralai.client import MistralClient
+
+api_key = os.environ["MISTRAL_API_KEY"]
+client = MistralClient(api_key=api_key)
+
+embeddings_batch_response = client.embeddings(
+    model="mistral-embed",
+    input=["Embed this sentence.", "As well as this one."],
+)
+```
+
+</TabItem>
+<TabItem value="javascript" label="javascript">
+
+```javascript
+import MistralClient from '@mistralai/mistralai';
+
+const apiKey = process.env.MISTRAL_API_KEY;
+
+const client = new MistralClient(apiKey);
+
+const input = [];
+for (let i = 0; i < 10; i++) {
+  input.push('What is the best French cheese?');
+}
+
+const embeddingsBatchResponse = await client.embeddings({
+  model: 'mistral-embed',
+  input: input,
+});
+
+console.log('Embeddings Batch:', embeddingsBatchResponse.data);
+```
+
+</TabItem>
+<TabItem value="curl" label="curl">
+
+```bash
+curl --location "https://api.mistral.ai/v1/embeddings" \
+     --header 'Content-Type: application/json' \
+     --header 'Accept: application/json' \
+     --header "Authorization: Bearer $MISTRAL_API_KEY" \
+     --data '{
+    "model": "mistral-embed",
+    "input": [
+      "Embed this sentence.",
+      "As well as this one."
+    ]
+  }'
+```
+
+</TabItem>
+</Tabs>
+
+## Third-Party Clients
+
+Here are some clients built by the community for various other languages:
+
+### Go
+[Gage-Technologies](https://github.com/Gage-Technologies/mistral-go)
diff --git a/docs/platform/03-endpoints.md b/docs/platform/03-endpoints.md
new file mode 100644
index 0000000..0434206
--- /dev/null
+++ b/docs/platform/03-endpoints.md
@@ -0,0 +1,57 @@
+import Benchmark from '@site/static/img/mistral_family.png';
+
+# Endpoints
+
+We provide different endpoints with different price/performance tradeoffs. Our endpoints depend on internal models.
+Some of them are [open-weight](../../models), allowing users to deploy them on their own, on arbitrary infrastructure.
+See [Self-deployment](../../self-deployment/overview) for details.
+
+## Generative endpoints
+
+All our generative endpoints can reason on contexts up to 32k tokens and follow fine-grained instructions.
+The following table gathers benchmarks for each endpoint.
+
+<img src={Benchmark} alt="Benchmark results for Mistral AI endpoints" />
+
+We only provide chat access through our API. Users can access underlying base models for endpoints relying on
+[open-weight models](../../models).
+
+### Tiny
+
+This generative endpoint is best used for large batch processing tasks where cost is a significant factor
+but reasoning capabilities are not crucial.
+
+Currently powered by Mistral-7B-v0.2, a better fine-tune of the initial Mistral-7B release,
+inspired by the fantastic work of the community.
+
+API name: `mistral-tiny`
+
+### Small
+
+This endpoint offers higher reasoning capabilities and a broader range of capabilities.
+
+The endpoint supports English, French, German, Italian, and Spanish and can produce and reason about code.
+
+Currently powered by Mixtral-8X7B-v0.1, a sparse mixture-of-experts model with 12B active parameters.
+
+API name: `mistral-small`
+
+### Medium
+
+This endpoint currently relies on an internal prototype model.
+
+API name: `mistral-medium`
+
+## Embedding models
+
+Embedding models enable retrieval and retrieval-augmented generation applications.
+
+Our endpoint outputs vectors in `1024` dimensions. It achieves a retrieval score of 55.26 on MTEB.
+
+API name: `mistral-embed`
+
diff --git a/docs/platform/04-guardrailing.md b/docs/platform/04-guardrailing.md
new file mode 100644
index 0000000..f3b6afe
--- /dev/null
+++ b/docs/platform/04-guardrailing.md
@@ -0,0 +1,56 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# Guardrailing
+
+## System prompt to enforce guardrails
+
+The ability to enforce guardrails in chat generations is crucial for front-facing applications. We introduce an optional system prompt to enforce guardrails on top of our models. You can activate this prompt through a `safe_mode` binary flag in API calls as follows:
+
+<Tabs>
+<TabItem value="python" label="python" default>
+
+```python
+chat_response = client.chat(
+    model="mistral-tiny",
+    messages=[ChatMessage(role="user", content="What is the best French cheese?")],
+    safe_mode=True
+)
+```
+
+</TabItem>
+<TabItem value="javascript" label="javascript">
+
+```javascript
+const chatResponse = await client.chat({
+  model: 'mistral-tiny',
+  messages: [{role: 'user', content: 'What is the best French cheese?'}],
+  safe_mode: true,
+});
+```
+
+</TabItem>
+<TabItem value="curl" label="curl">
+
+```bash
+curl --location "https://api.mistral.ai/v1/chat/completions" \
+     --header 'Content-Type: application/json' \
+     --header 'Accept: application/json' \
+     --header "Authorization: Bearer $MISTRAL_API_KEY" \
+     --data '{
+    "model": "mistral-tiny",
+    "messages": [
+      {
+        "role": "user",
+        "content": "What is the best French cheese?"
+      }
+    ],
+    "safe_mode": true
+  }'
+```
+
+</TabItem>
+</Tabs>
+
+Toggling `safe_mode` will prepend your messages with the following system prompt:
+```
+Always assist with care, respect, and truth. Respond with utmost utility yet securely. Avoid harmful, unethical, prejudiced, or negative content. Ensure replies promote fairness and positivity.
+```
+
diff --git a/docs/platform/05-pricing.md b/docs/platform/05-pricing.md
new file mode 100644
index 0000000..f108923
--- /dev/null
+++ b/docs/platform/05-pricing.md
@@ -0,0 +1,25 @@
+# Pricing and rate limits
+
+## Pay-as-you-go
+
+The prices listed below are exclusive of VAT.
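+
+Prices are linear in usage: input and output tokens are metered separately and billed per million tokens. As a rough sketch of how a bill adds up (using the `mistral-small` rates from the table below; the token counts are illustrative):
+
+```python
+# Hypothetical request: 12,000 input tokens and 3,000 output tokens on mistral-small.
+INPUT_PRICE_PER_M = 0.6   # € per 1M input tokens
+OUTPUT_PRICE_PER_M = 1.8  # € per 1M output tokens
+
+input_tokens, output_tokens = 12_000, 3_000
+cost = (input_tokens * INPUT_PRICE_PER_M + output_tokens * OUTPUT_PRICE_PER_M) / 1_000_000
+print(f"{cost:.4f} €")  # 0.0126 €
+```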
+
+### Chat Completions API
+
+| Model | Input | Output |
+|-----------|-----------|-----------|
+| `mistral-tiny` | 0.14€ / 1M tokens | 0.42€ / 1M tokens |
+| `mistral-small` | 0.6€ / 1M tokens | 1.8€ / 1M tokens |
+| `mistral-medium` | 2.5€ / 1M tokens | 7.5€ / 1M tokens |
+
+### Embeddings API
+
+| Model | Input |
+|-----------|-----------|
+| `mistral-embed` | 0.1€ / 1M tokens |
+
+## Rate limits
+
+All endpoints have a rate limit of 2 requests per second, 2 million tokens per minute, and 200 million tokens per month. You can check your current rate limits on the platform. If you need to increase them, please contact support with your estimated consumption and use case.
+
+We will raise the limits for embedding models in the future.
\ No newline at end of file
diff --git a/docs/platform/_category_.json b/docs/platform/_category_.json
new file mode 100644
index 0000000..041237c
--- /dev/null
+++ b/docs/platform/_category_.json
@@ -0,0 +1,9 @@
+{
+  "label": "Platform",
+  "position": 2,
+  "link": {
+    "type": "doc",
+    "id": "platform/overview"
+  }
+}
+
\ No newline at end of file
diff --git a/docs/self-deployment/01-overview.md b/docs/self-deployment/01-overview.md
new file mode 100644
index 0000000..291f270
--- /dev/null
+++ b/docs/self-deployment/01-overview.md
@@ -0,0 +1,11 @@
+# Self-deployment
+
+Mistral AI provides ready-to-use Docker images on the GitHub container registry. The weights are distributed separately.
+
+To run these images, you need a cloud virtual machine matching the requirements for a given model. These requirements can be found in the [model description](../models.md).
+
+We recommend two different serving frameworks for our models:
+- [vLLM](https://vllm.readthedocs.io/): a Python-only serving framework that deploys an API matching OpenAI's spec. vLLM provides a paged attention kernel to improve serving throughput.
+- NVIDIA's [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) served with NVIDIA's [Triton Inference Server](https://github.com/triton-inference-server): TensorRT-LLM provides a DSL to build fast inference engines with dedicated kernels for large language models. Triton Inference Server allows efficient serving of these inference engines.
+
+These images can be run locally, or on your favorite cloud provider, using [SkyPilot](https://skypilot.readthedocs.io/en/latest/).
diff --git a/docs/self-deployment/02-trtllm.md b/docs/self-deployment/02-trtllm.md
new file mode 100644
index 0000000..9285d57
--- /dev/null
+++ b/docs/self-deployment/02-trtllm.md
@@ -0,0 +1,14 @@
+# TensorRT-LLM // Triton
+
+## Building the engine
+
+Follow the official TensorRT-LLM documentation to [build the engine](https://github.com/NVIDIA/TensorRT-LLM/tree/main#quick-start).
+- For Mistral-7B, you can use the [LLaMA example](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/llama#mistral-v01)
+- For Mixtral-8X7B, official documentation is coming soon.
+
+
+## Deploying the engine
+
+Once the engine is built, it can be deployed using the Triton Inference Server and its TensorRT-LLM backend.
+
+Follow the [official documentation](https://github.com/triton-inference-server/tensorrtllm_backend#using-the-tensorrt-llm-backend).
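+
+Once the server is up, you can sanity-check it over Triton's HTTP API. A rough sketch in Python (this assumes the backend's example `ensemble` model name, its `generate` endpoint, and the default HTTP port 8000; model and field names may differ depending on your Triton configuration):
+
+```python
+import requests  # third-party HTTP library: `pip install requests`
+
+# The `ensemble` model chains tokenization, the TensorRT-LLM engine, and detokenization.
+payload = {
+    "text_input": "What is the best French cheese?",
+    "max_tokens": 64,
+    "bad_words": "",
+    "stop_words": "",
+}
+
+response = requests.post("http://localhost:8000/v2/models/ensemble/generate", json=payload)
+response.raise_for_status()
+print(response.json()["text_output"])
+```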
diff --git a/docs/self-deployment/03-vllm.md b/docs/self-deployment/03-vllm.md
new file mode 100644
index 0000000..9fbaa94
--- /dev/null
+++ b/docs/self-deployment/03-vllm.md
@@ -0,0 +1,92 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# vLLM
+
+vLLM can be deployed using a Docker image we provide, or directly from the Python package.
+
+## With Docker
+
+On a GPU-enabled host, you can run the Mistral AI LLM Inference image with the following command to download the model from Hugging Face:
+
+<Tabs>
+<TabItem value="Mistral-7B" label="Mistral-7B" default>
+
+```bash
+docker run --gpus all \
+    -e HF_TOKEN=$HF_TOKEN -p 8000:8000 \
+    ghcr.io/mistralai/mistral-src/vllm:latest \
+    --host 0.0.0.0 \
+    --model mistralai/Mistral-7B-Instruct-v0.2
+```
+
+</TabItem>
+<TabItem value="Mixtral-8X7B" label="Mixtral-8X7B">
+
+```bash
+# --tensor-parallel-size: adapt to the number of GPUs available
+# --load-format pt: needed since both `pt` and `safetensors` weights are available
+docker run --gpus all \
+    -e HF_TOKEN=$HF_TOKEN -p 8000:8000 \
+    ghcr.io/mistralai/mistral-src/vllm:latest \
+    --host 0.0.0.0 \
+    --model mistralai/Mixtral-8x7B-Instruct-v0.1 \
+    --tensor-parallel-size 2 \
+    --load-format pt
+```
+
+</TabItem>
+</Tabs>
+
+Here, `HF_TOKEN` is an environment variable containing your [Hugging Face user access token](https://huggingface.co/docs/hub/security-tokens).
+This will spawn a vLLM instance exposing an OpenAI-like API, as documented in the [API section](/api).
+
+:::info
+
+If your GPU has CUDA capabilities below 8.0, you will see the error `ValueError: Bfloat16 is only supported on GPUs with compute capability of at least 8.0. Your XXX GPU has compute capability 7.0`. You need to pass the parameter `--dtype half` to the Docker command line.
+
+:::
+
+The Dockerfile for this image can be found in our [reference implementation repository](https://github.com/mistralai/mistral-src/blob/main/deploy/Dockerfile).
+
+## Without Docker
+
+Alternatively, you can directly spawn a vLLM server on a GPU-enabled host with CUDA 11.8.
+
+### Install vLLM
+
+First, install vLLM (or use `conda install vllm` if you are using Anaconda):
+
+```bash
+pip install vllm
+```
+
+### Log in to the Hugging Face hub
+
+You will also need to log in to the Hugging Face hub using:
+
+```bash
+huggingface-cli login
+```
+
+### Run the OpenAI-compatible inference endpoint
+
+You can then use the following command to start the server:
+
+<Tabs>
+<TabItem value="Mistral-7B" label="Mistral-7B" default>
+
+```bash
+python -u -m vllm.entrypoints.openai.api_server \
+    --host 0.0.0.0 \
+    --model mistralai/Mistral-7B-Instruct-v0.2
+```
+
+</TabItem>
+<TabItem value="Mixtral-8X7B" label="Mixtral-8X7B">
+
+```bash
+# --tensor-parallel-size: adapt to the number of GPUs available
+# --load-format pt: needed since both `pt` and `safetensors` weights are available
+python -u -m vllm.entrypoints.openai.api_server \
+    --host 0.0.0.0 \
+    --model mistralai/Mixtral-8x7B-Instruct-v0.1 \
+    --tensor-parallel-size 2 \
+    --load-format pt
+```
+
+</TabItem>
+</Tabs>
\ No newline at end of file
diff --git a/docs/self-deployment/04-skypilot.md b/docs/self-deployment/04-skypilot.md
new file mode 100644
index 0000000..bded6d5
--- /dev/null
+++ b/docs/self-deployment/04-skypilot.md
@@ -0,0 +1,86 @@
+import Tabs from '@theme/Tabs';
+import TabItem from '@theme/TabItem';
+
+# Deploy with SkyPilot
+
+[SkyPilot](https://skypilot.readthedocs.io/en/latest/) is a framework for running LLMs, AI, and batch jobs on any cloud, offering maximum cost savings, highest GPU availability, and managed execution.
+
+We provide an example SkyPilot config that deploys our models.
+
+## SkyPilot Configuration
+
+After [installing SkyPilot](https://skypilot.readthedocs.io/en/latest/getting-started/installation.html), you need to create a configuration file that tells SkyPilot how and where to deploy your inference server, using our pre-built Docker container:
+
+<Tabs>
+<TabItem value="Mistral-7B" label="Mistral-7B" default>
+
+```yaml
+resources:
+  cloud: ${CLOUD_PROVIDER}
+  accelerators: A10G:1
+  ports:
+    - 8000
+
+run: |
+  docker run --gpus all -p 8000:8000 ghcr.io/mistralai/mistral-src/vllm:latest \
+    --host 0.0.0.0 \
+    --model mistralai/Mistral-7B-Instruct-v0.2 \
+    --tensor-parallel-size 1
+```
+
+</TabItem>
+<TabItem value="Mixtral-8X7B" label="Mixtral-8X7B">
+
+```yaml
+resources:
+  cloud: ${CLOUD_PROVIDER}
+  accelerators: A100-80GB:2
+  ports:
+    - 8000
+
+run: |
+  docker run --gpus all -p 8000:8000 ghcr.io/mistralai/mistral-src/vllm:latest \
+    --host 0.0.0.0 \
+    --model mistralai/Mixtral-8x7B-Instruct-v0.1 \
+    --tensor-parallel-size 2
+```
+
+</TabItem>
+</Tabs>
+
+Once the `${CLOUD_PROVIDER}` placeholder is filled in, you can use `sky launch` to launch the inference server with the name `mistral-7b`:
+
+```bash
+sky launch -c mistral-7b mistral-7b-v0.1.yaml --region us-east-1
+```
+
+:::caution
+
+When deployed this way, the model will be accessible to the whole world. You **must** secure it, either by exposing it exclusively on your private network (change the `--host` Docker option for that), by adding a load-balancer with an authentication mechanism in front of it, or by configuring your instance networking properly.
+
+:::
+
+### Test it out!
+
+To easily retrieve the IP address of the deployed `mistral-7b` cluster you can use:
+
+```bash
+sky status --ip mistral-7b
+```
+
+You can then use curl to send a completion request:
+
+```bash
+IP=$(sky status --ip mistral-7b)
+
+curl http://$IP:8000/v1/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+      "model": "mistralai/Mistral-7B-Instruct-v0.2",
+      "prompt": "My favourite condiment is",
+      "max_tokens": 25
+  }'
+```
+
+## Usage Quotas
+
+Many cloud providers require you to explicitly request access to powerful GPU instances. Read [SkyPilot's guide](https://skypilot.readthedocs.io/en/latest/cloud-setup/quota.html) on how to do this.
diff --git a/docs/self-deployment/_category_.json b/docs/self-deployment/_category_.json
new file mode 100644
index 0000000..6feee32
--- /dev/null
+++ b/docs/self-deployment/_category_.json
@@ -0,0 +1,8 @@
+{
+  "label": "Self-deployment",
+  "position": 4,
+  "link": {
+    "type": "doc",
+    "id": "self-deployment/overview"
+  }
+}
diff --git a/docusaurus.config.js b/docusaurus.config.js
index d117c05..daab61e 100644
--- a/docusaurus.config.js
+++ b/docusaurus.config.js
@@ -34,6 +34,7 @@ const config = {
   // Set the /<baseUrl>/ pathname under which your site is served
   // For GitHub pages deployment, it is often '/<projectName>/'
   baseUrl: "/",
+  trailingSlash: true,
 
   // GitHub pages deployment config.
   // If you aren't using GitHub pages, you don't need these.