diff --git a/.github/workflows/build_documentation.yml b/.github/workflows/build_documentation.yml deleted file mode 100644 index 7902c171..00000000 --- a/.github/workflows/build_documentation.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: Build the documentation - -on: - pull_request: - branches: [main] - -jobs: - build: - name: Build - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 - with: - python-version: "3.10" - - - name: Build the documentation - env: - GOOGLE_ANALYTICS_KEY: ${{ secrets.GOOGLE_ANALYTICS_KEY }} - run: | - pip install -r requirements-doc.txt - mkdocs build diff --git a/.github/workflows/publish_documentation.yml b/.github/workflows/publish_documentation.yml deleted file mode 100644 index 4679121b..00000000 --- a/.github/workflows/publish_documentation.yml +++ /dev/null @@ -1,30 +0,0 @@ -name: Publish the documentation - -on: - push: - branches: - - main - -permissions: - contents: write - -jobs: - deploy: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 - with: - python-version: 3.x - - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV - - uses: actions/cache@v3 - with: - key: mkdocs-material-${{ env.cache_id }} - path: .cache - restore-keys: | - mkdocs-material- - - run: pip install -r requirements-doc.txt - - name: Build documentation - env: - GOOGLE_ANALYTICS_KEY: ${{ secrets.GOOGLE_ANALYTICS_KEY }} - run: mkdocs gh-deploy --force diff --git a/docs/api/guide.md b/docs/api/guide.md deleted file mode 100644 index 1c3160c8..00000000 --- a/docs/api/guide.md +++ /dev/null @@ -1 +0,0 @@ -::: outlines.fsm.guide diff --git a/docs/api/index.md b/docs/api/index.md deleted file mode 100644 index b0d5c88f..00000000 --- a/docs/api/index.md +++ /dev/null @@ -1 +0,0 @@ -# API Reference diff --git a/docs/api/json_schema.md b/docs/api/json_schema.md deleted file mode 100644 index 471cb3a8..00000000 --- a/docs/api/json_schema.md +++ /dev/null @@ -1 +0,0 @@ -::: outlines.fsm.json_schema diff --git a/docs/api/models.md b/docs/api/models.md deleted file mode 100644 index 27ad297f..00000000 --- a/docs/api/models.md +++ /dev/null @@ -1,3 +0,0 @@ -::: outlines.models.transformers - -::: outlines.models.openai diff --git a/docs/api/parsing.md b/docs/api/parsing.md deleted file mode 100644 index e9662999..00000000 --- a/docs/api/parsing.md +++ /dev/null @@ -1 +0,0 @@ -::: outlines.fsm.parsing diff --git a/docs/api/prompts.md b/docs/api/prompts.md deleted file mode 100644 index 9d28f838..00000000 --- a/docs/api/prompts.md +++ /dev/null @@ -1 +0,0 @@ -::: outlines.prompts diff --git a/docs/api/regex.md b/docs/api/regex.md deleted file mode 100644 index 5ef91db4..00000000 --- a/docs/api/regex.md +++ /dev/null @@ -1 +0,0 @@ -::: outlines.generate.regex diff --git a/docs/api/samplers.md b/docs/api/samplers.md deleted file mode 100644 index 2b9b3423..00000000 --- a/docs/api/samplers.md +++ /dev/null @@ -1 +0,0 @@ -::: outlines.samplers diff --git a/docs/assets/images/dottxt.png b/docs/assets/images/dottxt.png deleted file mode 100644 index 1bdf8cb6..00000000 Binary files a/docs/assets/images/dottxt.png and /dev/null differ diff --git a/docs/assets/images/logits_processing_diagram.svg b/docs/assets/images/logits_processing_diagram.svg deleted file mode 100644 index 92668e6c..00000000 --- a/docs/assets/images/logits_processing_diagram.svg +++ /dev/null @@ -1,157 +0,0 @@ - - - - - - -%3 - - - -inputTokens - -Input Tokens - -7 - -4 - -8 - - - -TransformerDecoder - -Transformer Decoder Pass - - - 
-inputTokens->TransformerDecoder - - - - - -logitsTable - -Model Output Logits - -Token - -Probability - -+ - -3% - -foo - -4% - -. - -7% - -1 - -11% - -2 - -13% - -3 - -17% - - - -TransformerDecoder->logitsTable - - - - - -OutlinesLogitsProcessor - -Outlines Regex Logits Processor - - - -logitsTable->OutlinesLogitsProcessor - - - - - -logitsProcessorTable - -Processed Logits - -Token - -Probability - -+ - -0% - -foo - -0% - -. - -7% - -1 - -11% - -2 - -13% - -3 - -17% - - - -OutlinesLogitsProcessor->logitsProcessorTable - - - - - -sampler - -Sampler - - - -logitsProcessorTable->sampler - - - - - -sampledTokenTable - -Sampled Token - -Token - -3 - - - -sampler->sampledTokenTable - - - - - diff --git a/docs/assets/images/logo.png b/docs/assets/images/logo.png deleted file mode 100644 index 9a9f234a..00000000 Binary files a/docs/assets/images/logo.png and /dev/null differ diff --git a/docs/assets/images/normal_computing.jpg b/docs/assets/images/normal_computing.jpg deleted file mode 100644 index a71483a6..00000000 Binary files a/docs/assets/images/normal_computing.jpg and /dev/null differ diff --git a/docs/blog/.authors.yml b/docs/blog/.authors.yml deleted file mode 100644 index c3fe2ac1..00000000 --- a/docs/blog/.authors.yml +++ /dev/null @@ -1,5 +0,0 @@ -authors: - remilouf: - name: Remi Louf - description: author - avatar: https://avatars.githubusercontent.com/u/3885044?v=4 diff --git a/docs/blog/assets/4000_stars.png b/docs/blog/assets/4000_stars.png deleted file mode 100644 index f3a596d7..00000000 Binary files a/docs/blog/assets/4000_stars.png and /dev/null differ diff --git a/docs/blog/index.md b/docs/blog/index.md deleted file mode 100644 index 05761ac5..00000000 --- a/docs/blog/index.md +++ /dev/null @@ -1 +0,0 @@ -# Blog diff --git a/docs/blog/posts/roadmap-2024.md b/docs/blog/posts/roadmap-2024.md deleted file mode 100644 index d1413b05..00000000 --- a/docs/blog/posts/roadmap-2024.md +++ /dev/null @@ -1,91 +0,0 @@ ---- -date: 2024-01-10 -categories: - - Roadmap -authors: - - remilouf ---- - -# Roadmap for 2024 - -Outlines is not even one year old and it's already gone a long way! As we just reached 4000 stars, and before laying out the roadmap for the following year, we would like to pause and thank all of you for supporting us, using and contributing to the library! - -![4000 stars](../assets/4000_stars.png) - -## Thoughts - -Before delving into [the detailed roadmap](#detailed-roadmap), let me share a few thoughts and explain the general direction of the library. These thoughts are informed with my multiple interactions with users, either on [Twitter](https://twitter.com/remilouf) or in our [Discord server](https://discord.gg/ZxBxyWmW5n). - -*Outlines currently differentiates itself* from other libraries with its efficient JSON- and regex- constrained generation. A user-facing interface for grammar-structured generation (it had been hidden in the repository) was also recently added. But there is much more we can do along these lines. In 2024 will we will keep pushing in the direction of more accurate, faster constrained generation. - -Outlines also supports many models providers: `transformers`, `mamba`, `llama.cpp` and `exllama2`. Those *integrations represent a lot of maintenance*, and we will need to simplify them. For instance, `transformers` now supports quantized models, and we will soon deprecate the support for `autoawq` and `autogptq`. 
-Thanks to a refactor of the library, it is now possible to use our constrained generation method by using logits processor with all other libraries, except `mamba`. We will look for libraries that provide state-space models and allow to pass a logits processor during inference. We will interface with `llama.cpp` and `exllama2` using logits processors. - -*We would like expand our work to the whole sampling layer*, and add new sampling methods that should make structured generation more accurate. This means we will keep the `transformers` integration as it is today and will expand our text generation logic around this library. - -Making workflows re-usable and easy to share is difficult today. That is why *we are big believers in [outlines functions](https://github.com/outlines-dev/functions)*. We will keep improving the interface and adding examples. - -Finally, *we want to add a CLI tool*, `outlines serve`. This will allows you to either serve an API that does general constrained generation, or to serve Outlines function. - -## Detailed roadmap - -Here is a more detailed roadmap for the next 12 months. Outlines is a [community](https://discord.gg/ZxBxyWmW5n) effort, and we invite you to pick either topic and [contribute to the library](https://github.com/outlines-dev/outlines). I will progressively add related [issues](https://github.com/outlines-dev/outlines/issues) in the repository. - -### Many more examples and tutorials - -Let's be honest, Outlines is lacking clear and thorough examples. We want to change this! - -* How does Outlines work? What can you do with it? -* What can you do with Outlines that is harder or impossible to do with other libraries? -* How you can perform standard LLM workflows, for instance Chain of Thoughts, Tree of Thoughts, etc? -* How does Oultines integrates with the larger ecosystem, for instance other libraries like LangChain and LlamaIndex? - -### Simplify the integrations - -We want to keep the current integrations but lower the maintenance cost so we can focus on what we bring to the table. - -* Deprecate every obsolete integration: `transformers` has recently integrated `autoawq` and `autogptq` for instance. ([PR](https://github.com/outlines-dev/outlines/pull/527)) -* See if we can integrate to a library that provides state-space models via a logit processing function; -* Integrate with llama.cpp via a logits processor; -* Integrate with exllamav2 via a logits processor; - -### Push structured generation further - -We're just getting started! - -* Improve the performance of existing structured generation algorithms; -* Improve the correctness of structured generation algorithms; -* Add ready-to-use grammars in the [grammars](https://github.com/outlines-dev/grammars) repository or in a submodule in Outlines. - -### Keep developing Outlines functions - -Functions are awesome, use them! - -* Implement a CLI `outlines serve` that allows to serve Outlines functions locally; -* Add more functions to the [functions](https://github.com/outlines-dev/functions) repository. - -### Serve structured generation - -We want to make it easier to serve structured generation and outlines functions. - -* Implement the outlines serve CLI `outlines serve` - - Serve local APIs that perform structured generation; - - Serve Outlines functions. 
- -### Improve the generation layer - -* Use `transformers`'s private API to prepare inputs for generation inside the `Transformers` class; -* Support successions of model generation and text infilling for methods like Beam Search and SMC; -* Differentiate by adding new caching methods: attention sink, trie-based caching, etc; -* Differentiate by implementing SMC; -* Implement Beam Search; -* Add token healing. - -### A more seamless integration with OpenAI - -* Provide the same user interface for OpenAI and open source models so they are easily interchangeable; -* Integrate the function calling API. - -## Last word - -This roadmap was influenced by the expressed interests of the community. If it doesn't reflect your needs please come and [share your experience with us](https://discord.gg/ZxBxyWmW5n). diff --git a/docs/community/belonging.png b/docs/community/belonging.png deleted file mode 100644 index 7346f9ae..00000000 Binary files a/docs/community/belonging.png and /dev/null differ diff --git a/docs/community/contribute.md b/docs/community/contribute.md deleted file mode 100644 index d5568f47..00000000 --- a/docs/community/contribute.md +++ /dev/null @@ -1,132 +0,0 @@ ---- -title: Contribute ---- - -## What contributions? - -- **Documentation** contributions are very valuable to us! -- **Examples.** Show us what you did with Outlines :) -- **Bug reports** with a minimum working examples in the [issue tracker][issues] -- **Bug fixes** are always a pleasure to review. -- **New features**. Please start a new [discussion][discussions], or [come chat with us][discord] beforehand! - -Note that the [issue tracker][issues] is only intended for actionable items. In doubt, open a [discussion][discussions] or [come talk to us][discord]. - -## How to contribute? - -### Setup - -First, [fork the repository on GitHub](https://github.com/outlines-dev/outlines/fork) and clone the fork locally: - -```bash -git clone git@github.com/YourUserName/outlines.git -cd outlines -``` - -Create a new virtual environment. *If you are using conda*: - -```bash -conda env create -f environment.yml -``` - -*If you are using venv*: - -```python -python -m venv .venv -source .venv/bin/activate -``` - -Then install the dependencies in editable mode, and install the pre-commit hooks: - -```python -pip install -e ".[test]" -pre-commit install -``` - -### Before pushing your code - -Run the tests: - -```python -pytest -``` - -And run the code style checks: - -```python -pre-commit run --all-files -``` - -### Benchmarking - -Outlines uses [asv](https://asv.readthedocs.io) for automated benchmark testing. Benchmarks are run automatically before pull requests are merged to prevent performance degredation. - -You can run the benchmark test suite locally with the following command: -``` -asv run --config benchmarks/asv.conf.json -``` - -Caveats: -- If you're on a device with CUDA, you must add the argument `--launch-method spawn` -- Uncommitted code will not be benchmarked, you must first commit your changes. 
- -#### Run a specific test: -``` -asv run --config benchmarks/asv.conf.json -b bench_json_schema.JsonSchemaBenchmark.time_json_schema_to_fsm -``` - -#### Profile a specific test: -``` -asv run --config benchmarks/asv.conf.json --profile -b bench_json_schema.JsonSchemaBenchmark.time_json_schema_to_fsm -``` - -#### Compare to `origin/main` -``` -get fetch origin -asv continuous origin/main HEAD --config benchmarks/asv.conf.json -``` - -#### ASV PR Behavior - -- **View ASV Benchmark Results:** Open the workflow, view `BENCHMARK RESULTS` section. -- Merging is blocked unless benchmarks are run for the latest commit. -- Benchmarks fail if performance degrades by more than 10% for any individual benchmark. -- The "Benchmark PR" workflow runs when its manually dispatched, or if the `run_benchmarks` label is added to the PR they run for every commit. - - -### Contribute to the documentation - -To work on the *documentation* you will need to install the related dependencies: - -```python -pip install -r requirements-doc.txt -``` - -To build the documentation and serve it locally, run the following command in the repository's root folder: - -```python -mkdocs serve -``` - -By following the instruction you will be able to view the documentation locally. -It will be updated every time you make a change. - -## Open a Pull Request - -Create a new branch on your fork, commit and push the changes: - -```bash -git checkout -b new-branch -git add . -git commit -m "Changes I made" -git push origin new-branch -``` - -Then you can [open a pull request][pull-requests] on GitHub. It should prompt you to do so. Every subsequent change that you make on your branch will update the pull request. - -Do not hesitate to open a draft PR before your contribution is ready, especially if you have questions and/or need feedback. If you need help, come tell us on [Discord][discord]. - -[discord]: https://discord.gg/R9DSu34mGd -[discussions]: https://github.com/outlines-dev/outlines/discussions -[issues]: https://github.com/outlines-dev/outlines/issues -[pull-requests]: https://github.com/outlines-dev/outlines/pulls diff --git a/docs/community/examples.md b/docs/community/examples.md deleted file mode 100644 index 2ebaf276..00000000 --- a/docs/community/examples.md +++ /dev/null @@ -1,19 +0,0 @@ -# Community projects and articles - -Publishing examples and articles about Outlines are a meaningful way to contrinute to the community. Here is a list of projects we are aware of. Drop us a line if we forgot yours! - -[MMSG](https://github.com/leloykun/mmsg) is a Python library for generating interleaved text and image content in a structured format you can directly pass to downstream APIs. - -[Multimodal Structured Generation: CVPR's 2nd MMFM Challenge Technical Report](https://arxiv.org/abs/2406.11403) shows that Structured Generation can outperform finetuning, and maybe even multimodality, in document-image understanding tasks as part of CVPR's 2nd MMFM Challenge. - -[Chess LLM Arena](https://huggingface.co/spaces/mlabonne/chessllm) is a HuggingFace Space where you can make LLMs compete in a chess match. - -[LLM Data Gen](https://huggingface.co/spaces/lhoestq/LLM_DataGen) is a HuggingFace Space that generates synthetic dataset files in JSONLines format. - -[Fast, High-Fidelity LLM Decoding with Regex Constraints ](https://vivien000.github.io/blog/journal/llm-decoding-with-regex-constraints.html) presents an efficient alternative to Outlines's structured generation. 
- -[gigax](https://github.com/GigaxGames/gigax) is an Open-Source library that allows to create real-time LLM-powered NPCs for video games. - -[Improving Prompt Consistency with Structured Generations](https://huggingface.co/blog/evaluation-structured-outputs) shows how structured generation can improve consistency of evaluation runs by reducing sensitivity to changes in prompt format. - -[AskNews](https://asknews.app) is a news curation service processing 300k news articles per day in a structured way, with Outlines. diff --git a/docs/community/feedback.md b/docs/community/feedback.md deleted file mode 100644 index 94280954..00000000 --- a/docs/community/feedback.md +++ /dev/null @@ -1,81 +0,0 @@ ---- -title: Feedback ---- - -# Feedback - -If Outlines has been helpful to you, let us know on [Discord][discord] or give us a shoutout on [Twitter][twitter]! It's always heartwarming ❤️ - - - - - - - - - - -

I am once again reminding you that structured extraction using LLMs is going to transform every single industry in the next 10 years https://t.co/xQ3tcWnrZ8

— Sam Hogan (@0xSamHogan) April 17, 2024

outline's growth is insane, using is an understatement! https://t.co/rHCNWhZdCs

— jason liu (@jxnlco) April 17, 2024

Outlines is an amazing lib and more popular than @remilouf’s modesty will admit. https://t.co/DfHbMPIlX1 https://t.co/mDHIWJrD0C

— Delip Rao e/σ (@deliprao) April 18, 2024

Impressive implementation of a true regex / json / grammar guided text generation pic.twitter.com/RX5RVYaVIx

— Rohan Paul (@rohanpaul_ai) December 30, 2023

Most underrated Github Repo in AI + LLM JSON guided Generation: https://t.co/lSB8KIet1H

— 🎙Jean-Louis Queguiner (@JiliJeanlouis) December 18, 2023

Nice and useful. https://t.co/LX72AE0lgt

— Dan Roy (@roydanroy) August 15, 2023

HUGE dub for open source AI https://t.co/bYKuiEUZ1j

— kenneth 🖇 (@k3nnethfrancis) August 15, 2023

This is amazing - glad to see more outp guidance modules!

Will try this out soon I'm wondering how they translate from regex automatons to token boundaries

Also why Open Source will succeed. Even today I don't see any guided output functionality from the big providers. https://t.co/Ity2H25Klf

— Hrishi (@hrishioa) August 14, 2023

Outlines 〰️- a library to help LLM developers guide text generation in a fast and reliable way.

"Provides generation methods that guarantee that the output will match a regular expressions, or follow a JSON schema."

Need to check this out. Reliable JSON output is a common use… pic.twitter.com/Bkbh8vKogN

— elvis (@omarsar0) August 14, 2023

Woah this is cool! Makes open source models more usable.

Give any LLM Function Call capability (and more) with Outlines: https://t.co/PtPykR5ZGR https://t.co/RRQjWHnIxv pic.twitter.com/BwNnH8SMwv

— Yohei (@yoheinakajima) August 14, 2023

This is awesome! Being able to guarantee the output's structure unblocks so many applications. This is a great milestone and a fundamental building block for more advanced AI apps. https://t.co/WdwMOc7hE8

— Guilherme Castro (@skastr052) August 15, 2023

Juggling with the unpredictable outputs of ChatGPT API lately while building my product. 😓

Tried prompt engineering to channel its wisdom into a neat JSON, but it's like asking a cat to fetch. 🐱

Luckily, stumbled upon "Outlines" – looks like a promising way to tame the LLM… pic.twitter.com/oYQ6q8exAS

— Charlie (@14435635Sun) August 15, 2023

A complex system of LLM input-outputs interacting with non-LLM agents and models benefits immeasurably from structured outputs. The outlines package saves so much time, https://t.co/NhVQ6NpKDR

— Amir Sani (@amirsani) November 26, 2023
- - - -# Let us know! - -We highly value the insights of our users, and we would love to hear from you. If you are using Outlines for your projects and would like to share your experience with us, let's connect: - -- What are you building with it? -- What do you like about it? -- What challenges are you facing? -- What do you think could be improved? - -To schedule an appointment follow [this link](https://cal.com/dottxt/outlines). This is exclusively intended to share your experience, please go on [Discord][discord] or [GitHub](https://github.com/outlines-dev/outlines/discussions) for support. - -[discord]: https://discord.gg/UppQmhEpe8 -[twitter]: https://twitter.com/dottxtai diff --git a/docs/community/index.md b/docs/community/index.md deleted file mode 100644 index 9519f5b5..00000000 --- a/docs/community/index.md +++ /dev/null @@ -1,12 +0,0 @@ -# Community - -![Belonging](belonging.png) - -Outlines exists for a community of users who believe software doesn't need to be complicated. Who share the same passion for Large Language Models but don't want to compromise on robustness. Together, we are bringing these powerful models back to the world of software. - -## Connect on Discord - -The Outlines community lives on our Discord server. There you can ask questions, share ideas or just chat with people like you. Don't be a stranger and [join us][discord]. - - -[discord]: https://discord.gg/UppQmhEpe8 diff --git a/docs/community/versioning.md b/docs/community/versioning.md deleted file mode 100644 index d64a56e7..00000000 --- a/docs/community/versioning.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -title: Versioning Guide ---- - -# Versioning Guide - - -The Outlines project follows a structured versioning scheme designed to provide clarity and minimize risk for downstream dependents. - -Each part of the version number (`major.minor.patch`) conveys information about the nature and impact of the changes included in the release. - -- **Major Releases** includes compatibility-breaking changes to core interfaces, such as `LogitsProcessor`s and `Guides`. -- **Minor Releases** introduce changes of substance to internal or unexposed functionality. These changes are well tested and intended to maintain compatability with existing use of core interfaces. -- **Patch Releases** address bug fixes and incorporate low-risk changes to improve stability and performance. - -## Releases - -Releases along with release notes can be found on the [Outlines Releases GitHub Page](https://github.com/outlines-dev/outlines/releases). - -## Version Pinning Recommendations - -Here are our recommendations for managing dependencies on the Outlines package: - -**Small, Risk-Tolerant Projects:** Pin to a specific major version. - -**Large, Conservative Projects:** Pin to a specific minor version. diff --git a/docs/cookbook/chain_of_density.md b/docs/cookbook/chain_of_density.md deleted file mode 100644 index 16c2838f..00000000 --- a/docs/cookbook/chain_of_density.md +++ /dev/null @@ -1,125 +0,0 @@ -# Summarize documents using Chain of Density prompting - -A good summary should be informative, concise and clear. While large language models are generally good at summarizing documents, their summaries tend to be long and contain redundant information; their information density tends to be on the lower end. This is where [chain of Density](https://arxiv.org/abs/2309.04269), a new prompting technique, comes in. 
In this example we will show how one can implement chain of density with a few lines of code using Outlines, leveraging both Outline's prompt templating and its structured generation capabilities. - -The article we will try to summarize is the first three paragraphs of the [Alan Turing page on Wikipedia](https://en.wikipedia.org/wiki/Alan_Turing): - -```python -article = """ -Alan Mathison Turing OBE FRS (/ˈtjʊərɪŋ/; 23 June 1912 – 7 June 1954) was an English mathematician, computer scientist, logician, cryptanalyst, philosopher and theoretical biologist.[5] Turing was highly influential in the development of theoretical computer science, providing a formalisation of the concepts of algorithm and computation with the Turing machine, which can be considered a model of a general-purpose computer.[6][7][8] He is widely considered to be the father of theoretical computer science and artificial intelligence.[9] - -Born in Maida Vale, London, Turing was raised in southern England. He graduated at King's College, Cambridge, with a degree in mathematics. Whilst he was a fellow at Cambridge, he published a proof demonstrating that some purely mathematical yes–no questions can never be answered by computation. He defined a Turing machine and proved that the halting problem for Turing machines is undecidable. In 1938, he obtained his PhD from the Department of Mathematics at Princeton University. During the Second World War, Turing worked for the Government Code and Cypher School at Bletchley Park, Britain's codebreaking centre that produced Ultra intelligence. For a time he led Hut 8, the section that was responsible for German naval cryptanalysis. Here, he devised a number of techniques for speeding the breaking of German ciphers, including improvements to the pre-war Polish bomba method, an electromechanical machine that could find settings for the Enigma machine. Turing played a crucial role in cracking intercepted coded messages that enabled the Allies to defeat the Axis powers in many crucial engagements, including the Battle of the Atlantic.[10][11] - -After the war, Turing worked at the National Physical Laboratory, where he designed the Automatic Computing Engine, one of the first designs for a stored-program computer. In 1948, Turing joined Max Newman's Computing Machine Laboratory at the Victoria University of Manchester, where he helped develop the Manchester computers[12] and became interested in mathematical biology. He wrote a paper on the chemical basis of morphogenesis[1] and predicted oscillating chemical reactions such as the Belousov–Zhabotinsky reaction, first observed in the 1960s. Despite these accomplishments, Turing was never fully recognised in Britain during his lifetime because much of his work was covered by the Official Secrets Act.[13] -""" -``` - -## How Chain Of Density works - -Chain Of Density starts with asking the model to generate a first long and non-specific summary. Then it asks the model to generate 4 extra summaries by proceeding in the following way: - -1. Identify 1-3 entities missing in the previous summary; -2. Add all entities marked as missing in the previous step, while not dropping entities; -3. Make the summary more concise; - -The prompt also asks the model to return a list of JSON objects that contain the missing entities and the new summary. 
This is where structured generation will come in handy :) The paper provides the prompt and an example: - -![Figure 2 in the paper](./images/chain_of_density.png) - -We can now implement the prompt provided in the paper: - -```python -import outlines - -@outlines.prompt -def chain_of_density(article): - """Article: {{ article }} - - You will generate increasingly concise, entity-dense summaries of the above Article. - - Repeat the following 2 steps 5 times. - - Step 1. Identify 1-3 informative Entities ("; " delimited) from the Article which are missing from the previously generated summary. - Step 2. Write a new, denser summary of identical length which covers every entity and detail from the previous summary plus the Missing Entities. - - A Missing Entity is: - - Relevant: to the main story. - - Specific: descriptive yet concise (5 words or fewer). - - Novel: not in the previous summary. - - Faithful: present in the Article. - - Anywhere: located anywhere in the Article. - - Guidelines: - - The first summary should be long (4-5 sentences, ~80 words) yet highly non-specific, containing little information beyond the entities marked as missing. Use overly verbose language and fillers (e.g., "this article discusses") to reach ~80 words. - - Make every word count: rewrite the previous summary to improve flow and make space for additional entities. - - Make space with fusion, compression, and removal of uninformative phrases like "the article discusses". - - The summaries should become highly dense and concise yet self-contained, e.g., easily understood without the Article. - - Missing entities can appear anywhere in the new summary. - - Never drop entities from the previous summary. If space cannot be made, add fewer new entities. - - Remember, use the exact same number of words for each summary. - - Answer in JSON. The JSON should be a a dictionary with key "summaries" that contains a list (length 5) of dictionaries whose keys are "Missing_Entities" and "Denser_Summary". - """ -``` - -??? Note - - Note that we modified the prompt slightly so it returns a JSON object that contains the summaries, instead of a list of summaries. - - -## Outlines implementation - -We will use Outline's JSON-structured generation to ensure that the model's output is consistent with the format specified in the prompt. We start with defining the JSON objects that the model is asked to return using Pydantic. One JSON object that contains a list of `Summary` objects that contain the missing entities and new summary: - -```python -from pydantic import BaseModel, conlist - -class Summary(BaseModel): - missing_entities: str - denser_summary: str - -class Summaries(BaseModel): - summaries: conlist(Summary, max_length=5, min_length=5) -``` - -We now generate the prompt by passing the article we want to summarize to the template. We load a quantized version of Mistral-7B using the AutoAWQ library, and then use JSON-structured generation to generate the summaries: - -```python -model = outlines.models.transformers("TheBloke/Mistral-7B-OpenOrca-AWQ") - -prompt = chain_of_density(article) -result = outlines.generate.json(model, Summaries)(prompt) -``` - -We can now check the results: - -```python -print(result.model_dump()) -# {'summaries': [ -# { -# 'missing_entities': 'English mathematician, cryptanalyst, philosopher', -# 'denser_summary': 'Alan Mathison Turing was an English mathematician, cryptanalyst, philosopher.' 
-# }, -# { -# 'missing_entities': '', -# 'denser_summary': "Alan Mathison Turing was an English mathematician who was a crucial figure in WW2's Bletchley Park codebreaking centre and designed one of the first computers." -# }, -# { -# 'missing_entities': 'cryptanalyst, studied, biology, father', -# 'denser_summary': 'Alan Mathison Turing was an English cryptanalyst, studied theoretical computer science, and contributed to mathematical biology.' -# }, -# { -# 'missing_entities': 'biology, morphogenesis, chemical', -# 'denser_summary': 'Alan Mathison Turing was an English cryptanalyst, studied theoretical computer science, and predicted chemical reactions in morphogenesis. -# '}, -# { -# 'missing_entities': '', -# 'denser_summary': 'Alan Mathison Turing was an English cryptanalyst, developed computer science, and made strides in mathematical biology research.' -# } -# ]} -``` - -Not bad, considering we used a smallish model to generate the summary! Chain of Density seems to be a very effective prompting technique to generate dense summaries, even with small quantized models. Its implementation in Outlines is also very short. - -Note that this is the first article I tried and it worked out of the box. Try it out on other articles, and please share the results on Twitter, or by opening [a new discussion](https://github.com/outlines-dev/outlines/discussions/categories/show-and-tell) on the Outlines repository! diff --git a/docs/cookbook/chain_of_thought.md b/docs/cookbook/chain_of_thought.md deleted file mode 100644 index 17c36269..00000000 --- a/docs/cookbook/chain_of_thought.md +++ /dev/null @@ -1,138 +0,0 @@ -# Chain of thought - - -Chain of thought is a prompting technique introduced in the paper ["Chain-of-Thought Prompting Elicits Reasoning in Large Language Models"](https://arxiv.org/abs/2201.11903) where throught prompting the authors generate a series of intermediate reasoning steps which improves the ability of LLMs to perform complex reasoning. - -In this guide, we use [outlines](https://outlines-dev.github.io/outlines/) to apply chain of thought through structured output. - -We use [llama.cpp](https://github.com/ggerganov/llama.cpp) using the [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) library. Outlines supports llama-cpp-python, but we need to install it ourselves: - -```bash -pip install llama-cpp-python -``` - -We download the model weights by passing the name of the repository on the HuggingFace Hub, and the filenames (or glob pattern): -```python -import llama_cpp -from outlines import generate, models - -model = models.llamacpp("NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF", - "Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf", - tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained( - "NousResearch/Hermes-2-Pro-Llama-3-8B" - ), - n_gpu_layers=-1, - flash_attn=True, - n_ctx=8192, - verbose=False) -``` - -??? 
note "(Optional) Store the model weights in a custom folder" - - By default the model weights are downloaded to the hub cache but if we want so store the weights in a custom folder, we pull a quantized GGUF model [Hermes-2-Pro-Llama-3-8B](https://huggingface.co/NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF) by [NousResearch](https://nousresearch.com/) from [HuggingFace](https://huggingface.co/): - - ```bash - wget https://hf.co/NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/resolve/main/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf - ``` - - We initialize the model: - - ```python - import llama_cpp - from llama_cpp import Llama - from outlines import generate, models - - llm = Llama( - "/path/to/model/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf", - tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained( - "NousResearch/Hermes-2-Pro-Llama-3-8B" - ), - n_gpu_layers=-1, - flash_attn=True, - n_ctx=8192, - verbose=False - ) - ``` - -## Chain of thought - -We first define our Pydantic class for a reasoning step: - -```python -from pydantic import BaseModel, Field - -class Reasoning_Step(BaseModel): - reasoning_step: str = Field(..., description="Reasoning step") -``` - -We then define the Pydantic class for reasoning which will consist on a list of reasoning steps and a conclusion, and we get its JSON schema: - -```python -from typing import List - -class Reasoning(BaseModel): - reasoning: List[Reasoning_Step] = Field(..., description="List of reasoning steps") - conclusion: str = Field(..., description="Conclusion") - -json_schema = Reasoning.model_json_schema() -``` - -We could generate a response using the json schema but for a change we will use the regex: - -```python -from outlines.integrations.utils import convert_json_schema_to_str -from outlines.fsm.json_schema import build_regex_from_schema - -schema_str = convert_json_schema_to_str(json_schema=json_schema) -regex_str = build_regex_from_schema(schema_str) -``` - -We then need to adapt our prompt to the [Hermes prompt format for JSON schema](https://github.com/NousResearch/Hermes-Function-Calling?tab=readme-ov-file#prompt-format-for-json-mode--structured-outputs): - -```python -def generate_hermes_prompt(user_prompt): - return ( - "<|im_start|>system\n" - "You are a world class AI model who answers questions in JSON " - f"Here's the json schema you must adhere to:\n\n{schema}\n<|im_end|>\n" - "<|im_start|>user\n" - + user_prompt - + "<|im_end|>" - + "\n<|im_start|>assistant\n" - "" - ) -``` - -For a given user prompt: - -```python -user_prompt = "9.11 and 9.9 -- which is bigger?" -``` - -we can use `generate.regex` by passing the Pydantic class we previously defined, and call the generator with the Hermes prompt: - -```python -generator = generate.regex(model, regex_str) -prompt = generate_hermes_prompt(user_prompt) -response = generator(prompt, max_tokens=1024, temperature=0, seed=42) -``` - -We obtain a series of intermediate reasoning steps as well as the conclusion: - -```python -import json - -json_response = json.loads(response) - -print(json_response["reasoning"]) -print(json_response["conclusion"]) -# [{'reasoning_step': 'Both 9.11 and 9.9 are decimal numbers.'}, -# {'reasoning_step': 'When comparing decimal numbers, we look at the numbers after the decimal point.'}, -# {'reasoning_step': 'In this case, 9.11 has the number 1 after the decimal point, while 9.9 has the number 9.'}, -# {'reasoning_step': 'Since 1 is greater than 9, 9.11 is greater than 9.9.'}] -# '9.11 is bigger.' 
-``` - -We notice that the 4th reasoning step is wrong ``Since 1 is greater than 9, 9.11 is greater than 9.9.'', so we should probably give the model some examples for this particular task. - -This example was originally contributed by [Alonso Silva](https://github.com/alonsosilvaallende). diff --git a/docs/cookbook/classification.md b/docs/cookbook/classification.md deleted file mode 100644 index c5609631..00000000 --- a/docs/cookbook/classification.md +++ /dev/null @@ -1,103 +0,0 @@ -# Classification - -Classification is a classic problem in NLP and finds many applications: spam detection, sentiment analysis, triaging of incoming requests, etc. We will use the example of a company that wants to sort support requests between those that require immediate attention (`URGENT`), those that can wait a little (`STANDARD`). You could easily extend the example by adding new labels. - - -This tutorial shows how one can implement multi-label classification using Outlines. We will use two functionalities of the library: `generate.choice` and `generate.json`. - -As always, we start with initializing the model. Since we are GPU poor we will be using a quantized version of Mistal-7B-v0.1: - -```python -import outlines - -model = outlines.models.transformers("TheBloke/Mistral-7B-OpenOrca-AWQ", device="cuda") -``` - -We will use the following prompt template: - -```python -@outlines.prompt -def customer_support(request): - """You are an experienced customer success manager. - - Given a request from a client, you need to determine when the - request is urgent using the label "URGENT" or when it can wait - a little with the label "STANDARD". - - # Examples - - Request: "How are you?" - Label: STANDARD - - Request: "I need this fixed immediately!" - Label: URGENT - - # TASK - - Request: {{ request }} - Label: """ -``` - -## Choosing between multiple choices - -Outlines provides a shortcut to do multi-label classification, using the `outlines.generate.choice` function to initialize a generator. Outlines uses multinomial sampling by default, here we will use the greedy sampler to get the label with the highest probability: - -```python -from outlines.samplers import greedy - -generator = outlines.generate.choice(model, ["URGENT", "STANDARD"], sampler=greedy()) -``` -Outlines supports batched requests, so we will pass two requests to the model: - -```python -requests = [ - "My hair is one fire! Please help me!!!", - "Just wanted to say hi" -] - -prompts = [customer_support(request) for request in requests] -``` - -We can now asks the model to classify the requests: - -```python -labels = generator(prompts) -print(labels) -# ['URGENT', 'STANDARD'] -``` - -Now, you might be in a hurry and don't want to wait until the model finishes completion. After all, you only need to see the first letter of the response to know whether the request is urgent or standard. You can instead stream the response: - -```python -tokens = generator.stream(prompts) -labels = ["URGENT" if "U" in token else "STANDARD" for token in next(tokens)] -print(labels) -# ['URGENT', 'STANDARD'] -``` - -## Using JSON-structured generation - -Another (convoluted) way to do multi-label classification is to JSON-structured generation in Outlines. 
We first need to define our Pydantic schema that contains the labels: - -```python -from enum import Enum -from pydantic import BaseModel - - -class Label(str, Enum): - urgent = "URGENT" - standard = "STANDARD" - - -class Classification(BaseModel): - label: Label -``` - -and we can use `generate.json` by passing this Pydantic model we just defined, and call the generator: - -```python -generator = outlines.generate.json(model, Classification, sampler=greedy()) -labels = generator(prompts) -print(labels) -# [Classification(label=), Classification(label=)] -``` diff --git a/docs/cookbook/dating_profiles.md b/docs/cookbook/dating_profiles.md deleted file mode 100644 index d0fb9b57..00000000 --- a/docs/cookbook/dating_profiles.md +++ /dev/null @@ -1,229 +0,0 @@ -# Generate a synthetic dating profile from a description - -In this example we will see how we can use Outlines to generate synthetic data for a dating application. This example was originally contributed by [Vibhor Kumar](https://github.com/veezbo). - -```python -from dataclasses import dataclass -from enum import Enum - -import torch -import transformers -from pydantic import BaseModel, conlist, constr - -import outlines -``` - -## Defining the profile with Pydantic - -Here a dating profile will consist in a biography, a job, a list of interests and two question-answer pairs. The questions are written in advance by the team, and the users are asked to provide an answer: - -```python -class QuestionChoice(str, Enum): - A = "The key to my heart is" - B = "The first item on my bucket list is" - C = "Perks of dating me" - D = "Message me if you also love" - E = "People would describe me as" - F = "I can beat you in a game of" - -@dataclass -class QuestionAnswer: - question: QuestionChoice - answer: str -``` - -Users need to provide a short biography, with a minimum of 10 and a maximum of 300 characters. The application also limits job descriptions to 50 characters. In addition to the question-answer pairs, the user is required to provide a list of between 1 and 5 interests: - -```python -class DatingProfile(BaseModel): - bio: constr(str, min_length=10, max_length=300) - job: constr(str, max_lengt=50) - interests: conlist(str, min_length=1, max_length=5) # type: ignore - qna1: QuestionAnswer - qna2: QuestionAnswer -``` - -## Prompt template and examples - -We will ask the model to generate profiles from a high-level description: - -```python -@dataclass -class Example: - description: str - profile: DatingProfile -``` - -We will use Outlines' prompt templating abilities to generate the prompt for us. This help clearly separate the general prompting logic from what is specific to an example. - -```python - -@outlines.prompt -def dating_profile_prompt(description: str, examples: list[Example]): - """ - You are a world-renowned matchmaker who understands the modern dating - market. Your job is to generate dating app profiles for male clients - interested in women based on a provided description. The profiles should be - authentic, show off their strengths, and maximize their likelihood of - getting matches on dating apps. 
Here are some examples of past clients that - you have successfully created profiles for: - - {% for example in examples %} - Description: - {{ example.description }} - Profile: - {{ example.profile }} - {% endfor %} - - Here is the new client who you need to create a profile for: - Description: {{ description }} - Profile: - """ -``` - -We will provide the model with several few-shot examples: - -```python -samples: list[Example] = [ - Example( - description="I'm an author and former professional soccer player living in Seattle who publishes popular fiction books. A typical day for me starts by hanging out with my cat, drinking a coffee, and reading as much as I can in a few hours. Then, I'll prepare a quick smoothie before starting to write for a few hours, take a break with soccer or running a few miles, and finally meet friends for dinner at a new, hip restaurant in the evening. Sometimes we go axe-throwing afterwards, or play poker, or watch a comedy show, or visit a dive bar. On my vacations, I travel extensively to countries South America, Europe, and Asia, with the goal of visiting them all!", - profile=DatingProfile( - bio="Adventurer, dreamer, author, and soccer enthusiast. Life’s too short to waste time so I make the most of each day by exploring new places and playing with my friends on the pitch. What’s your favorite way to get out and have fun?", - job="Famous Soccer Player -> Famous Author", - interests=["Soccer", "Travel", "Friends", "Books", "Fluffy Animals"], - qna1=QuestionAnswer( - question=QuestionChoice.B, answer="swim in all seven oceans!" - ), - qna2=QuestionAnswer( - question=QuestionChoice.E, - answer="fun-loving, adventurous, and a little bit crazy", - ), - ), - ), - Example( - description="I run my company and build houses for a living. I'm a big fan of the outdoors and love to go hiking, camping, and fishing. I don't like video games, but do like to watch movies. My love language is home-cooked food, and I'm looking for someone who isn't afraid to get their hands dirty.", - profile=DatingProfile( - bio="If you're looking for a Montana man who loves to get outdoors and hunt, and who's in-tune with his masculinity then I'm your guy!", - job="House Construction Manager / Entrepreneur", - interests=["Hunting", "Hiking", "The outdoors", "Home-cooked food"], - qna1=QuestionAnswer(question=QuestionChoice.A, answer="food made at home"), - qna2=QuestionAnswer( - question=QuestionChoice.C, - answer="having a man in your life who can fix anything", - ), - ), - ), - Example( - description="I run my own Youtube channel with 10M subscribers. I love working with kids, and my audience skews pretty young too. In my free time, I play Fortnite and Roblox. I'm looking for someone who is also a gamer and likes to have fun. I'm learning Japanese in my free time as well as how to cook.", - profile=DatingProfile( - bio="Easy on the eyes (find me on Youtube!) and great with kids. What more do you need?", - job="Youtuber 10M+ subscribers", - interests=["Kids", "Gaming", "Japanese"], - qna1=QuestionAnswer(question=QuestionChoice.D, answer="anime and gaming!"), - qna2=QuestionAnswer(question=QuestionChoice.F, answer="Fortnite, gg ez"), - ), - ), -] -``` - -## Load the model - -We will use Mosaic's MPT-7B model (requires 13GB of GPU memory) which can fit on a single GPU with a reasonable context window. 
We initialize it with Outlines: - -```python -config = transformers.AutoConfig.from_pretrained( - "mosaicml/mpt-7b-8k-instruct", trust_remote_code=True -) -config.init_device = "meta" -model = outlines.models.transformers( - model_name="mosaicml/mpt-7b-8k-instruct", - device="cuda", - model_kwargs={ - "config": config, - "trust_remote_code": True, - "torch_dtype": torch.bfloat16, - "device_map": {"": 0}, - }, -) -``` - -## JSON-structured generation of profiles - -We will now generate a dating profile from a textual description of oneself: - -``` python -new_description = """I'm a laid-back lawyer who spends a lot of his free-time -gaming. I work in a corporate office, but ended up here after the start-up I -cofounded got acquired, so still play ping pong with my cool coworkers every -day. I have a bar at home where I make cocktails, which is great for -entertaining friends. I secretly like to wear suits and get a new one tailored -every few months. I also like weddings because I get to wear those suits, and -it's a good excuse for a date. I watch the latest series because I'm paying, -with my hard-earned money, for every streaming service.""" - -prompt = dating_profile_prompt(new_description, samples) -profile = outlines.generate.json(model, DatingProfile)(prompt) -parsed_profile = DatingProfile.model_validate_json(profile) -``` - -## Results - -Here are a couple of results: - -```json -{ - "bio": """I'm an ambitious lawyer with a casual and fashionable style. I love - games and sports, but my true passion is preparing refreshing cocktails at - home and dressing to the nines at weddings. I'm currently looking for a woman - to show a good time to and get a kiss on the opulent suit I just had made. - Send resume to this inbox.""", - "job": "Lawyer", - "interests": - [ - "Stylish guys", - "Gaming", - "Ping pong", - "Cocktails", - "Weddings" - ], - "qna1": - { - "question": "The first item on my bucket list is", - "answer": "be married and have a family." - }, - "qna2": - { - "question": "People would describe me as", - "answer": "charming, stylish, and funny." - } -} -``` - -```json -{ - "bio": """I’m a sexy lawyer with time on my hands. I love to game and - play ping pong, but the real reason you should swipe to the right - is because I look great in a suit. Who doesn’t love a man in a - suit? Just saying. Send me a message if you think it’s time to take - your dating life to the next level.""", - "job": "Lawyer", - "interests": - [ - "Gaming", - "Ping Pong", - "Tailored Suits", - "Weddings", - "Streaming Services" - ], - "qna1": - { - "question": "The first item on my bucket list is", - "answer": "simulate space but stay alive for as long as possible" - }, - "qna2": - { - "question": "People would describe me as", - "answer": "easy-going, a little nerdy but with a mature essence" - } -} -``` diff --git a/docs/cookbook/deploy-using-bentoml.md b/docs/cookbook/deploy-using-bentoml.md deleted file mode 100644 index 6bee7744..00000000 --- a/docs/cookbook/deploy-using-bentoml.md +++ /dev/null @@ -1,224 +0,0 @@ -# Run Outlines using BentoML - -[BentoML](https://github.com/bentoml/BentoML) is an open-source model serving library for building performant and scalable AI applications with Python. It comes with tools that you need for serving optimization, model packaging, and production deployment. - -In this guide, we will show you how to use BentoML to run programs written with Outlines on GPU locally and in [BentoCloud](https://www.bentoml.com/), an AI Inference Platform for enterprise AI teams. 
The example source code in this guide is also available in the [examples/bentoml/](https://github.com/outlines-dev/outlines/blob/main/examples/bentoml/) directory. - -## Import a model - -First we need to download an LLM (Mistral-7B-v0.1 in this example and you can use any other LLM) and import the model into BentoML's [Model Store](https://docs.bentoml.com/en/latest/guides/model-store.html). Let's install BentoML and other dependencies from PyPi (preferably in a virtual environment): - -```bash -pip install -r requirements.txt -``` - -Then save the code snippet below as `import_model.py` and run `python import_model.py`. - -**Note**: You need to accept related conditions on [Hugging Face](https://huggingface.co/mistralai/Mistral-7B-v0.1) first to gain access to Mistral-7B-v0.1. - -```python -import bentoml - -MODEL_ID = "mistralai/Mistral-7B-v0.1" -BENTO_MODEL_TAG = MODEL_ID.lower().replace("/", "--") - -def import_model(model_id, bento_model_tag): - - import torch - from transformers import AutoModelForCausalLM, AutoTokenizer - - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - model = AutoModelForCausalLM.from_pretrained( - MODEL_ID, - torch_dtype=torch.float16, - low_cpu_mem_usage=True, - ) - - with bentoml.models.create(bento_model_tag) as bento_model_ref: - tokenizer.save_pretrained(bento_model_ref.path) - model.save_pretrained(bento_model_ref.path) - - -if __name__ == "__main__": - import_model(MODEL_ID, BENTO_MODEL_TAG) -``` - -You can verify the download is successful by running: - -```bash -$ bentoml models list - -Tag Module Size Creation Time -mistralai--mistral-7b-v0.1:m7lmf5ac2cmubnnz 13.49 GiB 2024-04-25 06:52:39 -``` - -## Define a BentoML Service - -As the model is ready, we can define a [BentoML Service](https://docs.bentoml.com/en/latest/guides/services.html) to wrap the capabilities of the model. - -We will run the JSON-structured generation example [in the README](https://github.com/outlines-dev/outlines?tab=readme-ov-file#efficient-json-generation-following-a-json-schema), with the following schema: - -```python -DEFAULT_SCHEMA = """{ - "title": "Character", - "type": "object", - "properties": { - "name": { - "title": "Name", - "maxLength": 10, - "type": "string" - }, - "age": { - "title": "Age", - "type": "integer" - }, - "armor": {"$ref": "#/definitions/Armor"}, - "weapon": {"$ref": "#/definitions/Weapon"}, - "strength": { - "title": "Strength", - "type": "integer" - } - }, - "required": ["name", "age", "armor", "weapon", "strength"], - "definitions": { - "Armor": { - "title": "Armor", - "description": "An enumeration.", - "enum": ["leather", "chainmail", "plate"], - "type": "string" - }, - "Weapon": { - "title": "Weapon", - "description": "An enumeration.", - "enum": ["sword", "axe", "mace", "spear", "bow", "crossbow"], - "type": "string" - } - } -}""" -``` - -First, we need to define a BentoML service by decorating an ordinary class (`Outlines` here) with `@bentoml.service` decorator. 
We pass to this decorator some configuration and GPU on which we want this service to run in BentoCloud (here an L4 with 24GB memory): - -```python -import typing as t -import bentoml - -from import_model import BENTO_MODEL_TAG - -@bentoml.service( - traffic={ - "timeout": 300, - }, - resources={ - "gpu": 1, - "gpu_type": "nvidia-l4", - }, -) -class Outlines: - - bento_model_ref = bentoml.models.get(BENTO_MODEL_TAG) - - def __init__(self) -> None: - - import outlines - import torch - self.model = outlines.models.transformers( - self.bento_model_ref.path, - device="cuda", - model_kwargs={"torch_dtype": torch.float16}, - ) - - ... -``` - -We then need to define an HTTP endpoint using `@bentoml.api` to decorate the method `generate` of `Outlines` class: - -```python - ... - - @bentoml.api - async def generate( - self, - prompt: str = "Give me a character description.", - json_schema: t.Optional[str] = DEFAULT_SCHEMA, - ) -> t.Dict[str, t.Any]: - - import outlines - - generator = outlines.generate.json(self.model, json_schema) - character = generator(prompt) - - return character -``` - -Here `@bentoml.api` decorator defines `generate` as an HTTP endpoint that accepts a JSON request body with two fields: `prompt` and `json_schema` (optional, which allows HTTP clients to provide their own JSON schema). The type hints in the function signature will be used to validate incoming JSON requests. You can define as many HTTP endpoints as you want by using `@bentoml.api` to decorate other methods of `Outlines` class. - -Now you can save the above code to `service.py` (or use [this implementation](https://github.com/outlines-dev/outlines/blob/main/examples/bentoml/)), and run the code using the BentoML CLI. - -## Run locally for testing and debugging - -Then you can run a server locally by: - -```bash -bentoml serve . -``` - -The server is now active at . You can interact with it using the Swagger UI or in other different ways: - -
-
-CURL
-
-```bash
-curl -X 'POST' \
-  'http://localhost:3000/generate' \
-  -H 'accept: application/json' \
-  -H 'Content-Type: application/json' \
-  -d '{
-  "prompt": "Give me a character description."
-}'
-```
-
-Python client
-
-```python
-import bentoml
-
-with bentoml.SyncHTTPClient("http://localhost:3000") as client:
-    response = client.generate(
-        prompt="Give me a character description"
-    )
-    print(response)
-```
-
- -Expected output: - -```bash -{ - "name": "Aura", - "age": 15, - "armor": "plate", - "weapon": "sword", - "strength": 20 -} -``` - -## Deploy to BentoCloud - -After the Service is ready, you can deploy it to [BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/get-started.html) for better management and scalability. [Sign up](https://cloud.bentoml.com/signup) if you haven't got a BentoCloud account. - -Make sure you have [logged in to BentoCloud](https://docs.bentoml.com/en/latest/bentocloud/how-tos/manage-access-token.html), then run the following command to deploy it. - -```bash -bentoml deploy . -``` - -Once the application is up and running on BentoCloud, you can access it via the exposed URL. - -**Note**: For custom deployment in your own infrastructure, use [BentoML to generate an OCI-compliant image](https://docs.bentoml.com/en/latest/guides/containerization.html). diff --git a/docs/cookbook/deploy-using-cerebrium.md b/docs/cookbook/deploy-using-cerebrium.md deleted file mode 100644 index 6fbaab39..00000000 --- a/docs/cookbook/deploy-using-cerebrium.md +++ /dev/null @@ -1,121 +0,0 @@ -# Run Outlines using Cerebrium - -[Cerebrium](https://www.cerebrium.ai/) is a serverless AI infrastructure platform that makes it easier for companies to build and deploy AI based applications. They offer Serverless GPU's with low cold start times with over 12 varieties of GPU chips that auto scale and you only pay for the compute you use. - -In this guide we will show you how you can use Cerebrium to run programs written with Outlines on GPUs in the cloud. - -# Setup Cerebrium - -First, we install Cerebrium and login to get authenticated. - -```bash -pip install cerebrium -cerebrium login -``` - -Then let us create our first project - -```bash -cerebrium init outlines-project -``` - -## Setup Environment and Hardware - -You set up your environment and hardware in the cerebrium.toml file that was created using the init function above. - -```toml -[cerebrium.deployment] -docker_base_image_url = "nvidia/cuda:12.1.1-runtime-ubuntu22.04" - -[cerebrium.hardware] -cpu = 2 -memory = 14.0 -gpu = "AMPERE A10" -gpu_count = 1 -provider = "aws" -region = "us-east-1" - -[cerebrium.dependencies.pip] -outline = "==0.0.37" -transformers = "==4.38.2" -datasets = "==2.18.0" -accelerate = "==0.27.2" -``` - -## Setup inference - -Running code in Cerebrium is like writing normal python with no special syntax. In a `main.py` file specify the following: - -```python -import outlines - - -model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct") - -schema = """{ - "title": "Character", - "type": "object", - "properties": { - "name": { - "title": "Name", - "maxLength": 10, - "type": "string" - }, - "age": { - "title": "Age", - "type": "integer" - }, - "armor": {"$ref": "#/definitions/Armor"}, - "weapon": {"$ref": "#/definitions/Weapon"}, - "strength": { - "title": "Strength", - "type": "integer" - } - }, - "required": ["name", "age", "armor", "weapon", "strength"], - "definitions": { - "Armor": { - "title": "Armor", - "description": "An enumeration.", - "enum": ["leather", "chainmail", "plate"], - "type": "string" - }, - "Weapon": { - "title": "Weapon", - "description": "An enumeration.", - "enum": ["sword", "axe", "mace", "spear", "bow", "crossbow"], - "type": "string" - } - } -}""" - -generator = outlines.generate.json(model, schema) -``` - -On first deploy, it will download the model and store it on disk therefore for subsequent calls it will load the model from disk. 
- -Every function in Cerebrium is callable through an API endpoint. Code at the top most layer (ie: not in a function) is instantiated only when the container is spun up the first time so for subsequent calls, it will simply run the code defined in the function you call. - -To deploy an API that creates a new character when called with a prompt you can add the following code to `main.py`: - -```python -def generate( - prompt: str = "Amiri, a 53 year old warrior woman with a sword and leather armor.", -): - - character = generator( - f"[INST]Give me a character description. Describe {prompt}.[/INST]" - ) - - return character -``` - - -## Run on the cloud - -```bash -cerebrium deploy -``` - -You will see your application deploy, install pip packages and download the model. Once completed it will output a CURL request you can use to call your endpoint. Just remember to end -the url with the function you would like to call - in this case /generate. You should see your response returned! diff --git a/docs/cookbook/deploy-using-modal.md b/docs/cookbook/deploy-using-modal.md deleted file mode 100644 index 835924d2..00000000 --- a/docs/cookbook/deploy-using-modal.md +++ /dev/null @@ -1,128 +0,0 @@ -# Run Outlines using Modal - -[Modal](https://modal.com/) is a serverless platform that allows you to easily run code on the cloud, including GPUs. It can come very handy for those of us who don't have a monster GPU at home and want to be able to quickly and easily provision, configure and orchestrate cloud infrastructure. - -In this guide we will show you how you can use Modal to run programs written with Outlines on GPU in the cloud. - -## Build the image - -First we need to define our container image. We download the Mistral-7B-v0.1 model from HuggingFace as part of the definition of the image so it only needs to be done once (you need to provide an [access token](https://huggingface.co/settings/tokens)) - -```python -from modal import Image, App, gpu - -app = App(name="outlines-app") - -outlines_image = Image.debian_slim(python_version="3.11").pip_install( - "outlines==0.0.37", - "transformers==4.38.2", - "datasets==2.18.0", - "accelerate==0.27.2", -) - -def import_model(): - import os - os.environ["HF_TOKEN"] = "YOUR_HUGGINGFACE_TOKEN" - import outlines - outlines.models.transformers("mistralai/Mistral-7B-Instruct-v0.2") - -outlines_image = outlines_image.run_function(import_model) -``` - -We will run the JSON-structured generation example [in the README](https://github.com/outlines-dev/outlines?tab=readme-ov-file#efficient-json-generation-following-a-json-schema), with the following schema: - -## Run inference - -```python -schema = """{ - "title": "Character", - "type": "object", - "properties": { - "name": { - "title": "Name", - "maxLength": 10, - "type": "string" - }, - "age": { - "title": "Age", - "type": "integer" - }, - "armor": {"$ref": "#/definitions/Armor"}, - "weapon": {"$ref": "#/definitions/Weapon"}, - "strength": { - "title": "Strength", - "type": "integer" - } - }, - "required": ["name", "age", "armor", "weapon", "strength"], - "definitions": { - "Armor": { - "title": "Armor", - "description": "An enumeration.", - "enum": ["leather", "chainmail", "plate"], - "type": "string" - }, - "Weapon": { - "title": "Weapon", - "description": "An enumeration.", - "enum": ["sword", "axe", "mace", "spear", "bow", "crossbow"], - "type": "string" - } - } -}""" -``` - -To make the inference work on Modal we need to wrap the corresponding function in a `@app.function` decorator. 
We pass to this decorator the image and GPU on which we want this function to run (here an A100 with 80GB memory): - -```python -@app.function(image=outlines_image, gpu=gpu.A100(size='80GB')) -def generate( - prompt: str = "Amiri, a 53 year old warrior woman with a sword and leather armor.", -): - import outlines - - model = outlines.models.transformers( - "mistralai/Mistral-7B-v0.1", device="cuda" - ) - - generator = outlines.generate.json(model, schema) - character = generator( - f"[INST]Give me a character description. Describe {prompt}.[/INST]" - ) - - print(character) -``` - -We then need to define a `local_entrypoint` to call our function `generate` remotely: - -```python -@app.local_entrypoint() -def main( - prompt: str = "Amiri, a 53 year old warrior woman with a sword and leather armor.", -): - generate.remote(prompt) -``` - -Here `@app.local_entrypoint()` decorator defines `main` as the function to start from locally when running the Modal CLI. You can save above code to `example.py` (or use [this implementation](https://github.com/outlines-dev/outlines/blob/main/examples/modal_example.py)). Let's now see how to run the code on the cloud using the Modal CLI. - -## Run on the cloud - -First install the Modal client from PyPi: - -```bash -pip install modal -``` - -You then need to obtain a token from Modal. To do so easily, run the following command: - -```bash -modal setup -``` - -Once that is set you can run inference on the cloud using: - -```bash -modal run example.py -``` - -You should see the Modal app initialize, and soon after see the result of the `print` function in your terminal. That's it! diff --git a/docs/cookbook/extraction.md b/docs/cookbook/extraction.md deleted file mode 100644 index 28317b6b..00000000 --- a/docs/cookbook/extraction.md +++ /dev/null @@ -1,87 +0,0 @@ -# Named entity extraction - -Named Entity Extraction is a fundamental problem in NLP. It involves identifying and categorizing named entities within a document: people, organization, dates, places, etc. It is usually the first step in a more complex NLP worklow. Here we will use the example of a pizza restaurant that receives orders via their website and need to identify the number and types of pizzas that are being ordered. - -Getting LLMs to output the extracted entities in a structured format can be challenging. In this tutorial we will see how we can use Outlines' JSON-structured generation to extract entities from a document and return them in a valid JSON data structure 100% of the time. - -As always, we start with initializing the model. We will be using a quantized version of Mistal-7B-v0.1 (we're GPU poor): - -```python -import outlines - -model = outlines.models.transformers("TheBloke/Mistral-7B-OpenOrca-AWQ", device="cuda") -``` - -And we will be using the following prompt template: - -```python -@outlines.prompt -def take_order(order): - """You are the owner of a pizza parlor. Customers \ - send you orders from which you need to extract: - - 1. The pizza that is ordered - 2. The number of pizzas - - # EXAMPLE - - ORDER: I would like one Margherita pizza - RESULT: {"pizza": "Margherita", "number": 1} - - # OUTPUT INSTRUCTIONS - - Answer in valid JSON. 
Here are the different objects relevant for the output: - - Order: - pizza (str): name of the pizza - number (int): number of pizzas - - Return a valid JSON of type "Order" - - # OUTPUT - - ORDER: {{ order }} - RESULT: """ -``` - -We now define our data model using Pydantic: - -```python -from enum import Enum -from pydantic import BaseModel - -class Pizza(str, Enum): - margherita = "Margherita" - pepperonni = "Pepperoni" - calzone = "Calzone" - -class Order(BaseModel): - pizza: Pizza - number: int -``` - -We can now define our generator and call it on several incoming orders: - -```python -orders = [ - "Hi! I would like to order two pepperonni pizzas and would like them in 30mins.", - "Is it possible to get 12 margheritas?" -] -prompts = [take_order(order) for order in orders] - -generator = outlines.generate.json(model, Order) - -results = generator(prompts) -print(results) -# [Order(pizza=, number=2), -# Order(pizza=, number=12)] -``` - -There are several ways you could improve this example: - -- Clients may order several types of pizzas. -- Clients may order drinks as well. -- If the pizza place has a delivery service we need to extract the client's address and phone number -- Clients may specify the time for which they want the pizza. We could then check against a queuing system and reply to them with the estimated delivery time. - -How would you change the Pydantic model to account for these use cases? diff --git a/docs/cookbook/images/chain_of_density.png b/docs/cookbook/images/chain_of_density.png deleted file mode 100644 index 61e01f40..00000000 Binary files a/docs/cookbook/images/chain_of_density.png and /dev/null differ diff --git a/docs/cookbook/images/coding_structure_diagram.png b/docs/cookbook/images/coding_structure_diagram.png deleted file mode 100644 index c30e04a5..00000000 Binary files a/docs/cookbook/images/coding_structure_diagram.png and /dev/null differ diff --git a/docs/cookbook/images/knowledge-graph-extraction.png b/docs/cookbook/images/knowledge-graph-extraction.png deleted file mode 100644 index a24b1c61..00000000 Binary files a/docs/cookbook/images/knowledge-graph-extraction.png and /dev/null differ diff --git a/docs/cookbook/images/simtom.png b/docs/cookbook/images/simtom.png deleted file mode 100644 index 5f6c926d..00000000 Binary files a/docs/cookbook/images/simtom.png and /dev/null differ diff --git a/docs/cookbook/index.md b/docs/cookbook/index.md deleted file mode 100644 index a844ce24..00000000 --- a/docs/cookbook/index.md +++ /dev/null @@ -1,14 +0,0 @@ -# Examples - -This part of the documentation provides a few cookbooks that you can browse to get acquainted with the library and get some inspiration about what you could do with structured generation. Remember that you can easily change the model that is being used! - -- [Classification](classification.md): Classify customer requests. -- [Named Entity Extraction](extraction.md): Extract information from pizza orders. -- [Dating Profile](dating_profiles.md): Build dating profiles from descriptions using prompt templating and JSON-structured generation. -- [Chain Of Density](chain_of_density.md): Summarize documents using chain of density prompting and JSON-structured generation. -- [Playing Chess](models_playing_chess.md): Make Phi-3 Mini play chess against itself using regex-structured generation. -- [SimToM](simtom.md): Improve LLMs' Theory of Mind capabilities with perspective-taking prompting and JSON-structured generation. 
-- [Q&A with Citations](qa-with-citations.md): Answer questions and provide citations using JSON-structured generation. -- [Knowledge Graph Generation](knowledge_graph_extraction.md): Generate a Knowledge Graph from unstructured text using JSON-structured generation. -- [Chain Of Thought (CoT)](chain_of_thought.md): Generate a series of intermediate reasoning steps using regex-structured generation. -- [ReAct Agent](react_agent.md): Build an agent with open weights models using regex-structured generation. diff --git a/docs/cookbook/knowledge_graph_extraction.md b/docs/cookbook/knowledge_graph_extraction.md deleted file mode 100644 index e25166bc..00000000 --- a/docs/cookbook/knowledge_graph_extraction.md +++ /dev/null @@ -1,155 +0,0 @@ -# Knowledge Graph Extraction - -In this guide, we use [outlines](https://outlines-dev.github.io/outlines/) to extract a knowledge graph from unstructured text. - -We will use [llama.cpp](https://github.com/ggerganov/llama.cpp) using the [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) library. Outlines supports llama-cpp-python, but we need to install it ourselves: - -```bash -pip install llama-cpp-python -``` - -We download the model weights by passing the name of the repository on the HuggingFace Hub, and the filenames (or glob pattern): -```python -import llama_cpp -from outlines import generate, models - -model = models.llamacpp("NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF", - "Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf", - tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained( - "NousResearch/Hermes-2-Pro-Llama-3-8B" - ), - n_gpu_layers=-1, - flash_attn=True, - n_ctx=8192, - verbose=False) -``` - -??? note "(Optional) Store the model weights in a custom folder" - - By default the model weights are downloaded to the hub cache but if we want so store the weights in a custom folder, we pull a quantized GGUF model [Hermes-2-Pro-Llama-3-8B](https://huggingface.co/NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF) by [NousResearch](https://nousresearch.com/) from [HuggingFace](https://huggingface.co/): - - ```bash - wget https://hf.co/NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/resolve/main/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf - ``` - - We initialize the model: - - ```python - import llama_cpp - from llama_cpp import Llama - from outlines import generate, models - - llm = Llama( - "/path/to/model/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf", - tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained( - "NousResearch/Hermes-2-Pro-Llama-3-8B" - ), - n_gpu_layers=-1, - flash_attn=True, - n_ctx=8192, - verbose=False - ) - ``` - -## Knowledge Graph Extraction - -We first need to define our Pydantic class for each node and each edge of the knowledge graph: - -```python -from pydantic import BaseModel, Field - -class Node(BaseModel): - """Node of the Knowledge Graph""" - - id: int = Field(..., description="Unique identifier of the node") - label: str = Field(..., description="Label of the node") - property: str = Field(..., description="Property of the node") - - -class Edge(BaseModel): - """Edge of the Knowledge Graph""" - - source: int = Field(..., description="Unique source of the edge") - target: int = Field(..., description="Unique target of the edge") - label: str = Field(..., description="Label of the edge") - property: str = Field(..., description="Property of the edge") -``` - -We then define the Pydantic class for the knowledge graph and get its JSON schema: - -```python -from typing import List - -class KnowledgeGraph(BaseModel): - 
"""Generated Knowledge Graph""" - - nodes: List[Node] = Field(..., description="List of nodes of the knowledge graph") - edges: List[Edge] = Field(..., description="List of edges of the knowledge graph") - -schema = KnowledgeGraph.model_json_schema() -``` - -We then need to adapt our prompt to the [Hermes prompt format for JSON schema](https://github.com/NousResearch/Hermes-Function-Calling?tab=readme-ov-file#prompt-format-for-json-mode--structured-outputs): - -```python -def generate_hermes_prompt(user_prompt): - return ( - "<|im_start|>system\n" - "You are a world class AI model who answers questions in JSON " - f"Here's the json schema you must adhere to:\n\n{schema}\n<|im_end|>\n" - "<|im_start|>user\n" - + user_prompt - + "<|im_end|>" - + "\n<|im_start|>assistant\n" - "" - ) -``` - -For a given user prompt, for example: - -```python -user_prompt = "Alice loves Bob and she hates Charlie." -``` - -We can use `generate.json` by passing the Pydantic class we previously defined, and call the generator with the Hermes prompt: - -```python -from outlines import generate, models - -model = models.LlamaCpp(llm) -generator = generate.json(model, KnowledgeGraph) -prompt = generate_hermes_prompt(user_prompt) -response = generator(prompt, max_tokens=1024, temperature=0, seed=42) -``` - -We obtain the nodes and edges of the knowledge graph: - -```python -print(response.nodes) -print(response.edges) -# [Node(id=1, label='Alice', property='Person'), -# Node(id=2, label='Bob', property='Person'), -# Node(id=3, label='Charlie', property='Person')] -# [Edge(source=1, target=2, label='love', property='Relationship'), -# Edge(source=1, target=3, label='hate', property='Relationship')] -``` - -## (Optional) Visualizing the Knowledge Graph - -We can use the [Graphviz library](https://graphviz.readthedocs.io/en/stable/) to visualize the generated knowledge graph. For detailed installation instructions, see [here](https://graphviz.readthedocs.io/en/stable/#installation). - -```python -from graphviz import Digraph - -dot = Digraph() -for node in response.nodes: - dot.node(str(node.id), node.label, shape='circle', width='1', height='1') -for edge in response.edges: - dot.edge(str(edge.source), str(edge.target), label=edge.label) - -dot.render('knowledge-graph.gv', view=True) -``` - -![Image of the Extracted Knowledge Graph](./images/knowledge-graph-extraction.png) - -This example was originally contributed by [Alonso Silva](https://github.com/alonsosilvaallende). diff --git a/docs/cookbook/models_playing_chess.md b/docs/cookbook/models_playing_chess.md deleted file mode 100644 index a0df2eb2..00000000 --- a/docs/cookbook/models_playing_chess.md +++ /dev/null @@ -1,88 +0,0 @@ -# Large language models playing chess - -In this example we will make a Phi-2 model play chess against itself. On its own the model easily generates invalid moves, so we will give it a little help. At each step we will generate a regex that only matches valid move, and use it to help the model only generating valid moves. - -## The chessboard - -The game will be played on a standard checkboard. We will use the `chess` [library](https://github.com/niklasf/python-chess) to track the opponents' moves, and check that the moves are valid. 
- -```python -%pip install outlines -q -%pip install chess -q -%pip install transformers accelerate einops -q - -import chess - -board = chess.Board("rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1") -``` - -## The opponents - -Phi-2 will be playing against itself: - -```python -from outlines import models - -model = models.transformers("microsoft/Phi-3-mini-4k-instruct") - -``` - -## A little help for the language model - -To make sure Phi-2 generates valid chess moves we will use Outline's regex-structured generation. We define a function that takes the current state of the board and returns a regex that matches all possible legal moves: - -```python -import re - -def legal_moves_regex(board): - """Build a regex that only matches valid moves.""" - legal_moves = list(board.legal_moves) - legal_modes_str = [board.san(move) for move in legal_moves] - legal_modes_str = [re.sub(r"[+#]", "", move) for move in legal_modes_str] - regex_pattern = "|".join(re.escape(move) for move in legal_modes_str) - regex_pattern = f"{regex_pattern}" - return regex_pattern -``` - -## Prompting the language model - -The prompt corresponds to the current state of the board, so we start with: - -```python -prompt = "Let's play Chess. Moves: " - -``` - -We update the prompt at each step so it reflects the state of the board after the previous move. - -## Let's play - -```python -from outlines import generate - -board_state = " " -turn_number = 0 -while not board.is_game_over(): - regex_pattern = legal_moves_regex(board) - structured = generate.regex(model, regex_pattern)(prompt + board_state) - move = board.parse_san(structured) - - if turn_number % 2 == 0 : # It's White's turn - board_state += board.san(move) + " " - else: - board_state += board.san(move) + " " + str(turn_number) + "." - - turn_number += 1 - - board.push(move) - - print(board_state) -``` - -Interestingly enough, Phi-2 hates capturing. - -```pgn - e4 e5 1.Nf3 Ne7 3.b4 Nf5 5.Nc3 Ne7 7.Bb5 a6 9.Na4 b6 11.c3 Nec6 13.c4 a5 15.d4 Qg5 17.Nd2 Bb7 19.dxe5 -``` - -*This example was originally authored by [@903124S](https://x.com/903124S) in [this gist](https://gist.github.com/903124/cfbefa24da95e2316e0d5e8ef8ed360d).* diff --git a/docs/cookbook/qa-with-citations.md b/docs/cookbook/qa-with-citations.md deleted file mode 100644 index 79a2214c..00000000 --- a/docs/cookbook/qa-with-citations.md +++ /dev/null @@ -1,255 +0,0 @@ -# Generate Synthetic Data and Q&A with Citations - -This tutorial is adapted from the [instructor-ollama notebook](https://github.com/alonsosilvaallende/Hermes-Function-Calling/blob/main/examples/instructor_ollama.ipynb). We start with a simple example to generate synthetic data and then we approach the problem of question answering by providing citations. - -We will use [llama.cpp](https://github.com/ggerganov/llama.cpp) using the [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) library. Outlines supports llama-cpp-python, but we need to install it ourselves: - -```bash -pip install llama-cpp-python -``` - -We download the model weights by passing the name of the repository on the HuggingFace Hub, and the filenames (or glob pattern): -```python -import llama_cpp -from outlines import generate, models - -model = models.llamacpp("NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF", - "Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf", - tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained( - "NousResearch/Hermes-2-Pro-Llama-3-8B" - ), - n_gpu_layers=-1, - flash_attn=True, - n_ctx=8192, - verbose=False) -``` - -??? 
note "(Optional) Store the model weights in a custom folder" - - By default the model weights are downloaded to the hub cache but if we want so store the weights in a custom folder, we pull a quantized GGUF model [Hermes-2-Pro-Llama-3-8B](https://huggingface.co/NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF) by [NousResearch](https://nousresearch.com/) from [HuggingFace](https://huggingface.co/): - - ```bash - wget https://hf.co/NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/resolve/main/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf - ``` - - We initialize the model: - - ```python - import llama_cpp - from llama_cpp import Llama - from outlines import generate, models - - llm = Llama( - "/path/to/model/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf", - tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained( - "NousResearch/Hermes-2-Pro-Llama-3-8B" - ), - n_gpu_layers=-1, - flash_attn=True, - n_ctx=8192, - verbose=False - ) - ``` - -## Generate Synthetic Data - -We first need to define our Pydantic class for a user: - -```python -from pydantic import BaseModel, Field - -class UserDetail(BaseModel): - id: int = Field(..., description="Unique identifier") # so the model keeps track of the number of users - first_name: str - last_name: str - age: int -``` - -We then define a Pydantic class for a list of users: - -```python -from typing import List - -class Users(BaseModel): - users: List[UserDetail] -``` - -We can use a `generate.json` by passing this Pydantic class we just defined, and call the generator: - -```python -model = models.LlamaCpp(llm) -generator = generate.json(model, Users) -response = generator("Create 5 fake users", max_tokens=1024, temperature=0, seed=42) -print(response.users) -# [UserDetail(id=1, first_name='John', last_name='Doe', age=25), -# UserDetail(id=2, first_name='Jane', last_name='Doe', age=30), -# UserDetail(id=3, first_name='Bob', last_name='Smith', age=40), -# UserDetail(id=4, first_name='Alice', last_name='Smith', age=35), -# UserDetail(id=5, first_name='John', last_name='Smith', age=20)] -``` - -```python -for user in response.users: - print(user.first_name) - print(user.last_name) - print(user.age) - print(#####) -# John -# Doe -# 25 -# ##### -# Jane -# Doe -# 30 -# ##### -# Bob -# Smith -# 40 -# ##### -# Alice -# Smith -# 35 -# ##### -# John -# Smith -# 20 -# ##### -``` - -## QA with Citations - -We first need to define our Pydantic class for QA with citations: - -```python -from typing import List -from pydantic import BaseModel - -class QuestionAnswer(BaseModel): - question: str - answer: str - citations: List[str] - -schema = QuestionAnswer.model_json_schema() -``` - -We then need to adapt our prompt to the [Hermes prompt format for JSON schema](https://github.com/NousResearch/Hermes-Function-Calling?tab=readme-ov-file#prompt-format-for-json-mode--structured-outputs): - -```python -def generate_hermes_prompt(question, context, schema=schema): - return ( - "<|im_start|>system\n" - "You are a world class AI model who answers questions in JSON with correct and exact citations " - "extracted from the `Context`. " - f"Here's the json schema you must adhere to:\n\n{schema}\n<|im_end|>\n" - "<|im_start|>user\n" - + "`Context`: " - + context - + "\n`Question`: " - + question + "<|im_end|>" - + "\n<|im_start|>assistant\n" - "" - ) -``` - -We can use `generate.json` by passing the Pydantic class we previously defined, and call the generator with Hermes prompt: - -```python -question = "What did the author do during college?" 
-context = """ -My name is Jason Liu, and I grew up in Toronto Canada but I was born in China. -I went to an arts high school but in university I studied Computational Mathematics and physics. -As part of coop I worked at many companies including Stitchfix, Facebook. -I also started the Data Science club at the University of Waterloo and I was the president of the club for 2 years. -""" -generator = generate.json(model, QuestionAnswer) -prompt = generate_hermes_prompt(question, context) -response = generator(prompt, max_tokens=1024, temperature=0, seed=42) -print(response) -# QuestionAnswer(question='What did the author do during college?', answer='The author studied Computational Mathematics and physics in university and was also involved in starting the Data Science club, serving as its president for 2 years.', citations=['I went to an arts high school but in university I studied Computational Mathematics and physics.', 'I also started the Data Science club at the University of Waterloo and I was the president of the club for 2 years.']) -``` - -We can do the same for a list of question-context pairs: - -```python -question1 = "Where was John born?" -context1 = """ -John Doe is a software engineer who was born in New York, USA. -He studied Computer Science at the Massachusetts Institute of Technology. -During his studies, he interned at Google and Microsoft. -He also founded the Artificial Intelligence club at his university and served as its president for three years. -""" - -question2 = "What did Emily study in university?" -context2 = """ -Emily Smith is a data scientist from London, England. -She attended the University of Cambridge where she studied Statistics and Machine Learning. -She interned at IBM and Amazon during her summer breaks. -Emily was also the head of the Women in Tech society at her university. -""" - -question3 = "Which companies did Robert intern at?" -context3 = """ -Robert Johnson, originally from Sydney, Australia, is a renowned cybersecurity expert. -He studied Information Systems at the University of Melbourne. -Robert interned at several cybersecurity firms including NortonLifeLock and McAfee. -He was also the leader of the Cybersecurity club at his university. -""" - -question4 = "What club did Alice start at her university?" -context4 = """ -Alice Williams, a native of Dublin, Ireland, is a successful web developer. -She studied Software Engineering at Trinity College Dublin. -Alice interned at several tech companies including Shopify and Squarespace. -She started the Web Development club at her university and was its president for two years. -""" - -question5 = "What did Michael study in high school?" -context5 = """ -Michael Brown is a game developer from Tokyo, Japan. -He attended a specialized high school where he studied Game Design. -He later attended the University of Tokyo where he studied Computer Science. -Michael interned at Sony and Nintendo during his university years. -He also started the Game Developers club at his university. -""" - -for question, context in [ - (question1, context1), - (question2, context2), - (question3, context3), - (question4, context4), - (question5, context5), -]: - final_prompt = my_final_prompt(question, context) - generator = generate.json(model, QuestionAnswer) - response = generator(final_prompt, max_tokens=1024, temperature=0, seed=42) - display(question) - display(response.answer) - display(response.citations) - print("\n\n") - -# 'Where was John born?' -# 'John Doe was born in New York, USA.' 
-# ['John Doe is a software engineer who was born in New York, USA.'] -# -# -# 'What did Emily study in university?' -# 'Emily studied Statistics and Machine Learning in university.' -# ['She attended the University of Cambridge where she studied Statistics and Machine Learning.'] -# -# -# 'Which companies did Robert intern at?' -# 'Robert interned at NortonLifeLock and McAfee.' -# ['Robert Johnson, originally from Sydney, Australia, is a renowned cybersecurity expert. He interned at several cybersecurity firms including NortonLifeLock and McAfee.'] -# -# -# 'What club did Alice start at her university?' -# 'Alice started the Web Development club at her university.' -# ['Alice Williams, a native of Dublin, Ireland, is a successful web developer. She started the Web Development club at her university and was its president for two years.'] -# -# -# 'What did Michael study in high school?' -# 'Michael studied Game Design in high school.' -# ['Michael Brown is a game developer from Tokyo, Japan. He attended a specialized high school where he studied Game Design.'] -``` - -This example was originally contributed by [Alonso Silva](https://github.com/alonsosilvaallende). diff --git a/docs/cookbook/react_agent.md b/docs/cookbook/react_agent.md deleted file mode 100644 index ca4829d5..00000000 --- a/docs/cookbook/react_agent.md +++ /dev/null @@ -1,273 +0,0 @@ -# ReAct Agent - -This example shows how to use [outlines](https://outlines-dev.github.io/outlines/) to build your own agent with open weights local models and structured outputs. It is inspired by the blog post [A simple Python implementation of the ReAct pattern for LLMs](https://til.simonwillison.net/llms/python-react-pattern) by [Simon Willison](https://simonwillison.net/). - -The ReAct pattern (for Reason+Act) is described in the paper [ReAct: Synergizing Reasoning and Acting in Language Models](https://arxiv.org/abs/2210.03629). It's a pattern where you implement additional actions that an LLM can take - searching Wikipedia or running calculations for example - and then teach it how to request the execution of those actions, and then feed their results back into the LLM. - -Additionally, we give the LLM the possibility of using a scratchpad described in the paper [Show Your Work: Scratchpads for Intermediate Computation with Language Models](https://arxiv.org/abs/2112.00114) which improves the ability of LLMs to perform multi-step computations. - -We use [llama.cpp](https://github.com/ggerganov/llama.cpp) using the [llama-cpp-python](https://github.com/abetlen/llama-cpp-python) library. Outlines supports llama-cpp-python, but we need to install it ourselves: - -```bash -pip install llama-cpp-python -``` - -We download the model weights by passing the name of the repository on the HuggingFace Hub, and the filenames (or glob pattern): -```python -import llama_cpp -from outlines import generate, models - -model = models.llamacpp("NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF", - "Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf", - tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained( - "NousResearch/Hermes-2-Pro-Llama-3-8B" - ), - n_gpu_layers=-1, - flash_attn=True, - n_ctx=8192, - verbose=False) -``` - -??? 
note "(Optional) Store the model weights in a custom folder" - - By default the model weights are downloaded to the hub cache but if we want so store the weights in a custom folder, we pull a quantized GGUF model [Hermes-2-Pro-Llama-3-8B](https://huggingface.co/NousResearch/Hermes-2-Theta-Llama-3-8B-GGUF) by [NousResearch](https://nousresearch.com/) from [HuggingFace](https://huggingface.co/): - - ```bash - wget https://hf.co/NousResearch/Hermes-2-Pro-Llama-3-8B-GGUF/resolve/main/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf - ``` - - We initialize the model: - - ```python - import llama_cpp - from llama_cpp import Llama - from outlines import generate, models - - llm = Llama( - "/path/to/model/Hermes-2-Pro-Llama-3-8B-Q4_K_M.gguf", - tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained( - "NousResearch/Hermes-2-Pro-Llama-3-8B" - ), - n_gpu_layers=-1, - flash_attn=True, - n_ctx=8192, - verbose=False - ) - ``` - -## Build a ReAct agent - -In this example, we use two tools: - -- wikipedia: \ - search Wikipedia and returns the snippet of the first result -- calculate: \ - evaluate an expression using Python's eval() function - -```python -import httpx - -def wikipedia(q): - return httpx.get("https://en.wikipedia.org/w/api.php", params={ - "action": "query", - "list": "search", - "srsearch": q, - "format": "json" - }).json()["query"]["search"][0]["snippet"] - - -def calculate(numexp): - return eval(numexp) -``` - -We define the logic of the agent through a Pydantic class. First, we want the LLM to decide only between the two previously defined tools: - -```python -from enum import Enum - -class Action(str, Enum): - wikipedia = "wikipedia" - calculate = "calculate" -``` - -Our agent will loop through Thought and Action. We explicitly give the Action Input field so it doesn't forget to add the arguments of the Action. We also add a scratchpad (optional). - -```python -from pydantic import BaseModel, Field - -class Reason_and_Act(BaseModel): - Scratchpad: str = Field(..., description="Information from the Observation useful to answer the question") - Thought: str = Field(..., description="It describes your thoughts about the question you have been asked") - Action: Action - Action_Input: str = Field(..., description="The arguments of the Action.") -``` - -Our agent will reach a Final Answer. We also add a scratchpad (optional). - -```python -class Final_Answer(BaseModel): - Scratchpad: str = Field(..., description="Information from the Observation useful to answer the question") - Final_Answer: str = Field(..., description="Answer to the question grounded on the Observation") -``` - -Our agent will decide when it has reached a Final Answer and therefore to stop the loop of Thought and Action. 
- -```python -from typing import Union - -class Decision(BaseModel): - Decision: Union[Reason_and_Act, Final_Answer] -``` - -We could generate a response using the json schema but we will use the regex and check that everything is working as expected: - -```python -from outlines.integrations.utils import convert_json_schema_to_str -from outlines.fsm.json_schema import build_regex_from_schema - -json_schema = Decision.model_json_schema() -schema_str = convert_json_schema_to_str(json_schema=json_schema) -regex_str = build_regex_from_schema(schema_str) -print(regex_str) -# '\\{[ ]?"Decision"[ ]?:[ ]?(\\{[ ]?"Scratchpad"[ ]?:[ ]?"([^"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\["\\\\])*"[ ]?,[ ]?"Thought"[ ]?:[ ]?"([^"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\["\\\\])*"[ ]?,[ ]?"Action"[ ]?:[ ]?("wikipedia"|"calculate")[ ]?,[ ]?"Action_Input"[ ]?:[ ]?"([^"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\["\\\\])*"[ ]?\\}|\\{[ ]?"Scratchpad"[ ]?:[ ]?"([^"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\["\\\\])*"[ ]?,[ ]?"Final_Answer"[ ]?:[ ]?"([^"\\\\\\x00-\\x1F\\x7F-\\x9F]|\\\\["\\\\])*"[ ]?\\})[ ]?\\}' -``` - -We then need to adapt our prompt to the [Hermes prompt format for JSON schema](https://github.com/NousResearch/Hermes-Function-Calling?tab=readme-ov-file#prompt-format-for-json-mode--structured-outputs) and explain the agent logic: - -```python -import datetime - -def generate_hermes_prompt(question, schema=""): - return ( - "<|im_start|>system\n" - "You are a world class AI model who answers questions in JSON with correct Pydantic schema. " - f"Here's the json schema you must adhere to:\n\n{schema}\n\n" - "Today is " + datetime.datetime.today().strftime('%Y-%m-%d') + ".\n" + - "You run in a loop of Scratchpad, Thought, Action, Action Input, PAUSE, Observation. " - "At the end of the loop you output a Final Answer. " - "Use Scratchpad to store the information from the Observation useful to answer the question " - "Use Thought to describe your thoughts about the question you have been asked " - "and reflect carefully about the Observation if it exists. " - "Use Action to run one of the actions available to you. " - "Use Action Input to input the arguments of the selected action - then return PAUSE. " - "Observation will be the result of running those actions. " - "Your available actions are:\n" - "calculate:\n" - "e.g. calulate: 4**2 / 3\n" - "Runs a calculation and returns the number - uses Python so be sure to use floating point syntax if necessary\n" - "wikipedia:\n" - "e.g. wikipedia: Django\n" - "Returns a summary from searching Wikipedia\n" - "DO NOT TRY TO GUESS THE ANSWER. Begin! 
<|im_end|>" - "\n<|im_start|>user\n" + question + "<|im_end|>" - "\n<|im_start|>assistant\n" - ) -``` - -We define a ChatBot class - -```python -class ChatBot: - def __init__(self, prompt=""): - self.prompt = prompt - - def __call__(self, user_prompt): - self.prompt += user_prompt - result = self.execute() - return result - - def execute(self): - generator = generate.regex(model, regex_str) - result = generator(self.prompt, max_tokens=1024, temperature=0, seed=42) - return result -``` - -We define a query function: - -```python -import json - -def query(question, max_turns=5): - i = 0 - next_prompt = ( - "\n<|im_start|>user\n" + question + "<|im_end|>" - "\n<|im_start|>assistant\n" - ) - previous_actions = [] - while i < max_turns: - i += 1 - prompt = generate_hermes_prompt(question=question, schema=Decision.model_json_schema()) - bot = ChatBot(prompt=prompt) - result = bot(next_prompt) - json_result = json.loads(result)['Decision'] - if "Final_Answer" not in list(json_result.keys()): - scratchpad = json_result['Scratchpad'] if i == 0 else "" - thought = json_result['Thought'] - action = json_result['Action'] - action_input = json_result['Action_Input'] - print(f"\x1b[34m Scratchpad: {scratchpad} \x1b[0m") - print(f"\x1b[34m Thought: {thought} \x1b[0m") - print(f"\x1b[36m -- running {action}: {str(action_input)}\x1b[0m") - if action + ": " + str(action_input) in previous_actions: - observation = "You already run that action. **TRY A DIFFERENT ACTION INPUT.**" - else: - if action=="calculate": - try: - observation = eval(str(action_input)) - except Exception as e: - observation = f"{e}" - elif action=="wikipedia": - try: - observation = wikipedia(str(action_input)) - except Exception as e: - observation = f"{e}" - print() - print(f"\x1b[33m Observation: {observation} \x1b[0m") - print() - previous_actions.append(action + ": " + str(action_input)) - next_prompt += ( - "\nScratchpad: " + scratchpad + - "\nThought: " + thought + - "\nAction: " + action + - "\nAction Input: " + action_input + - "\nObservation: " + str(observation) - ) - else: - scratchpad = json_result["Scratchpad"] - final_answer = json_result["Final_Answer"] - print(f"\x1b[34m Scratchpad: {scratchpad} \x1b[0m") - print(f"\x1b[34m Final Answer: {final_answer} \x1b[0m") - return final_answer - print(f"\nFinal Answer: I am sorry, but I am unable to answer your question. Please provide more information or a different question.") - return "No answer found" -``` - -We can now test our ReAct agent: - -```python -print(query("What's 2 to the power of 10?")) -# Scratchpad: -# Thought: I need to perform a mathematical calculation to find the result of 2 to the power of 10. -# -- running calculate: 2**10 -# -# Observation: 1024 -# -# Scratchpad: 2 to the power of 10 is 1024. -# Final Answer: 2 to the power of 10 is 1024. -# 2 to the power of 10 is 1024. -``` - -```python -print(query("What does England share borders with?")) -# Scratchpad: -# Thought: To answer this question, I will use the 'wikipedia' action to gather information about England's geographical location and its borders. -# -- running wikipedia: England borders -# -# Observation: Anglo-Scottish border (Scottish Gaelic: Crìochan Anglo-Albannach) is an internal border of the United Kingdom separating Scotland and England which runs for -# -# Scratchpad: Anglo-Scottish border (Scottish Gaelic: Crìochan Anglo-Albannach) is an internal border of the United Kingdom separating Scotland and England which runs for -# Final Answer: England shares a border with Scotland. 
-# England shares a border with Scotland. -``` - -As mentioned in Simon's blog post, this is not a very robust implementation at all and there's a ton of room for improvement. But it is lovely how simple it is with a few lines of Python to make these extra capabilities available to the LLM. And now you can run it locally with an open weights LLM. - -This example was originally contributed by [Alonso Silva](https://github.com/alonsosilvaallende). diff --git a/docs/cookbook/simtom.md b/docs/cookbook/simtom.md deleted file mode 100644 index aa96005b..00000000 --- a/docs/cookbook/simtom.md +++ /dev/null @@ -1,129 +0,0 @@ -# Build perspective-taking agents with SimToM - -Prompting strategies like Chain-of-Thought (CoT) can improve LLMs' reasoning capabilities. However, they underwhelm in tasks that require keeping track of inconsistent world states. [SimToM](https://arxiv.org/abs/2311.10227) proposes a simple, two-stage prompting framework for LLMs inspired by Simulation Theory. The authors showed that this approach outperforms zero-shot prompting and CoT on ToMI and BigToM, two benchmarks with Theory of Mind questions. - -In this example, we will implement SimToM with a few lines of code using Outlines' prompt templating and structured generation capabilities. - -## How SimToM works - -SimToM calls an LLM with two consecutive prompts: - -1. **Perspective-taking**: The first prompt receives a `story` and a `character`. The goal is to understand the situation based on the character's point of view and filter out the rest of the story. -2. **Question-Answering**: The second prompt receives the character's point of view from the previous step and tasks the LLM to answer a question using that context. - -![Figure 2 in the paper](./images/simtom.png) - -## Outlines implementation - -To implement SimToM with Outlines, we will need to: - -1. Write the prompts with [prompt functions](https://outlines-dev.github.io/outlines/reference/prompting/). -2. Define the JSON object each prompt will return using Pydantic. -3. Generate responses with a Mistral model using the [transformers integration](https://outlines-dev.github.io/outlines/reference/models/transformers/). - -Let's dive into it! - -### Using Prompt Functions - -With Outlines, you can write your prompts as Python functions by adding the `@outlines.prompt` decorator. The prompt template is contained in their docstring, and their arguments correspond to variables used in the prompt. - -The authors have shared their code, prompts and data in [this GitHub repository](https://github.com/shawnsihyunlee/simulatedtom). Below, we define in Outlines the prompts they used for the ToMI dataset: - -```python -import outlines - - -@outlines.prompt -def perspective_taking(story: str, character: str) -> None: - """[INST] The following is a sequence of events about some characters, that takes place in multiple locations. - Your job is to output only the events that the specified character, {{character}}, knows about. - - Here are a few rules: - 1. A character knows about all events that they do. - 2. If a character is in a certain room/location, that character knows about all other events that happens in the room. This includes other characters leaving or exiting the location, the locations of objects in that location, and whether somebody moves an object to another place. - 3. If a character leaves a location, and is NOT in that location, they no longer know about any events that happen within that location. However, they can re-enter the location. 
- - Story: {{story}} - What events does {{character}} know about? Only output the events according to the above rules, do not provide an explanation. [/INST]""" # noqa - -@outlines.prompt -def simulation(events: list, name: str, question: str) -> None: - """[INST] {% for event in events %} - {{event}} - {% endfor %} - You are {{name}}. - Based on the above information, answer the following question: - {{question}} - You must choose one of the above choices, do not say there is not enough information. Answer with a single word, do not output anything else. [/INST]""" # noqa -``` - -### JSON Structured Generation - -Outlines guarantees that the LLM will return a valid JSON object, which we can specify as a Pydantic model. - -We will need two Pydantic models for SimToM, one for each prompt: - -```python -from pydantic import BaseModel, Field -from typing import List - - -class PerspectiveTaking(BaseModel): - """This is for the first prompt.""" - character: str = Field(description="The character we extract the events for.") - events: List[str] = Field(description="All events that the character knows about.") - - -class Simulation(BaseModel): - """This is for the second prompt.""" - answer: str -``` - -### Calling an LLM - -Let's try SimToM with an example from the ToMI dataset: - -```python -story = """ -1 Aria entered the front_yard. -2 Aiden entered the front_yard. -3 The grapefruit is in the green_bucket. -4 Aria moved the grapefruit to the blue_container. -5 Aiden exited the front_yard. -6 Noah entered the playroom. -""" -question = "7 Where was the grapefruit at the beginning?" -character = "Aria" -``` - -We load `Mistral-7B-Instruct-v0.3`, create the prompt using the template we defined earlier, and generate a structured response. As a reminder, the goal of the first call is to get all the events a character, `Aria`, knows about. - -```python -# Load an LLM from Hugging Face -MODEL_NAME = "mistral-community/Mistral-7B-Instruct-v0.3" -model = outlines.models.transformers(MODEL_NAME, device="cuda") - -perspective_prompt = perspective_taking(story=story, character=character) - -# Call Mistral 7B with the first prompt -generator = outlines.generate.json(model, PerspectiveTaking) -perspective = generator(perspective_prompt) - -print(perspective.model_dump()) -# {'character': 'Aria', 'events': ['1 Aria entered the front_yard.', '3 The grapefruit is in the green_bucket.', '4 Aria moved the grapefruit to the blue_container.']} -``` - -Not bad! We will now generate the second prompt with those events. - -```python -sim_prompt = simulation(events=perspective.events, name=character, question=question) - -# Call Mistral 7B with the second prompt -generator = outlines.generate.json(model, Simulation) -result = generator(sim_prompt) - -print(result.model_dump()) -# {'answer': 'green_bucket'} -``` - -And this is it! SimToM could be useful in agentic workflows, where agents must act based on what they know, not all available information. One caveat of SimToM is that the perspective-taking step may remove important information, leading to wrong results. As the authors note in their paper, it can feature as a simple and effective baseline for evaluating LLMs on Theory of Mind reasoning tasks. 
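
To reuse this two-step flow, both calls can be wrapped in a single helper. Below is a minimal sketch, assuming the `model`, the prompt functions and the Pydantic classes defined above are in scope; `simtom` is a name introduced here for illustration:

```python
def simtom(story: str, character: str, question: str) -> str:
    # Stage 1: perspective-taking -- keep only the events the character knows about
    perspective_generator = outlines.generate.json(model, PerspectiveTaking)
    perspective = perspective_generator(
        perspective_taking(story=story, character=character)
    )

    # Stage 2: question-answering from the character's filtered point of view
    answer_generator = outlines.generate.json(model, Simulation)
    result = answer_generator(
        simulation(events=perspective.events, name=character, question=question)
    )
    return result.answer


# Reusing the ToMI example from above:
# simtom(story, character, question)  # expected to return something like 'green_bucket'
```

In a longer-running program you would typically build the two generators once and reuse them across calls rather than recreating them inside the helper.
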
diff --git a/docs/cookbook/structured_generation_workflow.md b/docs/cookbook/structured_generation_workflow.md deleted file mode 100644 index b9d7c28a..00000000 --- a/docs/cookbook/structured_generation_workflow.md +++ /dev/null @@ -1,215 +0,0 @@ -# Structured Generation Workflow: Generating Synthetic Phone Numbers - -This is a condensed version of [Coding for Structured Generation with LLMs](https://blog.dottxt.co/coding-for-structured-generation.html). - -For this example we're going to be building an LLM program to generate **synthetic data** in the form of realistic looking phone numbers for Washington State. Using an LLM for this task *is a bit overkill* since we could just as easily accomplish this -with a tool like [Faker](https://fakerjs.dev/), but this example still serves as a useful way to demonstrate a workflow for using structured generation. - -## Unstructured approach - -Before diving into how to use structure generation for this task let's start with an unstructured example. We begin by loading our model: - -```python -import outlines - -model_name = 'microsoft/Phi-3-mini-4k-instruct' -model = outlines.models.transformers(model_name) -``` - -Next we need a prompt for this model. Since we're focusing on structured generation, we won't be engaging in any form of "prompt hacking" and will be leaving this prompt untouched for the rest of this example. - -```python -tokenizer = AutoTokenizer.from_pretrained(model_name) - -messages_phone = [ - {"role": "user", "content": """ - Please generate a realistic phone number for Washington State in the following format - - (555) 555-5555 - - """} -] - -# This allows us to properly format our prompt for -# Phi-3 Mini's 'Instruct' interface. -prompt_phone = tokenizer.apply_chat_template(messages_phone, tokenize=False) -``` - -With our prompt ready we can now generate 10 example phone numbers - -```python -phone_generator_unstruct = outlines.generate.text(model) -for _ in range(10): - print(phone_generator_unstruct(prompt_phone,max_tokens=12)) -``` - -> I'd be happy to help you generate a realistic phone\ -I cannot generate a real phone number as I'm just\ -I'm an AI and don't have the ability\ -Sure! Here is a randomly generated phone number in the format\ -Here's a phone number that fits the format for a\ -In Washington State, phone numbers typically have a three-dig\ -Here are a few examples of phone numbers that could be considered\ -I'd be happy to help generate a realistic phone number\ -I'd be happy to help you generate a random phone\ -Based on the format you provided, a realistic phone number for\ - -As we can see, none of these outputs are even phone numbers! - -Let's see if we can improve this using structured generation. - -## The Structured Generation Workflow - -In order to solve this problem we're going to introduce a *Structured Generation Workflow* outlined in this image: - -!["Visual of Structured Generation Workflow"](./images/coding_structure_diagram.png) - -Let's step through this: - -### Real example - -We start with a real example phone number, in this case for the Seattle Public Library, that we can use to verify the structure we are creating. - -```python -phone_number = "(206) 386-4636" -``` - -For a simple example like this, we'll just be using a single phone number, for more complex examples it can be helpful to have more examples. - -### Draft Structure - -The next step in the process is for use to define a simple regex that we feel correctly models our real data. 
- -```python -phone_regex_1 = r'\([0-9]{3}\) [0-9]{3}-[0-9]{4}' -``` - -Next we need to validate this regex against our real data. - -### Validate by matching examples - -Whenever writing non-trivial code with structured generation it is *essential* that you first validate the code against your real data example(s). - -We'll start with a simple method of validation: just checking that our regex matches the data. - -``` -import re -re.match(phone_regex_1, phone_number) - -# - -``` - -Now that we have a match, we can move on to generating structured output! - -### Generate Structure - -We're ready to see if structured generation can make an improvement over our initial unstructured approach: - -```python -phone_generator_v1 = outlines.generate.regex(model, phone_regex_1) -for _ in range(10): - print(phone_generator_v1(prompt_phone)) -``` -> (206) 555-1234\ -(206) 555-1234\ -(206) 555-1234\ -(206) 555-1234\ -(206) 555-1234\ -(206) 555-1234\ -(206) 123-4567\ -(206) 555-1234\ -(206) 555-1234\ -(206) 555-1234 - -At least we have phone numbers! But I think we can do better! - -### Inspect output - -In this case the model *did* create phone numbers and, impressively, got the area code correct. So using structured generation did improve things. However these numbers are pretty boring. Let's improve that structure! - -## Iteration - -We've walked through the loop once, so we can go quickly now through each iteration. - -We start by improving our structure: - -```python -phone_regex_2 = r'\([0-9]{3}\) [2-46-9]{3}-[02-9]{4}' -``` - -Before rushing to another round of generation, let's validate this new regex. We'll add just a bit more sophistication over our last check: - -```python -re.match(phone_regex_2, phone_number)[0] == phone_number -# True -``` -Now that we've validated, let's generate with this new regex! - -```python -phone_generator_v2 = outlines.generate.regex(model, - phone_regex_2) -for _ in range(10): - print(phone_generator_v2(prompt_phone)) -``` - -> (206) 867-5309\ -(206) 666-7777\ -(206) 444-3333\ -(206) 444-3333\ -(206) 943-2222\ -(206) 323-6789\ -(206) 444-3333\ -(206) 867-5309\ -(206) 466-2255\ -(206) 222-3333 - -Better, but I don't like those repeated sequences. Like good software developers, let's iterate again! - -## Reiteration - with debugging - -Here's a fancier regex that should give us more interesting results: - -```python -phone_regex_3_error = r'\([0-9]{3}\) [2-4][7-9][4-6]-[3-6][2-8][1-4]' -``` - -This looks good to me, but there's a subtle bug, that's why we *always* need to validate our structure against real data. This time we'll make our validator do a bit more work to verify the correct string is matched: - -```python -if not re.match(phone_regex_3_error, phone_number): - print("Regex fails match") -else: - matched_string = re.match(phone_regex_3_error, phone_number)[0] - if matched_string == phone_number: - print("Successful match") - else: - print(f"Error {matched_string} != {phone_number}") -``` -This prints out: -> Error (206) 386-463 != (206) 386-4636 - -Ah! We were missing the last digit, let's fix that and regenerate: - -```python -phone_regex_3_fixed = r'\([0-9]{3}\) [2-4][7-9][4-6]-[3-6][2-8][1-4][6-9]' -phone_generator_v3 = outlines.generate.regex(model, - phone_regex_3_fixed) -for _ in range(10): - print(phone_generator_v3(prompt_phone)) -``` - ->(206) 494-3216\ -(206) 374-6218\ -(206) 494-3337\ -(206) 476-3216\ -(206) 484-3548\ -(206) 495-3218\ -(206) 494-5517\ -(206) 375-4636\ -(206) 384-6216\ -(206) 385-6218 - -Much better! 
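
The draft-then-validate loop we just went through repeats at every iteration, so it can help to wrap the check in a small helper. This is a sketch using only the standard library; `validate_regex` is a name introduced here for illustration, not part of Outlines:

```python
import re

def validate_regex(pattern: str, examples: list[str]) -> None:
    """Check that every real example is matched in full by the draft pattern."""
    for example in examples:
        if re.fullmatch(pattern, example):
            print(f"Successful match: {example!r}")
        else:
            print(f"Regex fails to match {example!r}")

validate_regex(phone_regex_3_fixed, [phone_number])
# Successful match: '(206) 386-4636'
```
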
- -Now you've seen a quick example of the structured generation workflow that can be used at the basis for building and iteration on much larger structured generation tasks! diff --git a/docs/index.md b/docs/index.md deleted file mode 100644 index b19ebd57..00000000 --- a/docs/index.md +++ /dev/null @@ -1,26 +0,0 @@ ---- -title: Outlines -hide: - - navigation - - toc - - feedback ---- - -# - -
![Image title](assets/images/logo.png){ width="300" }

Generate text with LLMs

Robust prompting & (structured) text generation

[:fontawesome-solid-bolt: Get started](welcome.md){ .md-button .md-button--primary }
[:fontawesome-brands-discord: Join the Community](https://discord.gg/ZxBxyWmW5n){ .md-button }

```bash
pip install outlines
```
diff --git a/docs/installation.md b/docs/installation.md deleted file mode 100644 index 1017b627..00000000 --- a/docs/installation.md +++ /dev/null @@ -1,38 +0,0 @@ ---- -title: installation ---- - -# Installation - -You can install Outlines with `pip`: - -```python -pip install outlines -``` - -Outlines supports OpenAI, transformers, Mamba, llama.cpp and exllama2 but **you will need to install them manually**: - -```python -pip install openai -pip install transformers datasets accelerate torch -pip install llama-cpp-python -pip install exllamav2 transformers torch -pip install mamba_ssm transformers torch -pip install vllm -``` - -If you encounter any problem using Outlines with these libraries, take a look at their installation instructions. The installation of `openai` and `transformers` should be straightforward, but other libraries have specific hardware requirements. - -## Bleeding edge - -You can install the latest version of Outlines on the repository's `main` branch: - -```python -pip install git+https://github.com/outlines-dev/outlines.git@main -``` - -This can be useful, for instance, when a fix has been merged but not yet released. - -## Installing for development - -See the [contributing documentation](community/contribute.md) for instructions on how to install Outlines for development. diff --git a/docs/licence.md b/docs/licence.md deleted file mode 100644 index a74661fa..00000000 --- a/docs/licence.md +++ /dev/null @@ -1,34 +0,0 @@ ---- -title: Licence ---- - -# Licence and citations - -Outlines is licenced under the Apache 2.0 licence. To comply with the licence you need to add the following notice at the top every file that uses part of Outlines' code: - -``` -Copyright 2023- The Outlines developers - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-``` - -If you use Outlines in your work you can use the following citation: - -``` -@article{willard2023efficient, - title={Efficient Guided Generation for LLMs}, - author={Willard, Brandon T and Louf, R{\'e}mi}, - journal={arXiv preprint arXiv:2307.09702}, - year={2023} -} -``` diff --git a/docs/logos/amazon.png b/docs/logos/amazon.png deleted file mode 100644 index 85e1dc81..00000000 Binary files a/docs/logos/amazon.png and /dev/null differ diff --git a/docs/logos/apple.png b/docs/logos/apple.png deleted file mode 100644 index 345f6277..00000000 Binary files a/docs/logos/apple.png and /dev/null differ diff --git a/docs/logos/best_buy.png b/docs/logos/best_buy.png deleted file mode 100644 index a273b364..00000000 Binary files a/docs/logos/best_buy.png and /dev/null differ diff --git a/docs/logos/canoe.png b/docs/logos/canoe.png deleted file mode 100644 index 3876a3fe..00000000 Binary files a/docs/logos/canoe.png and /dev/null differ diff --git a/docs/logos/cisco.png b/docs/logos/cisco.png deleted file mode 100644 index 31422734..00000000 Binary files a/docs/logos/cisco.png and /dev/null differ diff --git a/docs/logos/dassault_systems.png b/docs/logos/dassault_systems.png deleted file mode 100644 index 1ed0e876..00000000 Binary files a/docs/logos/dassault_systems.png and /dev/null differ diff --git a/docs/logos/databricks.png b/docs/logos/databricks.png deleted file mode 100644 index 15f64018..00000000 Binary files a/docs/logos/databricks.png and /dev/null differ diff --git a/docs/logos/datadog.png b/docs/logos/datadog.png deleted file mode 100644 index dc758f42..00000000 Binary files a/docs/logos/datadog.png and /dev/null differ diff --git a/docs/logos/dbt_labs.png b/docs/logos/dbt_labs.png deleted file mode 100644 index 3e52f470..00000000 Binary files a/docs/logos/dbt_labs.png and /dev/null differ diff --git a/docs/logos/gladia.jpg b/docs/logos/gladia.jpg deleted file mode 100644 index 4c5e505e..00000000 Binary files a/docs/logos/gladia.jpg and /dev/null differ diff --git a/docs/logos/harvard.png b/docs/logos/harvard.png deleted file mode 100644 index f6ec08d7..00000000 Binary files a/docs/logos/harvard.png and /dev/null differ diff --git a/docs/logos/hf.png b/docs/logos/hf.png deleted file mode 100644 index 9ba7a6f2..00000000 Binary files a/docs/logos/hf.png and /dev/null differ diff --git a/docs/logos/johns_hopkins.png b/docs/logos/johns_hopkins.png deleted file mode 100644 index d1ba7012..00000000 Binary files a/docs/logos/johns_hopkins.png and /dev/null differ diff --git a/docs/logos/meta.png b/docs/logos/meta.png deleted file mode 100644 index 33e9e588..00000000 Binary files a/docs/logos/meta.png and /dev/null differ diff --git a/docs/logos/mit.png b/docs/logos/mit.png deleted file mode 100644 index 60c5528e..00000000 Binary files a/docs/logos/mit.png and /dev/null differ diff --git a/docs/logos/mount_sinai.png b/docs/logos/mount_sinai.png deleted file mode 100644 index f18af068..00000000 Binary files a/docs/logos/mount_sinai.png and /dev/null differ diff --git a/docs/logos/nvidia.png b/docs/logos/nvidia.png deleted file mode 100644 index 22f9e9ed..00000000 Binary files a/docs/logos/nvidia.png and /dev/null differ diff --git a/docs/logos/nyu.png b/docs/logos/nyu.png deleted file mode 100644 index d93d0fcb..00000000 Binary files a/docs/logos/nyu.png and /dev/null differ diff --git a/docs/logos/safran.png b/docs/logos/safran.png deleted file mode 100644 index ece23c8e..00000000 Binary files a/docs/logos/safran.png and /dev/null differ diff --git a/docs/logos/salesforce.png 
b/docs/logos/salesforce.png deleted file mode 100644 index 031f942b..00000000 Binary files a/docs/logos/salesforce.png and /dev/null differ diff --git a/docs/logos/shopify.png b/docs/logos/shopify.png deleted file mode 100644 index a4f9cb57..00000000 Binary files a/docs/logos/shopify.png and /dev/null differ diff --git a/docs/logos/smithsonian.png b/docs/logos/smithsonian.png deleted file mode 100644 index 0cd759ab..00000000 Binary files a/docs/logos/smithsonian.png and /dev/null differ diff --git a/docs/logos/tinder.png b/docs/logos/tinder.png deleted file mode 100644 index 28d0d0dd..00000000 Binary files a/docs/logos/tinder.png and /dev/null differ diff --git a/docs/logos/upenn.png b/docs/logos/upenn.png deleted file mode 100644 index 36fb6218..00000000 Binary files a/docs/logos/upenn.png and /dev/null differ diff --git a/docs/overrides/index.html b/docs/overrides/index.html deleted file mode 100644 index 74a4987f..00000000 --- a/docs/overrides/index.html +++ /dev/null @@ -1,11 +0,0 @@ -{% extends "base.html" %} - -{% block announce %} - For updates follow @remilouf on - - - Twitter - -{% endblock %} diff --git a/docs/overrides/main.html b/docs/overrides/main.html deleted file mode 100644 index b4183d71..00000000 --- a/docs/overrides/main.html +++ /dev/null @@ -1,22 +0,0 @@ -{% extends "base.html" %} - -{% block announce %} - For updates follow @dottxtai on - - - Twitter - - and - - {% include ".icons/fontawesome/solid/star.svg" %} - - the repo on - - - {% include ".icons/fontawesome/brands/github.svg" %} - - Github - -{% endblock %} diff --git a/docs/quickstart.md b/docs/quickstart.md deleted file mode 100644 index 2e1f9a6b..00000000 --- a/docs/quickstart.md +++ /dev/null @@ -1,228 +0,0 @@ ---- -title: Quickstart ---- - -# Quickstart - -After [installing Outlines](installation.md), the fastest way to get to up to speed with the library is to get acquainted with its few core elements. We advise you to take a quick look at this page to see everything Outlines has to offer before diving in the [documentation](reference/index.md). - - -## Core elements - -### Models - -The first step when writing a program with Outlines is to initialize a model. Weights will be loaded on the device at this step: - -```python -import outlines - -model = outlines.models.transformers( - "microsoft/Phi-3-mini-4k-instruct", - device="cuda" # optional device argument, default is cpu -) -``` - -Outlines supports a wide variety of inference engines and model weight types. More details on different models can be found in the [Outlines Models](./reference/models/models.md) documentation page. - -### Generation - -Once the model is initialized you can build an `outlines.generate` generator. This generator can be called with a prompt directly. - -([Outlines Structured Generation Full Documentation](./reference/generation/generation.md)) - -=== "Text" - - ```python - generator = outlines.generate.text(model) - - result = generator("Question: What's 2+2? Answer:", max_tokens=100) - print(result) - # The answer is 4 - - # Outlines also supports streaming output - stream = generator.stream("What's 2+2?", max_tokens=4) - for i in range(5): - token = next(stream) - print(repr(token)) - # '2' - # '+' - # '2' - # ' equals' - # '4' - ``` - -=== "Structured" - - Along with typical language model generation behavior via, `outlines.generate.text`, Outlines supports structured generation, which guarantees the tokens generated by the model will follow a predefined structure. 
Structures can be defined by a regex pattern, JSON schema, python object type, or a Lark grammar defining a parsable language such as SQL or Python. - - Example: using pydantic to enforce a JSON schema - - ```python - from enum import Enum - from pydantic import BaseModel, constr, conint - - class Character(BaseModel): - name: constr(max_length=10) - age: conint(gt=18, lt=99) - armor: (Enum('Armor', {'leather': 'leather', 'chainmail': 'chainmail', 'plate': 'plate'})) - strength: conint(gt=1, lt=100) - - generator = outlines.generate.json(model, Character) - - character = generator( - "Generate a new character for my awesome game: " - + "name, age (between 1 and 99), armor and strength. " - ) - print(character) - # Character(name='Zara', age=25, armor=, strength=85) - ``` - -## [Deploy using vLLM and FastAPI](./reference/serve/vllm.md) - -Outlines can be deployed as a LLM service using [vLLM][vllm]{:target="_blank"} and [FastAPI][fastapi]{:target="_blank"}. The server supports asynchronous processing of incoming requests, and benefits from the performance of vLLM. - -First start the server: - -```python -python -m outlines.serve.serve --model="microsoft/Phi-3-mini-4k-instruct" -``` - -Or you can start the server with Outlines' official Docker image: - -```bash -docker run -p 8000:8000 outlinesdev/outlines --model="microsoft/Phi-3-mini-4k-instruct" -``` - -This will by default start a server at `http://127.0.0.1:8000` (check what the console says, though). Without the `--model` argument set, the OPT-125M model is used. - - -You can then query the model in shell by passing a prompt and a [JSON Schema][jsonschema]{:target="_blank"} specification for the structure of the output: - -```bash -curl http://127.0.0.1:8000/generate \ - -d '{ - "prompt": "Question: What is a language model? Answer:", - "schema": {"type": "string"} - }' -``` - -Or use the [requests][requests]{:target="_blank"} library from another python program. You can read the [vLLM documentation][vllm]{:target="_blank"} for more details. - -## Utilities - -### [Prompt templates](./reference/prompting.md) - -Prompting can lead to messy code. Outlines' prompt functions are python functions that contain a template for the prompt in their docstring. We use a powerful templating language to allow you to loop over lists, dictionaries, add conditionals, etc. directly from the prompt. When called, a prompt function returns the rendered template: - -```python -import outlines - -@outlines.prompt -def few_shots(instructions, examples, question): - """{{ instructions }} - - Examples - -------- - - {% for example in examples %} - Q: {{ example.question }} - A: {{ example.answer }} - - {% endfor %} - Question - -------- - - Q: {{ question }} - A: - """ - -instructions = "Please answer the following question following the examples" -examples = [ - {"question": "2+2=?", "answer":4}, - {"question": "3+3=?", "answer":6} -] -question = "4+4 = ?" - -prompt = few_shots(instructions, examples, question) -print(prompt) -# Please answer the following question following the examples - -# Examples -# -------- - -# Q: 2+2=? -# A: 4 - -# Q: 3+3=? -# A: 6 - -# Question -# -------- - -# Q: 4+4 = ? -# A: -``` - -### Outlines functions - -Once you are done experimenting with a prompt and an output structure, it is useful to be able to encapsulate all of these in a single function that can be called from other parts of the program. 
This is what `outlines.Function` allows you to do: - -=== "function.py" - - ```python - from pydantic import BaseModel - - import outlines - - - @outlines.prompt - def tell_a_joke(topic): - """Tell me a joke about {{ topic }}.""" - - class Joke(BaseModel): - setup: str - punchline: str - - generate_joke = outlines.Function( - tell_a_joke, - Joke, - "microsoft/Phi-3-mini-4k-instruct" - ) - ``` - -=== "Call a function" - - ```python - from .function import generate_joke - - response = generate_joke("baseball") - - # haha - # Joke(setup='Why was the baseball in a bad mood?', punchline='Because it got hit around a lot.') - ``` - -=== "Call a function stored on GitHub" - - You can load a function that is stored on a repository on GitHub directly from Outlines. Say `Someone` stores a function in `joke.py` at the root of the `TheirRepo` repository: - - ```python - import outlines - - joke = outlines.Function.from_github("Someone/TheirRepo/joke") - response = joke("baseball") - ``` - It make it easier for the community to collaborate on the infinite number of use cases enabled by these models! - - -## Going further - -If you need more inspiration you can take a look at the [cookbook](cookbook/index.md) or watch [Remi Louf's AI Engineer World’s Fair Presentation on Outlines](https://www.youtube.com/live/R0X7mPagRiE?t=775s). If you have any question, or requests for documentation please reach out to us on [GitHub](https://github.com/outlines-dev/outlines/discussions), [Twitter](https://twitter.com/remilouf) or [Discord](https://discord.gg/UppQmhEpe8). - - -[pydantic]: https://docs.pydantic.dev/latest -[jsonschema]: https://json-schema.org/ -[fastapi]: https://fastapi.tiangolo.com/ -[cfg]: https://en.wikipedia.org/wiki/Context-free_grammar -[ebnf]: https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form -[requests]: https://requests.readthedocs.io/en/latest/ -[vllm]: https://docs.vllm.ai/en/latest/index.html diff --git a/docs/reference/functions.md b/docs/reference/functions.md deleted file mode 100644 index d29213a8..00000000 --- a/docs/reference/functions.md +++ /dev/null @@ -1 +0,0 @@ -# Outlines functions diff --git a/docs/reference/generation/cfg.md b/docs/reference/generation/cfg.md deleted file mode 100644 index 4f0285c1..00000000 --- a/docs/reference/generation/cfg.md +++ /dev/null @@ -1,149 +0,0 @@ -# Grammar-structured generation - -You can pass any context-free grammar in the EBNF format and Outlines will generate an output that is valid to this grammar: - -```python -from outlines import models, generate - -arithmetic_grammar = """ - ?start: expression - - ?expression: term (("+" | "-") term)* - - ?term: factor (("*" | "/") factor)* - - ?factor: NUMBER - | "-" factor - | "(" expression ")" - - %import common.NUMBER -""" - -model = models.transformers("WizardLM/WizardMath-7B-V1.1") -generator = generate.cfg(model, arithmetic_grammar) -sequence = generator( - "Alice had 4 apples and Bob ate 2. " - + "Write an expression for Alice's apples:" -) - -print(sequence) -# (8-2) -``` - -!!! Note "Performance" - - The implementation of grammar-structured generation in Outlines is very naive. This does not reflect the performance of [.txt](https://dottxt.co)'s product, where we made grammar-structured generation as fast as regex-structured generation. - - -## Ready-to-use grammars - -Outlines contains a (small) library of grammars that can be imported and use directly. 
We can rewrite the previous example as: - -```python -from outlines import models, generate - -arithmetic_grammar = outlines.grammars.arithmetic - -model = models.transformers("WizardLM/WizardMath-7B-V1.1") -generator = generate.cfg(model, arithmetic_grammar) -sequence = generator( - "Alice had 4 apples and Bob ate 2. " - + "Write an expression for Alice's apples:" -) - -print(sequence) -# (8-2) -``` - -The following grammars are currently available: - -- Arithmetic grammar via `outlines.grammars.arithmetic` -- JSON grammar via `outlines.grammars.json` - -If you would like more grammars to be added to the repository, please open an [issue](https://github.com/outlines-dev/outlines/issues) or a [pull request](https://github.com/outlines-dev/outlines/pulls). - - -## Grammar guide - -A grammar is a list of rules and terminals that define a *language*: - -- Terminals define the vocabulary of the language; they may be a string, regular expression or combination of these and other terminals. -- Rules define the structure of that language; they are a list of terminals and rules. - -Outlines uses the [Lark library](https://github.com/lark-parser/lark) to make Large Language Models generate text in a language of a grammar, it thus uses grammars defined in a format that Lark understands, based on the [EBNF syntax](https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form). Read the [Lark documentation](https://lark-parser.readthedocs.io/en/stable/grammar.html) for more details on grammar, the following is a small primer that should help get your started. - -In the following we will define a [LOGO-like toy language](https://github.com/lark-parser/lark/blob/master/examples/turtle_dsl.py) for python's [turtle](https://docs.python.org/3/library/turtle.html) library. - -### Terminals - -A turtle can take 4 different `MOVEMENT` move instructions: forward (`f`), backward (`b`), turn right (`r`) and turn left (`l`). It can take `NUMBER` number of steps in each direction, and draw lines in a specified `COLOR`. These define the vocabulary of our language: - -```ebnf -MOVEMENT: "f"|"b"|"r"|"l" -COLOR: LETTER+ - -%import common.LETTER -%import common.INT -> NUMBER -%import common.WS -%ignore WS -``` - -The lines that start with `%` are called "directive". They allow to import pre-defined terminals and rules, such as `LETTER` and `NUMBER`. `LETTER+` is a regular expressions, and indicates that a `COLOR` is made of at least one `LETTER`. The last two lines specify that we will ignore white spaces (`WS`) in the grammar. - -### Rules - -We now need to define our rules, by decomposing instructions we can send to the turtle via our python program. At each line of the program, we can either choose a direction and execute a given number of steps, change the color used to draw the pattern. We can also choose to start filling, make a series of moves, and stop filling. We can also choose to repeat a series of move. - -We can easily write the first two rules: - -```ebnf -instruction: MOVEMENT NUMBER -> movement - | "c" COLOR [COLOR] -> change_color -``` - -where `movement` and `change_color` represent aliases for the rules. A whitespace implied concatenating the elements, and `|` choosing either of the elements. The `fill` and `repeat` rules are slightly more complex, since they apply to a code block, which is made of instructions. 
We thus define a new `code_block` rule that refers to `instruction` and finish implementing our rules: - -```ebnf -instruction: MOVEMENT NUMBER -> movement - | "c" COLOR [COLOR] -> change_color - | "fill" code_block -> fill - | "repeat" NUMBER code_block -> repeat - -code_block: "{" instruction "}" -``` - -We can now write the full grammar: - -```ebnf -start: instruction+ - -instruction: MOVEMENT NUMBER -> movement - | "c" COLOR [COLOR] -> change_color - | "fill" code_block -> fill - | "repeat" NUMBER code_block -> repeat - -code_block: "{" instruction+ "}" - -MOVEMENT: "f"|"b"|"l"|"r" -COLOR: LETTER+ - -%import common.LETTER -%import common.INT -> NUMBER -%import common.WS -%ignore WS -``` - -Notice the `start` rule, which defines the starting point of the grammar, i.e. the rule with which a program must start. This full grammars allows us to parse programs such as: - -```python -c red yellow - fill { repeat 36 { - f200 l170 - }} -``` - -The result of the parse, the parse tree, can then easily be translated into a Python program that uses the `turtle` library to draw a pattern. - -### Next steps - -This section provides a very brief overview of grammars and their possibilities. Check out the [Lark documentation](https://lark-parser.readthedocs.io/en/stable/index.html) for more thorough explanations and more examples. diff --git a/docs/reference/generation/choices.md b/docs/reference/generation/choices.md deleted file mode 100644 index aed5af5a..00000000 --- a/docs/reference/generation/choices.md +++ /dev/null @@ -1,16 +0,0 @@ -# Multiple choices - -Oultines allows you to make sure the generated text is chosen between different options: - -```python -from outlines import models, generate - -model = models.transformers("microsoft/Phi-3-mini-4k-instruct") -generator = generate.choice(model, ["skirt", "dress", "pen", "jacket"]) -answer = generator("Pick the odd word out: skirt, dress, pen, jacket") - -``` - -!!! Note "Performance" - - `generation.choice` computes an index that helps Outlines guide generation. This can take some time, but only needs to be done once. If you want to generate from the same list of choices several times make sure that you only call `generate.choice` once. diff --git a/docs/reference/generation/custom_fsm_ops.md b/docs/reference/generation/custom_fsm_ops.md deleted file mode 100644 index 5c4be96f..00000000 --- a/docs/reference/generation/custom_fsm_ops.md +++ /dev/null @@ -1,37 +0,0 @@ -# Custom FSM Operations - -Outlines is fast because it compiles regular expressions into an index ahead of inference. To do so we use the equivalence between regular expressions and Finite State Machines (FSMs), and the library [interegular](https://github.com/MegaIng/interegular) to perform the translation. - -Alternatively, one can pass a FSM built using `integular` directly to structure the generation. - -## Example - -### Using the `difference` operation - -In the following example we build a fsm which recognizes only the strings valid to the first regular expression but not the second. 
In particular, it will prevent the words "pink" and "elephant" from being generated: - -```python -import interegular -from outlines import models, generate - - -list_of_strings_pattern = """\["[^"\s]*"(?:,"[^"\s]*")*\]""" -pink_elephant_pattern = """.*(pink|elephant).*""" - -list_of_strings_fsm = interegular.parse_pattern(list_of_strings_pattern).to_fsm() -pink_elephant_fsm = interegular.parse_pattern(pink_elephant_pattern).to_fsm() - -difference_fsm = list_of_strings_fsm - pink_elephant_fsm - -difference_fsm_fsm.accepts('["a","pink","elephant"]') -# False -difference_fsm_fsm.accepts('["a","blue","donkey"]') -# True - - -model = models.transformers("microsoft/Phi-3-mini-4k-instruct") -generator = generate.fsm(model, difference_fsm) -response = generator("Don't talk about pink elephants") -``` - -To see the other operations available, consult [interegular's documentation](https://github.com/MegaIng/interegular/blob/master/interegular/fsm.py). diff --git a/docs/reference/generation/format.md b/docs/reference/generation/format.md deleted file mode 100644 index 749baa8b..00000000 --- a/docs/reference/generation/format.md +++ /dev/null @@ -1,23 +0,0 @@ -# Type constraints - -We can ask completions to be restricted to valid python types: - -```python -from outlines import models, generate - -model = models.transformers("microsoft/Phi-3-mini-4k-instruct") -generator = generate.format(model, int) -answer = generator("When I was 6 my sister was half my age. Now I’m 70 how old is my sister?") -print(answer) -# 67 -``` - -The following types are currently available: - -- int -- float -- bool -- datetime.date -- datetime.time -- datetime.datetime -- We also provide [custom types](types.md) diff --git a/docs/reference/generation/generation.md b/docs/reference/generation/generation.md deleted file mode 100644 index 0c090f8a..00000000 --- a/docs/reference/generation/generation.md +++ /dev/null @@ -1,216 +0,0 @@ ---- -title: Generation ---- - -# Generation - -Once an [Outlines model](../models) is constructed you can use `outlines.generate` to generate text. Standard LLM generation is possible via `outlines.generate.text`, along with a variety of structured generation methods described below. (For a detailed technical explanation of how structured generation works, you may review the [Structured Generation Explanation](./structured_generation_explanation.md) page) - -Before generating text, you must construct an `outlines.model`. Example: - -```python -import outlines - -model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct", device="cuda") -``` -### Text generator - -```python -generator = outlines.generate.text(model) - -result = generator("Question: What's 2+2? Answer:", max_tokens=100) -print(result) -# The answer is 4 - -# Outlines also supports streaming output -stream = generator.stream("What's 2+2?", max_tokens=4) -for i in range(5): - token = next(stream) - print(repr(token)) -# '2' -# '+' -# '2' -# ' equals' -# '4' -``` - -### [Multi-label classification](./choices.md) - -Outlines allows you to do multi-label classification by guiding the model so it can only output either of the specified choices: - -```python -import outlines - -model = outlines.models.transformers("microsoft/Phi-3-mini-128k-instruct") -generator = outlines.generate.choice(model, ["Blue", "Red", "Yellow"]) - -color = generator("What is the closest color to Indigo? 
") -print(color) -# Blue -``` - -### [JSON-structured generation](./json.md) - -Outlines can guide models so that they output valid JSON **100%** of the time. You can either specify the structure using [Pydantic][pydantic]{:target="_blank"} or a string that contains a [JSON Schema][jsonschema]{:target="_blank"}: - -=== "Pydantic" - - ```python - from enum import Enum - from pydantic import BaseModel, constr, conint - - import outlines - - class Armor(str, Enum): - leather = "leather" - chainmail = "chainmail" - plate = "plate" - - - class Character(BaseModel): - name: constr(max_length=10) - age: conint(gt=18, lt=99) - armor: Armor - strength: conint(gt=1, lt=100) - - model = outlines.models.transformers("microsoft/Phi-3-mini-128k-instruct") - generator = outlines.generate.json(model, Character) - - character = generator( - "Generate a new character for my awesome game: " - + "name, age (between 1 and 99), armor and strength. " - ) - print(character) - # name='Orla' age=21 armor= strength=8 - ``` - -=== "JSON Schema" - - ```python - import outlines - - schema = """{ - "$defs": { - "Armor": { - "enum": ["leather", "chainmail", "plate"], - "title": "Armor", - "type": "string" - } - }, - "properties": { - "name": {"maxLength": 10, "title": "Name", "type": "string"}, - "age": {"title": "Age", "type": "integer"}, - "armor": {"$ref": "#/$defs/Armor"}, - "strength": {"title": "Strength", "type": "integer"}\ - }, - "required": ["name", "age", "armor", "strength"], - "title": "Character", - "type": "object" - }""" - - model = outlines.models.transformers("microsoft/Phi-3-mini-128k-instruct") - generator = outlines.generate.json(model, schema) - character = generator( - "Generate a new character for my awesome game: " - + "name, age (between 1 and 99), armor and strength. " - ) - print(character) - # {'name': 'Yuki', 'age': 24, 'armor': 'plate', 'strength': 3} - ``` - -!!! Note - - We advise you to constrain the length of the strings fields when first testing your schema, especially with small models. - -### [Grammar-structured generation](./cfg.md) - -Outlines also allows to generate text that is valid to any [context-free grammar][cfg]{:target="_blank"} (CFG) in the [EBNF format][ebnf]{:target="_blank"}. Grammars can be intimidating, but they are a very powerful tool! Indeed, they determine the syntax of every programming language, valid chess moves, molecule structure, can help with procedural graphics generation, etc. - -Here we show a simple example of a grammar that defines arithmetic operations: - -```python -from outlines import models, generate - -arithmetic_grammar = """ - ?start: sum - - ?sum: product - | sum "+" product -> add - | sum "-" product -> sub - - ?product: atom - | product "*" atom -> mul - | product "/" atom -> div - - ?atom: NUMBER -> number - | "-" atom -> neg - | "(" sum ")" - - %import common.NUMBER - %import common.WS_INLINE - - %ignore WS_INLINE -""" - -model = models.transformers("microsoft/Phi-3-mini-128k-instruct") -generator = generate.cfg(model, arithmetic_grammar, max_tokens=100) - -result = generator("Question: How can you write 5*5 using addition?\nAnswer:") -print(result) -# 5+5+5+5+5 -``` - - -EBNF grammars can be cumbersome to write. 
This is why Outlines provides grammar definitions in the `outlines.grammars.` module - -```python -from outlines import models, generate, grammars - -model = models.transformers("microsoft/Phi-3-mini-128k-instruct") -generator = generate.cfg(model, grammars.arithmetic, max_tokens=100) - -result = generator("Question: How can you write 5*5 using addition?\nAnswer:") -print(result) -# 5+5+5+5+5 -``` - -The available grammars are listed [here](https://github.com/outlines-dev/outlines/tree/main/outlines/grammars). - - -### [Regex-structured generation](./regex.md) - -Slightly simpler, but no less useful, Outlines can generate text that is in the language of a [regular expression](https://www.regular-expressions.info/tutorial.html). For instance to force the model to generate IP addresses: - -```python -from outlines import models, generate - -model = models.transformers("microsoft/Phi-3-mini-128k-instruct") - -regex_str = r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)" -generator = generate.regex(model, regex_str) - -result = generator("What is the IP address of localhost?\nIP: ") -print(result) -# 127.0.0.100 -``` - -### [Generate a given Python type](./types.md) - -We provide a shortcut to regex-structured generation for simple use cases. Pass a Python type to the `outlines.generate.format` function and the LLM will output text that matches this type: - -```python -from outlines import models, generate - -model = models.transformers("microsoft/Phi-3-mini-128k-instruct") -generator = generate.format(model, int) - -result = generator("What is 2+2?") -print(result) -# 4 -``` - - -[jsonschema]: https://json-schema.org/learn/getting-started-step-by-step -[pydantic]: https://docs.pydantic.dev/latest -[cfg]: https://en.wikipedia.org/wiki/Context-free_grammar -[ebnf]: https://en.wikipedia.org/wiki/Extended_Backus%E2%80%93Naur_form diff --git a/docs/reference/generation/json.md b/docs/reference/generation/json.md deleted file mode 100644 index da9f1472..00000000 --- a/docs/reference/generation/json.md +++ /dev/null @@ -1,105 +0,0 @@ -# JSON structured generation - -Outlines can make any open source model return a JSON object that follows a structure that is specified by the user. This is useful whenever we want the output of the model to be processed by code downstream: code does not understand natural language but rather the structured language it has been programmed to understand. - -There are mostly two reasons why someone would want to get an output formatted as JSON from a LLM: - -1. Parse the answer (e.g. with Pydantic), store it somewhere, return it to a user, etc. -2. Call a function with the result - -Outlines has you covered in both cases! Indeed, to define the structure of the JSON you want the model to follow you can either provide a Pydantic model, or a function. No need to duplicate code! - -## Using Pydantic - -Outlines can infer the structure of the output from a Pydantic model. The result is an instance of the model that contains the values returned by the LLM: - -```python -from pydantic import BaseModel - -from outlines import models, generate - - -class User(BaseModel): - name: str - last_name: str - id: int - - -model = models.transformers("microsoft/Phi-3-mini-4k-instruct") -generator = generate.json(model, User) -result = generator( - "Create a user profile with the fields name, last_name and id" -) -print(result) -# User(name="John", last_name="Doe", id=11) -``` - -!!! 
Note "JSON and whitespaces" - - By default Outlines prevents the model from generating json with syntactic newlines, tabs, or multiple spaces. The default `whitespace_pattern` is `r"[ ]?"`. Small models tend to enter an infinite repetition loop if the `whitespace_pattern` allows infinite spacing. If you would like to allow the model to generate multiple tabs, newlines, and spaces, you can set the whitespace pattern as follows: - - ```python - generator = generate.json(model, User, whitespace_pattern=r"[\n\t ]*") - ``` - -!!! Note "Performance" - - `generation.json` computes an index that helps Outlines guide generation. This can take some time, but only needs to be done once. If you want to generate several times with the same schema make sure that you only call `generate.json` once. - -!!! Tip "Custom types" - - Outlines provides [custom Pydantic types](types.md) so you do not have to write regular expressions for common types, such as phone numbers or zip codes. - -## Using a JSON Schema - -Instead of a Pydantic model you can pass a string that represents a [JSON Schema](https://json-schema.org/) specification to `generate.json`: - -```python -from pydantic import BaseModel - -from outlines import models -from outlines import generate - -model = models.transformers("microsoft/Phi-3-mini-4k-instruct") - -schema = """ -{ - "title": "User", - "type": "object", - "properties": { - "name": {"type": "string"}, - "last_name": {"type": "string"}, - "id": {"type": "integer"} - }, - "required": ["name", "last_name", "id"] -} -""" - -generator = generate.json(model, schema) -result = generator( - "Create a user profile with the fields name, last_name and id" -) -print(result) -# User(name="John", last_name="Doe", id=11) -``` - -## From a function's signature - -Outlines can infer the structure of the output from the signature of a function. The result is a dictionary, and can be passed directly to the function using the usual dictionary expansion syntax `**`: - -```python -from outlines import models -from outlines import generate - -def add(a: int, b: int): - return a + b - -model = models.transformers("microsoft/Phi-3-mini-4k-instruct") -generator = generate.json(model, add) -result = generator("Return two integers named a and b respectively. a is odd and b even.") - -print(add(**result)) -# 3 -``` - -A great advantage of passing functions directly to specify the structure is that the structure of the LLM will change with the function's definition. No need to change the code at several places! diff --git a/docs/reference/generation/regex.md b/docs/reference/generation/regex.md deleted file mode 100644 index 23df3556..00000000 --- a/docs/reference/generation/regex.md +++ /dev/null @@ -1,27 +0,0 @@ -# Regular expressions - -Outlines can guarantee that the text generated by the LLM will be valid to a regular expression: - -```python -from outlines import models, generate - -model = models.transformers("microsoft/Phi-3-mini-4k-instruct") - -generator = generate.regex( - model, - r"((25[0-5]|2[0-4]\d|[01]?\d\d?)\.){3}(25[0-5]|2[0-4]\d|[01]?\d\d?)", -) - -prompt = "What is the IP address of the Google DNS servers? " -answer = generator(prompt, max_tokens=30) - -print(answer) -# What is the IP address of the Google DNS servers? -# 2.2.6.1 -``` - -If you find yourself using `generate.regex` to restrict the answers' type you can take a look at [type-structured generation](types.md) instead. - -!!! Note "Performance" - - `generate.regex` computes an index that helps Outlines guide generation. 
This can take some time, but only needs to be done once. If you want to generate several times using the same regular expression make sure that you only call `generate.regex` once. diff --git a/docs/reference/generation/structured_generation_explanation.md b/docs/reference/generation/structured_generation_explanation.md deleted file mode 100644 index aa27a7a8..00000000 --- a/docs/reference/generation/structured_generation_explanation.md +++ /dev/null @@ -1,54 +0,0 @@ -# How does Outlines work? - - -Language models generate text token by token, using the previous token sequence as input and sampled logits as output. This document explains the structured generation process, where only legal tokens are considered for the next step based on a predefined automata, e.g. a regex-defined [finite-state machine](https://en.wikipedia.org/wiki/Finite-state_machine) (FSM) or [Lark](https://lark-parser.readthedocs.io/en/stable/) grammar.` - - -## Worked Example - -Let's consider a worked example with a pattern for whole and decimal numbers: - -`^\d*(\.\d+)?$`. - -### Creating Automata - -The pattern is first converted into an automata. Below is a brief explanation of the automata conversion and its representation. - -**Automata Diagram:** - -```mermaid -graph LR - node0("1-9") --> node1("1-9") - node1 --> node1 - node1 --> nodeEND{{END}} - node1 --> nodePeriod(".") - nodePeriod --> node2("1-9") - node2 --> node2 - node2 --> nodeEND{{END}} -``` - -### Generating a Token - -Let's assume that we're in the middle of generation, and so far "748" has been generated. Here is the automata with the current state highlighted in green, with the legal next characters being another number (1-9), a dot (.), or end of sequence. - -```mermaid -graph LR - node0("1-9") --> node1("1-9") - node1 --> node1 - node1 --> nodeEND{{END}} - node1 --> nodePeriod(".") - nodePeriod --> node2("1-9") - node2 --> node2 - node2 --> nodeEND{{END}} - - style node1 fill:#090 -``` - -Generating a token requires the following steps: - -- Feed the previous input sequence ("748") into the language model. -- Language model runs a forward pass and produces token logits. -- Outlines logits processor sets the probability of illegal tokens to 0%. -- A token is sampled from the set of legal tokens. 
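To make the last two steps concrete, here is a small, self-contained sketch of the masking idea. This is only an illustration, not Outlines' actual logits processor: the toy vocabulary, token ids and set of legal tokens are invented for the example.

```python
import math

# Toy vocabulary: with "748" generated so far, the automata only allows
# another digit, a "." or the end-of-sequence token.
vocabulary = {0: "3", 1: ".", 2: "<EOS>", 3: "foo", 4: "+"}
legal_token_ids = {0, 1, 2}


def mask_illegal_tokens(logits: list[float], legal_ids: set[int]) -> list[float]:
    # A logit of -inf becomes a probability of 0 after the softmax, so the
    # sampler can only pick tokens that keep the output inside the pattern.
    return [
        logit if token_id in legal_ids else -math.inf
        for token_id, logit in enumerate(logits)
    ]


model_logits = [2.1, 0.3, 1.7, 3.5, 0.9]  # pretend model output
print(mask_illegal_tokens(model_logits, legal_token_ids))
# [2.1, 0.3, 1.7, -inf, -inf]
```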
- -![Generation and Logits Processing Flow Chart](../../assets/images/logits_processing_diagram.svg) diff --git a/docs/reference/generation/types.md b/docs/reference/generation/types.md deleted file mode 100644 index 5b83a591..00000000 --- a/docs/reference/generation/types.md +++ /dev/null @@ -1,83 +0,0 @@ -# Custom types - -Outlines provides custom Pydantic types so you can focus on your use case rather than on writing regular expressions: - -| Category | Type | Import | Description | -|:--------:|:----:|:-------|:------------| -| ISBN | 10 & 13 | `outlines.types.ISBN` | There is no guarantee that the [check digit][wiki-isbn] will be correct | -| Airport | IATA | `outlines.types.airports.IATA` | Valid [airport IATA codes][wiki-airport-iata] | -| Country | alpha-2 code | `outlines.types.airports.Alpha2` | Valid [country alpha-2 codes][wiki-country-alpha-2] | -| | alpha-3 code | `outlines.types.countries.Alpha3` | Valid [country alpha-3 codes][wiki-country-alpha-3] | -| | numeric code | `outlines.types.countries.Numeric` | Valid [country numeric codes][wiki-country-numeric] | -| | name | `outlines.types.countries.Name` | Valid country names | -| | flag | `outlines.types.countries.Flag` | Valid flag emojis | -| | email | `outlines.types.Email` | Valid email address | - -Some types require localization. We currently only support US types, but please don't hesitate to create localized versions of the different types and open a Pull Request. Localized types are specified using `types.locale` in the following way: - -```python -from outlines import types - -types.locale("us").ZipCode -types.locale("us").PhoneNumber -``` - -Here are the localized types that are currently available: - -| Category | Locale | Import | Description | -|:--------:|:----:|:-------|:------------| -| Zip code | US | `ZipCode` | Generate US Zip(+4) codes | -| Phone number | US | `PhoneNumber` | Generate valid US phone numbers | - - -You can use these types in Pydantic schemas for JSON-structured generation: - -```python -from pydantic import BaseModel - -from outlines import models, generate, types - -# Specify the locale for types -locale = types.locale("us") - -class Client(BaseModel): - name: str - phone_number: locale.PhoneNumber - zip_code: locale.ZipCode - - -model = models.transformers("microsoft/Phi-3-mini-4k-instruct") -generator = generate.json(model, Client) -result = generator( - "Create a client profile with the fields name, phone_number and zip_code" -) -print(result) -# name='Tommy' phone_number='129-896-5501' zip_code='50766' -``` - -Or simply with `outlines.generate.format`: - -```python -from pydantic import BaseModel - -from outlines import models, generate, types - - -model = models.transformers("microsoft/Phi-3-mini-4k-instruct") -generator = generate.format(model, types.locale("us").PhoneNumber) -result = generator( - "Return a US Phone number: " -) -print(result) -# 334-253-2630 -``` - - -We plan on adding many more custom types. If you have found yourself writing regular expressions to generate fields of a given type, or if you could benefit from more specific types don't hesite to [submit a PR](https://github.com/outlines-dev/outlines/pulls) or [open an issue](https://github.com/outlines-dev/outlines/issues/new/choose). 
- - -[wiki-isbn]: https://en.wikipedia.org/wiki/ISBN#Check_digits -[wiki-airport-iata]: https://en.wikipedia.org/wiki/IATA_airport_code -[wiki-country-alpha-2]: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-2 -[wiki-country-alpha-3]: https://en.wikipedia.org/wiki/ISO_3166-1_alpha-3 -[wiki-country-numeric]: https://en.wikipedia.org/wiki/ISO_3166-1_numeric diff --git a/docs/reference/index.md b/docs/reference/index.md deleted file mode 100644 index a5357fd8..00000000 --- a/docs/reference/index.md +++ /dev/null @@ -1,15 +0,0 @@ -# Reference - -## Structured generation - -While LLM capabilities are increasingly impressive, we can make their output more reliable by steering the generation. Outlines thus offers mechanisms to specify high level constraints on text completions by generative language models. - -Stopping sequence -By default, language models stop generating tokens after and token was generated, or after a set maximum number of tokens. Their output can be verbose, and for practical purposes it is often necessary to stop the generation after a given sequence has been found instead. You can use the stop_at keyword argument when calling the model with a prompt: - -```python -import outlines.models as models - -complete = models.openai("gpt-3.5-turbo") -expert = complete("Name an expert in quantum gravity.", stop_at=["\n", "."]) -``` diff --git a/docs/reference/models/exllamav2.md b/docs/reference/models/exllamav2.md deleted file mode 100644 index afe54211..00000000 --- a/docs/reference/models/exllamav2.md +++ /dev/null @@ -1,7 +0,0 @@ -# ExllamaV2 - -```bash -pip install exllamav2 transformers torch -``` - -*Coming soon* diff --git a/docs/reference/models/llamacpp.md b/docs/reference/models/llamacpp.md deleted file mode 100644 index a84f0c18..00000000 --- a/docs/reference/models/llamacpp.md +++ /dev/null @@ -1,226 +0,0 @@ -# Llama.cpp - -Outlines provides an integration with [Llama.cpp](https://github.com/ggerganov/llama.cpp) using the [llama-cpp-python library][llamacpp]. Llamacpp allows to run quantized models on machines with limited compute. - -!!! Note "Installation" - - You need to install the `llama-cpp-python` library to use the llama.cpp integration. See the [installation section](#installation) for instructions to install `llama-cpp-python` with CUDA, Metal, ROCm and other backends. - -## Load the model - -You can initialize the model by passing the name of the repository on the HuggingFace Hub, and the filenames (or glob pattern): - -```python -from outlines import models - -model = models.llamacpp("TheBloke/phi-2-GGUF", "phi-2.Q4_K_M.gguf") -``` - -This will download the model files to the hub cache folder and load the weights in memory. - -You can also initialize the model by passing the path to the weights on your machine. Assuming [Phi2's weights](https://huggingface.co/TheBloke/phi-2-GGUF) are in the current directory: - -```python -from outlines import models -from llama_cpp import Llama - -llm = Llama("./phi-2.Q4_K_M.gguf") -model = models.LlamaCpp(llm) -``` - -If you need more control, you can pass the same keyword arguments to the model as you would pass in the [llama-ccp-library][llamacpp]: - -```python -from outlines import models - -model = models.llamacpp( - "TheBloke/phi-2-GGUF", - "phi-2.Q4_K_M.gguf" - n_ctx=512, # to set the context length value -) -``` - -**Main parameters:** - -| Parameters | Type | Description | Default | -|------------|------|-------------|---------| -| `n_gpu_layers`| `int` | Number of layers to offload to GPU. 
If -1, all layers are offloaded | `0` | -| `split_mode` | `int` | How to split the model across GPUs. `1` for layer-wise split, `2` for row-wise split | `1` | -| `main_gpu` | `int` | Main GPU | `0` | -| `tensor_split` | `Optional[List[float]]` | How split tensors should be distributed accross GPUs. If `None` the model is not split. | `None` | -| `n_ctx` | `int` | Text context. Inference from the model if set to `0` | `0` | -| `n_threads` | `Optional[int]` | Number of threads to use for generation. All available threads if set to `None`.| `None` | -| `verbose` | `bool` | Print verbose outputs to `stderr` | `False` | - -See the [llama-cpp-python documentation](https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__init__) for the full list of parameters. - -### Load the model on GPU - -!!! Note - - [Make sure](#cuda) that you installed `llama-cpp-python` with GPU support. - - To load the model on GPU, pass `n_gpu_layers=-1`: - -```python -from outlines import models - -model = models.llamacpp( - "TheBloke/phi-2-GGUF", - "phi-2.Q4_K_M.gguf" - n_gpu_layers=-1, # to use GPU acceleration -) -``` - -This also works with generators built with `generate.regex`, `generate.json`, `generate.cfg`, `generate.format` and `generate.choice`. - -### Load LoRA adapters - -You can load LoRA adapters dynamically: - -```python -from outlines import models, generate - -model = models.llamacpp("TheBloke/phi-2-GGUF", "phi-2.Q4_K_M.gguf") -generator = generate.text(model) -answer_1 = generator("prompt") - -model.load_lora("./path/to/adapter.gguf") -answer_2 = generator("prompt") -``` - -To load another adapter you need to re-initialize the model. Otherwise the adapter will be added on top of the previous one: - -```python -from outlines import models - -model = models.llamacpp("TheBloke/phi-2-GGUF", "phi-2.Q4_K_M.gguf") -model.load_lora("./path/to/adapter1.gguf") # Load first adapter - -model = models.llamacpp("TheBloke/phi-2-GGUF", "phi-2.Q4_K_M.gguf") -model.load_lora("./path/to/adapter2.gguf") # Load second adapter -``` - -## Generate text - -In addition to the parameters described in the [text generation section](../text.md) you can pass extra keyword arguments, for instance to set sampling parameters not exposed in Outlines' public API: - -```python -from outlines import models, generate - - -model = models.llamacpp("TheBloke/phi-2-GGUF", "phi-2.Q4_K_M.gguf") -generator = generate.text(model) - -answer = generator("A prompt", presence_penalty=0.8) -``` - -**Extra keyword arguments:** - -The value of the keyword arguments you pass to the generator suspersede the values set when initializing the sampler or generator. All extra sampling methods and repetition penalties are disabled by default. - -| Parameters | Type | Description | Default | -|------------|------|-------------|---------| -| `suffix` | `Optional[str]` | A suffix to append to the generated text. If `None` no suffix is added. | `None` | -| `echo` | `bool` | Whether to preprend the prompt to the completion. | `False` | -| `seed` | `int` | The random seed to use for sampling. | `None` | -| `max_tokens` | `Optional[int]` | The maximum number of tokens to generate. If `None` the maximum number of tokens depends on `n_ctx`. | `16` | -| `frequence_penalty` | `float` | The penalty to apply to tokens based on their frequency in the past 64 tokens. | `0.0` | -| `presence_penalty` | `float` | The penalty to apply to tokens based on their presence in the past 64 tokens. 
| `0.0` | -| `repeat_penalty` | `float` | The penalty to apply to repeated tokens in the past 64 tokens. | `1.` | -| `stopping_criteria` | `Optional[StoppingCriteriaList]` | A list of stopping criteria to use. | `None` -| `logits_processor` | `Optional[LogitsProcessorList]` | A list of logits processors to use. The logits processor used for structured generation will be added to this list. | `None` -| `temperature` | `float` | The temperature to use for sampling | `1.0` | -| `top_p` | `float` | The top-p value to use for [nucleus sampling][degeneration]. | `1.` | -| `min_p` | `float` | The min-p value to use for [minimum-p sampling][minimum-p]. | `0.` | -| `typical_p` | `float` | The p value to use for [locally typical sampling][locally-typical]. | `1.0` | -| `stop` | `Optional[Union[str, List[str]]]` | A list of strings that stop generation when encountered. | `[]` | -| `top_k` | `int` | The top-k value used for [top-k sampling][top-k]. Negative value to consider all logit values. | `-1.` | -| `tfs_z` | `float` | The [tail-free sampling][tail-free] parameter. | `1.0` | -| `mirostat_mode` | `int` | The [mirostat sampling][mirostat] mode. | `0` | -| `mirostat_tau` | `float` | The target cross-entropy for [mirostat sampling][mirostat].| `5.0` | -| `mirostat_eta` | `float` | The learning rate used to update `mu` in [mirostat sampling][mirostat]. | `0.1` | - -See the [llama-cpp-python documentation][llama-cpp-python-call] for the full and up-to-date list of parameters and the [llama.cpp code][llama-cpp-sampling-params] for the default values of other -sampling parameters. - -### Streaming - - -## Installation - -You need to install the `llama-cpp-python` library to use the llama.cpp integration. - -### CPU - -For a *CPU-only* installation run: - -```bash -pip install llama-cpp-python -``` - -!!! Warning - - Do not run this command if you want support for BLAS, Metal or CUDA. Follow the instructions below instead. - -### CUDA - -```bash -CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python -``` - -It is also possible to install pre-built wheels with CUDA support (Python 3.10 and above): - -```bash -pip install llama-cpp-python \ - --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/ -``` - -Where `` is one of the following, depending on the version of CUDA installed on your system: - -- `cu121` for CUDA 12.1 -- `cu122` for CUDA 12.2 -- `cu123` CUDA 12.3 - -### Metal - -```bash -CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python -``` - -It is also possible to install pre-build wheels with Metal support (Python 3.10 or above, MacOS 11.0 and above): - -```bash -pip install llama-cpp-python \ - --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/metal -``` - -### OpenBLAS - -```bash -CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-python -``` - -### Other backend - -`llama.cpp` supports many other backends. 
Refer to the [llama.cpp documentation][llama-cpp-python-install] to use the following backends: - -- CLBast (OpenCL) -- hipBLAS (ROCm) -- Vulkan -- Kompute -- SYCL - - - - -[llamacpp]: https://github.com/abetlen/llama-cpp-python -[llama-cpp-python-call]: https://llama-cpp-python.readthedocs.io/en/latest/api-reference/#llama_cpp.Llama.__call__ -[llama-cpp-python-install]: https://github.com/abetlen/llama-cpp-python/tree/08b16afe11e7b42adec2fed0a781123383476045?tab=readme-ov-file#supported-backends -[llama-cpp-sampling-params]: https://github.com/ggerganov/llama.cpp/blob/e11a8999b5690f810c2c99c14347f0834e68c524/common/sampling.h#L22 -[mirostat]: https://arxiv.org/abs/2007.14966 -[degeneration]: https://arxiv.org/abs/1904.09751 -[top-k]: https://arxiv.org/abs/1805.04833 -[minimum-p]: https://github.com/ggerganov/llama.cpp/pull/3841 -[locally-typical]: https://arxiv.org/abs/2202.00666 -[tail-free]: https://www.trentonbricken.com/Tail-Free-Sampling diff --git a/docs/reference/models/mlxlm.md b/docs/reference/models/mlxlm.md deleted file mode 100644 index cf7bb744..00000000 --- a/docs/reference/models/mlxlm.md +++ /dev/null @@ -1,89 +0,0 @@ -# mlx-lm - -Outlines provides an integration with [mlx-lm](https://github.com/ml-explore/mlx-examples/tree/main/llms), allowing models to be run quickly on Apple Silicon via the [mlx](https://ml-explore.github.io/mlx/build/html/index.html) library. - -!!! Note "Installation" - - You need to install the `mlx` and `mlx-lm` libraries on a device which [supports Metal](https://support.apple.com/en-us/102894) to use the mlx-lm integration. - - -## Load the model - -You can initialize the model by passing the name of the repository on the HuggingFace Hub. The official repository for mlx-lm supported models is [mlx-community](https://huggingface.co/mlx-community). - -```python -from outlines import models - -model = models.mlxlm("mlx-community/Meta-Llama-3.1-8B-Instruct-8bit") -``` - -This will download the model files to the hub cache folder and load the weights in memory. - -The arguments `model_config` and `tokenizer_config` are available to modify loading behavior. For example, per the `mlx-lm` [documentation](https://github.com/ml-explore/mlx-examples/tree/main/llms#supported-models), you must set an eos_token for `qwen/Qwen-7B`. In outlines you may do so via - -``` -model = models.mlxlm( - "mlx-community/Meta-Llama-3.1-8B-Instruct-8bit", - tokenizer_config={"eos_token": "<|endoftext|>", "trust_remote_code": True}, -) -``` - -**Main parameters:** - -(Subject to change. Table based on [mlx-lm.load docstring](https://github.com/ml-explore/mlx-examples/blob/main/llms/mlx_lm/utils.py#L429)) - -| Parameters | Type | Description | Default | -|--------------------|--------|--------------------------------------------------------------------------------------------------|---------| -| `tokenizer_config` | `dict` | Configuration parameters specifically for the tokenizer. Defaults to an empty dictionary. | `{}` | -| `model_config` | `dict` | Configuration parameters specifically for the model. Defaults to an empty dictionary. | `{}` | -| `adapter_path` | `str` | Path to the LoRA adapters. If provided, applies LoRA layers to the model. | `None` | -| `lazy` | `bool` | If False, evaluate the model parameters to make sure they are loaded in memory before returning. | `False` | - - -## Generate text - -You may generate text using the parameters described in the [text generation documentation](../text.md). 
- -With the loaded model, you can generate text or perform structured generation, e.g. - -```python -from outlines import models, generate - -model = models.mlxlm("mlx-community/Meta-Llama-3.1-8B-Instruct-8bit") -generator = generate.text(model) - -answer = generator("A prompt", temperature=2.0) -``` - -## Streaming - -You may creating a streaming iterable with minimal changes - -```python -from outlines import models, generate - -model = models.mlxlm("mlx-community/Meta-Llama-3.1-8B-Instruct-8bit") -generator = generate.text(model) - -for token_str in generator.text("A prompt", temperature=2.0): - print(token_str) -``` - -## Structured - -You may perform structured generation with mlxlm to guarantee your output will match a regex pattern, json schema, or lark grammar. - -Example: Phone number generation with pattern `"\\+?[1-9][0-9]{7,14}"`: - -```python -from outlines import models, generate - -model = models.mlxlm("mlx-community/Meta-Llama-3.1-8B-Instruct-8bit") - -phone_number_pattern = "\\+?[1-9][0-9]{7,14}" -generator = generate.regex(model, phone_number_pattern) - -model_output = generator("What's Jennys Number?\n") -print(model_output) -# '8675309' -``` diff --git a/docs/reference/models/models.md b/docs/reference/models/models.md deleted file mode 100644 index 34b5be4c..00000000 --- a/docs/reference/models/models.md +++ /dev/null @@ -1,53 +0,0 @@ ---- -title: Models ---- - -# Models - -Outlines supports generation using a number of inference engines (`outlines.models`). Loading a model using outlines follows a similar interface between inference engines: - -```python -import outlines - -model = outlines.models.transformers("microsoft/Phi-3-mini-128k-instruct") -model = outlines.models.transformers_vision("llava-hf/llava-v1.6-mistral-7b-hf") -model = outlines.models.vllm("microsoft/Phi-3-mini-128k-instruct") -model = outlines.models.llamacpp( - "microsoft/Phi-3-mini-4k-instruct-gguf", "Phi-3-mini-4k-instruct-q4.gguf" -) -model = outlines.models.exllamav2("bartowski/Phi-3-mini-128k-instruct-exl2") -model = outlines.models.mlxlm("mlx-community/Phi-3-mini-4k-instruct-4bit") - -model = outlines.models.openai( - "gpt-4o-mini", - api_key=os.environ["OPENAI_API_KEY"] -) -``` - - -# Feature Matrix -| | [Transformers](transformers.md) | [Transformers Vision](transformers_vision.md) | [vLLM](vllm.md) | [llama.cpp](llamacpp.md) | [ExLlamaV2](exllamav2.md) | [MLXLM](mlxlm.md) | [OpenAI](openai.md)* | -|-------------------|--------------|---------------------|------|-----------|-----------|-------|---------| -| **Device** | | | | | | | | -| Cuda | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | N/A | -| Apple Silicon | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | N/A | -| x86 / AMD64 | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | N/A | -| **Sampling** | | | | | | | | -| Greedy | ✅ | ✅ | ✅ | ✅* | ✅ | ✅ | ❌ | -| Multinomial | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| Multiple Samples | ✅ | ✅ | | ❌ | | ❌ | ✅ | -| Beam Search | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | -| **Generation** | | | | | | | | -| Batch | ✅ | ✅ | ✅ | ❌ | ? | ❌ | ❌ | -| Stream | ✅ | ❌ | ❌ | ✅ | ? | ✅ | ❌ | -| **`outlines.generate`** | | | | | | | | -| Text | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| Structured* | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | - - -## Caveats - -- OpenAI doesn't support structured generation due to limitations in their API and server implementation. -- `outlines.generate` ["Structured"](../generation/generation.md) includes methods such as `outlines.generate.regex`, `outlines.generate.json`, `outlines.generate.cfg`, etc. -- MLXLM only supports Apple Silicon. 
-- llama.cpp greedy sampling available via multinomial with `temperature = 0.0`. diff --git a/docs/reference/models/openai.md b/docs/reference/models/openai.md deleted file mode 100644 index 7f610c17..00000000 --- a/docs/reference/models/openai.md +++ /dev/null @@ -1,170 +0,0 @@ -# OpenAI and compatible APIs - -!!! Installation - - You need to install the `openai` and `tiktoken` libraries to be able to use the OpenAI API in Outlines. - -## OpenAI models - -Outlines supports models available via the OpenAI Chat API, e.g. ChatGPT and GPT-4. You can initialize the model by passing the model name to `outlines.models.openai`: - -```python -from outlines import models - - -model = models.openai("gpt-3.5-turbo") -model = models.openai("gpt-4-turbo") -model = models.openai("gpt-4o") -``` - -Check the [OpenAI documentation](https://platform.openai.com/docs/models/gpt-4-turbo-and-gpt-4) for an up-to-date list of available models. You can pass any parameter you would pass to `openai.AsyncOpenAI` as keyword arguments: - -```python -import os -from outlines import models - - -model = models.openai( - "gpt-3.5-turbo", - api_key=os.environ["OPENAI_API_KEY"] -) -``` - -The following table enumerates the possible parameters. Refer to the [OpenAI SDK's code](https://github.com/openai/openai-python/blob/54a5911f5215148a0bdeb10e2bcfb84f635a75b9/src/openai/_client.py) for an up-to-date list. - -**Parameters:** - -| **Parameters** | **Type** | **Description** | **Default** | -|----------------|:---------|:----------------|:------------| -| `api_key` | `str` | OpenAI API key. Infered from `OPENAI_API_KEY` if not specified | `None` | -| `organization` | `str` | OpenAI organization id. Infered from `OPENAI_ORG_ID` if not specified | `None` | -| `project` | `str` | OpenAI project id. Infered from `OPENAI_PROJECT_ID` if not specified.| `None` | -| `base_url` | `str | https.URL` | Base URL for the endpoint. Infered from `OPENAI_BASE_URL` if no specified. | `None` | -| `timeout` | `float` | Request timeout.| `NOT_GIVEN` | -| `max_retries` | `int` | Maximum number of retries for failing requests | `2` | -| `default_headers` | `Mapping[str, str]` | Default HTTP headers | `None` | -| `default_query` | `Mapping[str, str]` | Custom parameters added to the HTTP queries | `None` | -| `http_client` | `https.AsyncClient` | User-specified `httpx` client | `None` | - -## Azure OpenAI models - -Outlines also supports Azure OpenAI models: - -```python -from outlines import models - - -model = models.azure_openai( - "azure-deployment-name", - "gpt-3.5-turbo", - api_version="2023-07-01-preview", - azure_endpoint="https://example-endpoint.openai.azure.com", -) -``` - -!!! Question "Why do I need to specify model and deployment name?" - - The model name is needed to load the correct tokenizer for the model. The tokenizer is necessary for structured generation. - - -You can pass any parameter you would pass to `openai.AsyncAzureOpenAI`. You can consult the [OpenAI SDK's code](https://github.com/openai/openai-python/blob/54a5911f5215148a0bdeb10e2bcfb84f635a75b9/src/openai/lib/azure.py) for an up-to-date list. - -**Parameters:** - - -| **Parameters** | **Type** | **Description** | **Default** | -|----------------|:---------|:----------------|:------------| -| `azure_endpoint` | `str` | Azure endpoint, including the resource. Infered from `AZURE_OPENAI_ENDPOINT` if not specified | `None` | -| `api_version` | `str` | API version. Infered from `AZURE_OPENAI_API_KEY` if not specified | `None` | -| `api_key` | `str` | OpenAI API key. 
Infered from `OPENAI_API_KEY` if not specified | `None` | -| `azure_ad_token` | `str` | Azure active directory token. Inference from `AZURE_OPENAI_AD_TOKEN` if not specified | `None` | -| `azure_ad_token_provider` | `AzureADTokenProvider` | A function that returns an Azure Active Directory token | `None` | -| `organization` | `str` | OpenAI organization id. Infered from `OPENAI_ORG_ID` if not specified | `None` | -| `project` | `str` | OpenAI project id. Infered from `OPENAI_PROJECT_ID` if not specified.| `None` | -| `base_url` | `str | https.URL` | Base URL for the endpoint. Infered from `OPENAI_BASE_URL` if not specified. | `None` | -| `timeout` | `float` | Request timeout.| `NOT_GIVEN` | -| `max_retries` | `int` | Maximum number of retries for failing requests | `2` | -| `default_headers` | `Mapping[str, str]` | Default HTTP headers | `None` | -| `default_query` | `Mapping[str, str]` | Custom parameters added to the HTTP queries | `None` | -| `http_client` | `https.AsyncClient` | User-specified `httpx` client | `None` | - -## Models that follow the OpenAI standard - -Outlines supports models that follow the OpenAI standard. You will need to initialize the OpenAI client properly configured and pass it to `outlines.models.openai` - -```python -import os -from openai import AsyncOpenAI -from outlines import models -from outlines.models.openai import OpenAIConfig - - -client = AsyncOpenAI( - api_key=os.environ.get("PROVIDER_KEY"), - base_url="http://other.provider.server.com" -) -config = OpenAIConfig("model_name") -model = models.openai(client, config) -``` - -!!! Warning - - You need to pass the async client to be able to do batch inference. - -## Advanced configuration - -For more advanced configuration option, such as support proxy, please consult the [OpenAI SDK's documentation](https://github.com/openai/openai-python): - - -```python -from openai import AsyncOpenAI, DefaultHttpxClient -from outlines import models -from outlines.models.openai import OpenAIConfig - - -client = AsyncOpenAI( - base_url="http://my.test.server.example.com:8083", - http_client=DefaultHttpxClient( - proxies="http://my.test.proxy.example.com", - transport=httpx.HTTPTransport(local_address="0.0.0.0"), - ), -) -config = OpenAIConfig("model_name") -model = models.openai(client, config) -``` - -It is possible to specify the values for `seed`, `presence_penalty`, `frequence_penalty`, `top_p` by passing an instance of `OpenAIConfig` when initializing the model: - -```python -from outlines.models.openai import OpenAIConfig -from outlines import models - - -config = OpenAIConfig( - presence_penalty=1., - frequency_penalty=1., - top_p=.95, - seed=0, -) -model = models.openai("gpt-3.5-turbo", config) -``` - -## Monitoring API use - -It is important to be able to track your API usage when working with OpenAI's API. The number of prompt tokens and completion tokens is directly accessible via the model instance: - -```python -from openai import AsyncOpenAI -import outlines.models - - -model = models.openai("gpt-4") - -print(model.prompt_tokens) -# 0 - -print(model.completion_tokens) -# 0 -``` - -These numbers are updated every time you call the model. 
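As a rough usage sketch (the counts below are only indicative; the actual values depend on the tokenizer and on the completion returned by the API):

```python
from outlines import models

model = models.openai("gpt-4o-mini")

# Every call through the model updates the usage counters.
answer = model("Name an expert in quantum gravity.")

print(model.prompt_tokens, model.completion_tokens)
# e.g. 8 12
```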
diff --git a/docs/reference/models/tgi.md b/docs/reference/models/tgi.md
deleted file mode 100644
index 3f056810..00000000
--- a/docs/reference/models/tgi.md
+++ /dev/null
@@ -1,3 +0,0 @@
-# Text-generation-inference (TGI)
-
-TGI uses Outlines to provide structured generation; see [their documentation](https://huggingface.co/docs/text-generation-inference/en/basic_tutorials/using_guidance).
diff --git a/docs/reference/models/transformers.md b/docs/reference/models/transformers.md
deleted file mode 100644
index 2a13e28e..00000000
--- a/docs/reference/models/transformers.md
+++ /dev/null
@@ -1,148 +0,0 @@
-# transformers
-
-
-!!! Installation
-
-    You need to install the `transformers`, `datasets` and `torch` libraries to be able to use these models in Outlines:
-
-    ```bash
-    pip install torch transformers datasets
-    ```
-
-
-Outlines provides an integration with the `torch` implementation of causal models in the [transformers][transformers] library. You can initialize the model by passing its name:
-
-```python
-from outlines import models
-
-model = models.transformers("microsoft/Phi-3-mini-4k-instruct", device="cuda")
-```
-
-If you need more fine-grained control you can also initialize the model and tokenizer separately:
-
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
-from outlines import models
-
-llm = AutoModelForCausalLM.from_pretrained("gpt2", output_attentions=True)
-tokenizer = AutoTokenizer.from_pretrained("gpt2")
-model = models.Transformers(llm, tokenizer)
-```
-
-# Using Logits Processors
-
-There are two ways to use Outlines Structured Generation with HuggingFace Transformers:
-
-1. Use the Outlines generation wrapper, `outlines.models.transformers`
-2. Use `OutlinesLogitsProcessor` with `transformers.AutoModelForCausalLM`
-
-Outlines supports a myriad of logits processors for structured generation. In these examples, we will use the `RegexLogitsProcessor`, which guarantees that the generated text matches the specified pattern.
-
-## Using `outlines.models.transformers`
-
-```python
-import outlines
-
-time_regex_pattern = r"(0?[1-9]|1[0-2]):[0-5]\d\s?(am|pm)?"
-
-model = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct", device="cuda")
-generator = outlines.generate.regex(model, time_regex_pattern)
-
-output = generator("The best time to visit a dentist is at ")
-print(output)
-# 2:30 pm
-```
-
-## Using models initialized via the `transformers` library
-
-```python
-import outlines
-import transformers
-
-
-model_uri = "microsoft/Phi-3-mini-4k-instruct"
-
-outlines_tokenizer = outlines.models.TransformerTokenizer(
-    transformers.AutoTokenizer.from_pretrained(model_uri)
-)
-phone_number_logits_processor = outlines.processors.RegexLogitsProcessor(
-    "\\+?[1-9][0-9]{7,14}",  # phone number pattern
-    outlines_tokenizer,
-)
-
-generator = transformers.pipeline('text-generation', model=model_uri)
-
-output = generator(
-    "Jenny gave me her number it's ",
-    logits_processor=transformers.LogitsProcessorList([phone_number_logits_processor])
-)
-print(output)
-# [{'generated_text': "Jenny gave me her number it's 2125550182"}]
-# not quite the 8675309 we expected, but it is a valid phone number
-```
-
-[transformers]: https://github.com/huggingface/transformers
-
-
-# Alternative Model Classes
-
-`outlines.models.transformers` defaults to `transformers.AutoModelForCausalLM`, which is the appropriate class for most standard large language models, including Llama 3, Mistral, Phi-3, etc.
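As a small sketch of that default behaviour, and assuming the `model_class` keyword shown in the Mamba and encoder-decoder examples below also accepts `AutoModelForCausalLM`, the following two calls should be equivalent:

```python
import outlines
from transformers import AutoModelForCausalLM

# Relies on the default model class (AutoModelForCausalLM)
model_default = outlines.models.transformers("microsoft/Phi-3-mini-4k-instruct")

# Passes the default class explicitly
model_explicit = outlines.models.transformers(
    "microsoft/Phi-3-mini-4k-instruct",
    model_class=AutoModelForCausalLM,
)
```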
-
-However, other variants with unique behavior can be used as well by passing the appropriate class.
-
-### Mamba
-
-[Mamba](https://github.com/state-spaces/mamba) is a transformer alternative which employs memory-efficient, linear-time decoding.
-
-To use Mamba with Outlines you must first install the necessary requirements:
-```bash
-pip install causal-conv1d>=1.2.0 mamba-ssm torch transformers
-```
-
-Then you can either create a Mamba Outlines model via
-```python
-import outlines
-
-model = outlines.models.mamba("state-spaces/mamba-2.8b-hf")
-```
-
-or explicitly with
-```python
-import outlines
-from transformers import MambaForCausalLM
-
-model = outlines.models.transformers(
-    "state-spaces/mamba-2.8b-hf",
-    model_class=MambaForCausalLM
-)
-```
-
-
-
-Read the [`transformers` documentation](https://huggingface.co/docs/transformers/en/model_doc/mamba) for more information.
-
-### Encoder-Decoder Models
-
-You can use encoder-decoder (seq2seq) models like T5 and BART with Outlines.
-
-Be cautious with model selection though: some models, such as `t5-base`, don't include certain characters (`{`), and you may get an error when trying to perform structured generation.
-
-T5 Example:
-```python
-from outlines import models
-from transformers import AutoModelForSeq2SeqLM
-
-model_pile_t5 = models.transformers(
-    model_name="EleutherAI/pile-t5-large",
-    model_class=AutoModelForSeq2SeqLM,
-)
-```
-
-Bart Example:
-```python
-model_bart = models.transformers(
-    model_name="facebook/bart-large",
-    model_class=AutoModelForSeq2SeqLM,
-)
-```
diff --git a/docs/reference/models/transformers_vision.md b/docs/reference/models/transformers_vision.md
deleted file mode 100644
index b488bca1..00000000
--- a/docs/reference/models/transformers_vision.md
+++ /dev/null
@@ -1,115 +0,0 @@
-# Transformers Vision
-
-Outlines allows seamless use of [vision models](https://huggingface.co/learn/computer-vision-course/en/unit4/multimodal-models/tasks-models-part1).
-
-`outlines.models.transformers_vision` shares its interface with, and is based on, [outlines.models.transformers](./transformers.md).
-
-Tasks supported include:
-
-- image + text -> text
-- video + text -> text
-
-
-
-## Example: Using [Llava-Next](https://huggingface.co/docs/transformers/en/model_doc/llava_next) Vision Models
-
-Install dependencies
-`pip install torchvision pillow flash-attn`
-
-Create the model
-```python
-import outlines
-from transformers import LlavaNextForConditionalGeneration
-
-model = outlines.models.transformers_vision(
-    "llava-hf/llava-v1.6-mistral-7b-hf",
-    model_class=LlavaNextForConditionalGeneration,
-    device="cuda",
-)
-```
-
-Create a convenience function to load a `PIL.Image` from a URL
-```python
-from PIL import Image
-from io import BytesIO
-from urllib.request import urlopen
-
-def img_from_url(url):
-    img_byte_stream = BytesIO(urlopen(url).read())
-    return Image.open(img_byte_stream).convert("RGB")
-```
-
-### Describing an image
-
-```python
-description_generator = outlines.generate.text(model)
-description_generator(
-    " detailed description:",
-    [img_from_url("https://upload.wikimedia.org/wikipedia/commons/2/25/Siam_lilacpoint.jpg")]
-)
-```
-
-> This is a color photograph featuring a Siamese cat with striking blue eyes. The cat has a creamy coat and a light eye color, which is typical for the Siamese breed. Its features include elongated ears, a long, thin tail, and a striking coat pattern.
The cat is sitting in an indoor setting, possibly on a cat tower or a similar raised platform, which is covered with a beige fabric, providing a comfortable and soft surface for the cat to rest or perch. The surface of the wall behind the cat appears to be a light-colored stucco or plaster. - -#### Multiple Images - -To include multiple images in your prompt you simply add more `` tokens to the prompt - -```python -image_urls = [ - "https://cdn1.byjus.com/wp-content/uploads/2020/08/ShapeArtboard-1-copy-3.png", # triangle - "https://cdn1.byjus.com/wp-content/uploads/2020/08/ShapeArtboard-1-copy-11.png", # hexagon -] -description_generator = outlines.generate.text(model) -description_generator( - "What shapes are present?", - list(map(img_from_url, image_urls)), -) -``` - -> There are two shapes present. One shape is a hexagon and the other shape is an triangle. ' - - -### Classifying an Image - -```python -pattern = "Mercury|Venus|Earth|Mars|Saturn|Jupiter|Neptune|Uranus|Pluto" -planet_generator = outlines.generate.regex(model, pattern) - -planet_generator( - "What planet is this: ", - [img_from_url("https://upload.wikimedia.org/wikipedia/commons/e/e3/Saturn_from_Cassini_Orbiter_%282004-10-06%29.jpg")] -) -``` - -> Saturn - - -### Extracting Structured Image data - -```python -from pydantic import BaseModel -from typing import List, Optional - -class ImageData(BaseModel): - caption: str - tags_list: List[str] - object_list: List[str] - is_photo: bool - -image_data_generator = outlines.generate.json(model, ImageData) - -image_data_generator( - " detailed JSON metadata:", - [img_from_url("https://upload.wikimedia.org/wikipedia/commons/9/98/Aldrin_Apollo_11_original.jpg")] -) -``` - -> `ImageData(caption='An astronaut on the moon', tags_list=['moon', 'space', 'nasa', 'americanflag'], object_list=['moon', 'moon_surface', 'space_suit', 'americanflag'], is_photo=True)` - - -## Resources - -### Chosing a model -- https://mmbench.opencompass.org.cn/leaderboard -- https://huggingface.co/spaces/WildVision/vision-arena diff --git a/docs/reference/models/vllm.md b/docs/reference/models/vllm.md deleted file mode 100644 index 1380d3d2..00000000 --- a/docs/reference/models/vllm.md +++ /dev/null @@ -1,231 +0,0 @@ -# vLLM - - -!!! Note "Installation" - - You need to install the `vllm` library to use the vLLM integration. See the [installation section](#installation) for instructions to install vLLM for CPU or ROCm. - -## Load the model - -Outlines supports models available via vLLM's offline batched inference interface. You can load a model using: - - -```python -from outlines import models - -model = models.vllm("microsoft/Phi-3-mini-4k-instruct") -``` - -Or alternatively: - -```python -import vllm -from outlines import models - -llm = vllm.LLM("microsoft/Phi-3-mini-4k-instruct") -model = models.VLLM(llm) -``` - - -Models are loaded from the [HuggingFace hub](https://huggingface.co/). - - -!!! Warning "Device" - - The default installation of vLLM only allows to load models on GPU. See the [installation instructions](#installation) to run models on CPU. 
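Before turning to loading options, here is a minimal sketch of how a loaded vLLM model is used with Outlines generators, in this case JSON-constrained generation driven by a Pydantic schema. The `Character` class and the prompt are purely illustrative.

```python
from pydantic import BaseModel
from outlines import models, generate


class Character(BaseModel):
    # Illustrative schema, not part of the vLLM reference
    name: str
    age: int


model = models.vllm("microsoft/Phi-3-mini-4k-instruct")
generator = generate.json(model, Character)
character = generator("Create a character for my game: ")
print(character)
# Character(name='...', age=...)
```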
- - -You can pass any parameter that you would normally pass to `vllm.LLM`, as keyword arguments: - -```python -from outlines import models - -model = models.vllm( - "microsoft/Phi-3-mini-4k-instruct", - trust_remote_code=True, - gpu_memory_utilization=0.7 -) -``` - -**Main parameters:** - -| **Parameters** | **Type** | **Description** | **Default** | -|----------------|:---------|:----------------|:------------| -| `tokenizer_mode`| `str` | "auto" will use the fast tokenizer if available and "slow" will always use the slow tokenizer. | `auto` -| `trust_remote_code`| `bool` | Trust remote code when downloading the model and tokenizer. | `False` | -| `tensor_parallel_size`| `int` | The number of GPUs to use for distributed execution with tensor parallelism.| `1` | -| `dtype`| `str` | The data type for the model weights and activations. Currently, we support `float32`, `float16`, and `bfloat16`. If `auto`, we use the `torch_dtype` attribute specified in the model config file. However, if the `torch_dtype` in the config is `float32`, we will use `float16` instead.| `auto` | -| `quantization`| `Optional[str]` | The method used to quantize the model weights. Currently, we support "awq", "gptq" and "squeezellm". If None, we first check the `quantization_config` attribute in the model config file. If that is None, we assume the model weights are not quantized and use `dtype` to determine the data type of the weights.| `None` | -| `revision`| `Optional[str]` | The specific model version to use. It can be a branch name, a tag name, or a commit id.| `None` | -| `tokenizer_revision`| `Optional[str]`| The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id.| `None` | -| `gpu_memory_utilization`| `float` | The ratio (between 0 and 1) of GPU memory to reserve for the model weights, activations, and KV cache. Higher values will increase the KV cache size and thus improve the model's throughput. However, if the value is too high, it may cause out-of-memory (OOM) errors.| `0.9` | -| `swap_space`| `int` | The size (GiB) of CPU memory per GPU to use as swap space. This can be used for temporarily storing the states of the requests when their `best_of` sampling parameters are larger than 1. If all requests will have `best_of=1`, you can safely set this to 0. Otherwise, too small values may cause out-of-memory (OOM) errors.| 4 | -| `enforce_eager`| `bool` | Whether to enforce eager execution. If True, we will disable CUDA graph and always execute the model in eager mode. If False, we will use CUDA graph and eager execution in hybrid.| `False` | -| `enable_lora` | `bool` | Whether to enable loading LoRA adapters | `False` | - -See the [vLLM code](https://github.com/vllm-project/vllm/blob/8f44facdddcf3c704f7d6a2719b6e85efc393449/vllm/entrypoints/llm.py#L72) for a list of all the available parameters. - -### Use quantized models - -vLLM supports AWQ, GPTQ and SqueezeLLM quantized models: - - -```python -from outlines import models - -model = models.vllm("TheBloke/Llama2-7b-Chat-AWQ", quantization="awq") -model = models.vllm("TheBloke/Mistral-7B-Instruct-v0.2-GPTQ", quantization="gptq") -model = models.vllm("https://huggingface.co/squeeze-ai-lab/sq-llama-30b-w4-s5", quantization="squeezellm") -``` - -!!! Warning "Dependencies" - - To use AWQ model you need to install the autoawq library `pip install autoawq`. - - To use GPTQ models you need to install the autoGTPQ and optimum libraries `pip install auto-gptq optimum`. 
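Once loaded, a quantized model behaves like any other vLLM model in Outlines. A minimal sketch of structured generation with the AWQ model from the example above (the review text and labels are illustrative):

```python
from outlines import models, generate

# Requires `pip install autoawq`, as noted in the warning above
model = models.vllm("TheBloke/Llama2-7b-Chat-AWQ", quantization="awq")

generator = generate.choice(model, ["positive", "negative"])
label = generator("Review: 'The pasta was excellent.' Sentiment: ")
```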
-
-
-### Multi-GPU usage
-
-To run multi-GPU inference with vLLM you need to set the `tensor_parallel_size` argument to the number of GPUs available when initializing the model. For instance, to run inference on 2 GPUs:
-
-
-```python
-from outlines import models
-
-model = models.vllm(
-    "microsoft/Phi-3-mini-4k-instruct",
-    tensor_parallel_size=2
-)
-```
-
-### Load LoRA adapters
-
-You can load LoRA adapters and alternate between them dynamically:
-
-```python
-from outlines import models
-
-model = models.vllm("facebook/opt-350m", enable_lora=True)
-model.load_lora("ybelkaa/opt-350m-lora")  # Load LoRA adapter
-model.load_lora(None)  # Unload LoRA adapter
-```
-
-## Generate text
-
-In addition to the parameters described in the [text generation section](../text.md), you can pass an instance of `SamplingParams` directly to any generator via the `sampling_params` keyword argument:
-
-```python
-from vllm.sampling_params import SamplingParams
-from outlines import models, generate
-
-
-model = models.vllm("microsoft/Phi-3-mini-4k-instruct")
-generator = generate.text(model)
-
-params = SamplingParams(n=2, frequency_penalty=1., min_tokens=2)
-answer = generator("A prompt", sampling_params=params)
-```
-
-This also works with generators built with `generate.regex`, `generate.json`, `generate.cfg`, `generate.format` and `generate.choice`.
-
-!!! Note
-
-    The values passed via the `SamplingParams` instance supersede the other arguments to the generator or the samplers.
-
-**`SamplingParams` attributes:**
-
-| Parameters | Type | Description | Default |
-|:-----------|------------------|:-----------------------|---------|
-| `n` | `int` | Number of output sequences to return for the given prompt. | `1` |
-| `best_of` | `Optional[int]` | Number of output sequences that are generated from the prompt. From these `best_of` sequences, the top `n` sequences are returned. `best_of` must be greater than or equal to `n`. This is treated as the beam width when `use_beam_search` is True. By default, `best_of` is set to `n`. | `None` |
-| `presence_penalty` | `float` | Float that penalizes new tokens based on whether they appear in the generated text so far. Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to repeat tokens. | `0.0` |
-| `frequency_penalty` | `float` | Float that penalizes new tokens based on their frequency in the generated text so far. Values > 0 encourage the model to use new tokens, while values < 0 encourage the model to repeat tokens. | `0.0` |
-| `repetition_penalty` | `float` | Float that penalizes new tokens based on whether they appear in the prompt and the generated text so far. Values > 1 encourage the model to use new tokens, while values < 1 encourage the model to repeat tokens. | `1.0` |
-| `temperature` | `float` | Float that controls the randomness of the sampling. Lower values make the model more deterministic, while higher values make the model more random. Zero means greedy sampling. | `1.0` |
-| `top_p` | `float` | Float that controls the cumulative probability of the top tokens to consider. Must be in (0, 1]. Set to 1 to consider all tokens. | `1.0` |
-| `top_k` | `int` | Integer that controls the number of top tokens to consider. Set to -1 to consider all tokens. | `-1` |
-| `min_p` | `float` | Float that represents the minimum probability for a token to be considered, relative to the probability of the most likely token. Must be in [0, 1]. Set to 0 to disable this.
| `0.0` | -| `seed` | `Optional[int]` | Random seed to use for the generation. | `None` | -| `use_beam_search` | `bool` | Whether to use beam search instead of sampling. | `False` | -| `length_penalty` | `float` | Float that penalizes sequences based on their length. Used in beam search. | `1.0` | -| `early_stopping` | `Union[bool, str]` | Controls the stopping condition for beam search. It accepts the following values: `True`, where the generation stops as soon as there are `best_of` complete candidates; `False`, where an heuristic is applied and the generation stops when is it very unlikely to find better candidates; `"never"`, where the beam search procedure only stops when there cannot be better candidates (canonical beam search algorithm). | `False` | -| `stop` | `Optional[Union[str, List[str]]]` | List of strings that stop the generation when they are generated. The returned output will not contain the stop strings. | `None` | -| `stop_token_ids` | `Optional[List[int]]` | List of tokens that stop the generation when they are generated. The returned output will contain the stop tokens unless the stop tokens are special tokens. | `None` | -| `include_stop_str_in_output` | `bool` | Whether to include the stop strings in output text. Defaults to False. | `False` | -| `ignore_eos` | `bool` | Whether to ignore the EOS token and continue generating tokens after the EOS token is generated. | `False` | -| `max_tokens` | `int` | Maximum number of tokens to generate per output sequence. | `16` | -| `min_tokens` | `int` | Minimum number of tokens to generate per output sequence before EOS or stop_token_ids can be generated | `0` | -| `skip_special_tokens` | `bool` | Whether to skip special tokens in the output. | `True` | -| `spaces_between_special_tokens` | `bool` | Whether to add spaces between special tokens in the output. Defaults to True. | `True` | - -### Streaming - -!!! Warning - - Streaming is not available for the offline vLLM integration. - - -## Installation - -By default the vLLM library is installed with pre-commpiled C++ and CUDA binaries and will only run on GPU: - -```python -pip install vllm -``` - -### CPU - -You need to have the `gcc` compiler installed on your system. Then you will need to install vLLM from source. First clone the repository: - -```bash -git clone https://github.com/vllm-project/vllm.git -cd vllm -``` - -Install the Python packages needed for the installation: - -```bash -pip install --upgrade pip -pip install wheel packaging ninja setuptools>=49.4.0 numpy -pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu -``` - -and finally run: - -```bash -VLLM_TARGET_DEVICE=cpu python setup.py install -``` - -See the [vLLM documentation][vllm-install-cpu] for more details, alternative installation methods (Docker) and performance tips. - -### ROCm - - -You will need to install vLLM from source. First install Pytorch on ROCm: - -```bash -pip install torch==2.2.0.dev20231206+rocm5.7 --index-url https://download.pytorch.org/whl/nightly/rocm5.7 # tested version -``` - -You will then need to install flash attention for ROCm following [these instructions][rocm-flash-attention]. You can then install `xformers=0.0.23` and apply the patches needed to adapt Flash Attention for ROCm: - -```bash -pip install xformers==0.0.23 --no-deps -bash patch_xformers.rocm.sh -``` - -And finally build vLLM: - -```bash -cd vllm -pip install -U -r requirements-rocm.txt -python setup.py install # This may take 5-10 minutes. 
-``` - -See the [vLLM documentation][vllm-install-rocm] for alternative installation methods (Docker). - - -[vllm-install-cpu]: https://docs.vllm.ai/en/latest/getting_started/cpu-installation.html -[vllm-install-rocm]: https://docs.vllm.ai/en/latest/getting_started/amd-installation.html -[rocm-flash-attention]: https://github.com/ROCm/flash-attention/tree/flash_attention_for_rocm#amd-gpurocm-support diff --git a/docs/reference/prompting.md b/docs/reference/prompting.md deleted file mode 100644 index 8ea9c024..00000000 --- a/docs/reference/prompting.md +++ /dev/null @@ -1,403 +0,0 @@ -# Prompt templating - -Outlines provides a powerful domain-specific language to write and manage -prompts, via what we call *prompt functions*. Prompt functions are Python -functions that contain a template for the prompt in their docstring, and their -arguments correspond to the variables used in the prompt. When called, a prompt -function returns the template rendered with the values of the arguments. - -The aim of prompt functions is to solve several recurrent problems with prompting: - -1. **Building complex prompts quickly leads to messy code.** This problem has - already been solved in the web development community by using templating, so - why not use it here? -2. **Composing prompts is difficult.** Why not just compose functions? -3. **Separating prompts from code.** Encapsulation in functions allows a clean - separation between prompts and code. Moreover, like any function, prompt - functions can be imported from other modules. - -Outlines uses the [Jinja templating -engine](https://jinja.palletsprojects.com/en/3.1.x/) to render prompts, which -allows to easily compose complex prompts. - -!!! warning "Prompt rendering" - - Prompt functions are opinionated when it comes to prompt rendering. These opinions are meant to avoid common prompting errors, but can have unintended consequences if you are doing something unusual. We advise to always print the prompt before using it. You can also [read the - reference](#formatting-conventions) section if you want to know more. - -## Your first prompt - -The following snippet showcases a very simple prompt. The variables between -curly brackets `{{ }}` are placeholders for the values of the arguments you -will pass to the prompt function. - -=== "Code" - - ```python - import outlines - - @outlines.prompt - def greetings(name, question): - """Hello, {{ name }}! - {{ question }} - """ - - prompt = greetings("user", "How are you?") - print(prompt) - ``` - -=== "Output" - - ```text - Hello, user! - How are you? - ``` - -If a variable is missing in the function's arguments, Jinja2 will throw an `UndefinedError` exception: - -=== "Code" - - ```python - import outlines - - @outlines.prompt - def greetings(name): - """Hello, {{ surname }}!""" - - prompt = greetings("user") - ``` - -=== "Output" - - ```text - Traceback (most recent call last): - File "", line 9, in - File "/home/remi/projects/normal/outlines/outlines/prompts.py", line 38, in __call__ - return render(self.template, **bound_arguments.arguments) - File "/home/remi/projects/normal/outlines/outlines/prompts.py", line 213, in render - return jinja_template.render(**values) - File "/home/remi/micromamba/envs/outlines/lib/python3.9/site-packages/jinja2/environment.py", line 1301, in render - self.environment.handle_exception() - File "/home/remi/micromamba/envs/outlines/lib/python3.9/site-packages/jinja2/environment.py", line 936, in handle_exception - raise rewrite_traceback_stack(source=source) - File "