From ee85483c4d444ffb00922961dd1d5ae10c1ba6ac Mon Sep 17 00:00:00 2001 From: Andrew Lapp Date: Fri, 31 May 2024 17:04:34 -0500 Subject: [PATCH 1/4] ASV PR bench workflow, pytest-bench -> ASV, add peakmem tests --- .github/workflows/asv_benchmark_pr.yml | 52 +++++++++++++++++++ .gitignore | 1 + benchmarks/__init__.py | 0 benchmarks/asv.conf.json | 20 +++++++ .../bench_json_schema.py | 43 +++++++-------- benchmarks/bench_numba_compile.py | 37 +++++++++++++ .../bench_regex_guide.py | 39 +++++++++----- .../conftest.py => benchmarks/common.py | 10 ++-- docs/community/contribute.md | 34 ++++++++++-- .../benchmark/test_benchmark_numba_compile.py | 33 ------------ 10 files changed, 191 insertions(+), 78 deletions(-) create mode 100644 .github/workflows/asv_benchmark_pr.yml create mode 100644 benchmarks/__init__.py create mode 100644 benchmarks/asv.conf.json rename tests/benchmark/test_benchmark_json_schema.py => benchmarks/bench_json_schema.py (70%) create mode 100644 benchmarks/bench_numba_compile.py rename tests/benchmark/test_benchmark_regex_fsm.py => benchmarks/bench_regex_guide.py (68%) rename tests/benchmark/conftest.py => benchmarks/common.py (74%) delete mode 100644 tests/benchmark/test_benchmark_numba_compile.py diff --git a/.github/workflows/asv_benchmark_pr.yml b/.github/workflows/asv_benchmark_pr.yml new file mode 100644 index 000000000..09786b72c --- /dev/null +++ b/.github/workflows/asv_benchmark_pr.yml @@ -0,0 +1,52 @@ +name: Benchmark PR + +on: + pull_request: + branches: [main] + workflow_dispatch: +env: + PYTHON_VERSION: "3.10" + WORKING_DIR: ${{ github.workspace }}/benchmarks + BENCHMARKS_OUTPUT: ${{ github.workspace }}/benchmarks_output + +jobs: + benchmark-pr: + runs-on: ubuntu-latest + if: contains(github.event.pull_request.labels.*.name, 'run_benchmarks') || github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_run' + + defaults: + run: + working-directory: ${{ env.WORKING_DIR }} + + steps: + + - name: Checkout repository + uses: 
actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install asv virtualenv lf-asv-formatter + + - name: Create ASV machine config file + run: asv machine --machine gh-runner --yes + + - name: Run Benchmarks - `PR HEAD` vs `main` + run: | + # prepare main branch for comparison + git remote add upstream https://github.com/${{ github.repository }}.git + git fetch upstream main + + # Run benchmarks, allow errors, they will be caught in the next step + asv continuous upstream/main HEAD \ + --no-stats --interleave-rounds -a repeat=3 || true + + - name: BENCHMARK RESULTS + run: asv compare --factor=1.1 --no-stats --split upstream/main HEAD diff --git a/.gitignore b/.gitignore index 9e95a8732..9add6d8c4 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ docs/build .idea/ *.gguf .venv +benchmarks/results diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json new file mode 100644 index 000000000..287bff98f --- /dev/null +++ b/benchmarks/asv.conf.json @@ -0,0 +1,20 @@ +{ + "version": 1, + "project": "Outlines", + "project_url": "https://outlines-dev.github.io/outlines/", + "repo": "..", + "branches": [ + "HEAD" + ], + "build_command": [ + "python -mpip install .[test]", + "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}", + ], + "environment_type": "virtualenv", + "show_commit_url": "https://github.com/lapp0/outlines/commit/", + "benchmark_dir": ".", + "env_dir": "env", + "results_dir": "results", + "html_dir": "html", + "build_cache_size": 8 +} diff --git a/tests/benchmark/test_benchmark_json_schema.py b/benchmarks/bench_json_schema.py similarity index 70% rename from tests/benchmark/test_benchmark_json_schema.py 
rename to benchmarks/bench_json_schema.py index 33f3f5b16..daa77510b 100644 --- a/tests/benchmark/test_benchmark_json_schema.py +++ b/benchmarks/bench_json_schema.py @@ -1,5 +1,3 @@ -import pytest - import outlines outlines.disable_cache() @@ -7,6 +5,12 @@ from outlines.fsm.guide import RegexGuide # noqa: E402 from outlines.fsm.json_schema import build_regex_from_schema # noqa: E402 +from .common import ( # noqa: E402 + clear_outlines_cache, + ensure_numba_compiled, + setup_tokenizer, +) + simple_schema = """{ "$defs": { "Armor": { @@ -63,30 +67,21 @@ "required": ["id", "work", "recording_artists"] }""" - schemas = dict(simple_schema=simple_schema, complex_schema=complex_schema) -@pytest.mark.parametrize("schema_name", schemas.keys()) -def test_benchmark_json_schema_to_regex(benchmark, ensure_numba_compiled, schema_name): - """Benchmark convert json schema to regex""" - schema = schemas[schema_name] - benchmark.pedantic( - build_regex_from_schema, - args=(schema,), - rounds=8, - ) +class JsonSchemaBenchmark: + params = schemas.keys() + + def setup(self, schema_name): + clear_outlines_cache() + self.tokenizer = setup_tokenizer() + self.schema = schemas[schema_name] + ensure_numba_compiled(self.tokenizer) + def time_json_schema_to_regex(self, schema_name): + build_regex_from_schema(self.schema) -@pytest.mark.parametrize("schema_name", schemas.keys()) -def test_benchmark_json_schema_to_fsm( - benchmark, tokenizer, ensure_numba_compiled, schema_name -): - """Benchmark compile json schema as FSM""" - schema = schemas[schema_name] - regex = build_regex_from_schema(schema) - benchmark.pedantic( - RegexGuide, - args=(regex, tokenizer), - rounds=8, - ) + def time_json_schema_to_fsm(self, schema_name): + regex = build_regex_from_schema(self.schema) + RegexGuide(regex, self.tokenizer) diff --git a/benchmarks/bench_numba_compile.py b/benchmarks/bench_numba_compile.py new file mode 100644 index 000000000..c0e9d87c4 --- /dev/null +++ b/benchmarks/bench_numba_compile.py @@ -0,0 
+1,37 @@ +import importlib + +import interegular +import numba + +import outlines + +from .common import clear_outlines_cache, setup_tokenizer + +outlines.disable_cache() + + +class NumbaCompileBenchmark: + def setup(self): + clear_outlines_cache() + from outlines.fsm import regex + + self.tokenizer = setup_tokenizer() + self.regex = regex + original_njit = numba.njit + + def mock_njit(*args, **kwargs): + kwargs["cache"] = False + return original_njit(*args, **kwargs) + + self.original_njit = original_njit + numba.njit = mock_njit + importlib.reload(self.regex) + self.regex_pattern, _ = self.regex.make_deterministic_fsm( + interegular.parse_pattern("a").to_fsm().reduce() + ) + + def teardown(self): + numba.njit = self.original_njit + + def time_compile_numba(self): + self.regex.create_fsm_index_tokenizer(self.regex_pattern, self.tokenizer) diff --git a/tests/benchmark/test_benchmark_regex_fsm.py b/benchmarks/bench_regex_guide.py similarity index 68% rename from tests/benchmark/test_benchmark_regex_fsm.py rename to benchmarks/bench_regex_guide.py index e9e45052a..efaea9e1f 100644 --- a/tests/benchmark/test_benchmark_regex_fsm.py +++ b/benchmarks/bench_regex_guide.py @@ -1,7 +1,7 @@ -import pytest - import outlines +from .common import clear_outlines_cache, ensure_numba_compiled, setup_tokenizer + outlines.disable_cache() from outlines.fsm.guide import RegexGuide # noqa: E402 @@ -19,14 +19,27 @@ } -@pytest.mark.parametrize("regex_name", regex_samples.keys()) -def test_benchmark_regex_to_fsm( - benchmark, tokenizer, ensure_numba_compiled, regex_name -): - """Benchmark converting regex to FSM""" - regex_str = regex_samples[regex_name] - benchmark.pedantic( - RegexGuide, - args=(regex_str, tokenizer), - rounds=8, - ) +class RegexGuideBenchmark: + params = regex_samples.keys() + + def setup(self, pattern_name): + clear_outlines_cache() + self.tokenizer = setup_tokenizer() + ensure_numba_compiled(self.tokenizer) + self.pattern = regex_samples[pattern_name] + + def 
time_regex_to_guide(self, pattern_name):
+        RegexGuide(self.pattern, self.tokenizer)
+
+
+class MemoryRegexGuideBenchmark:
+    params = ["simple_phone", "complex_span_constrained_relation_extraction"]
+
+    def setup(self, pattern_name):
+        clear_outlines_cache()
+        self.tokenizer = setup_tokenizer()
+        ensure_numba_compiled(self.tokenizer)
+        self.pattern = regex_samples[pattern_name]
+
+    def peakmem_regex_to_guide(self, pattern_name):
+        RegexGuide(self.pattern, self.tokenizer)
diff --git a/tests/benchmark/conftest.py b/benchmarks/common.py
similarity index 74%
rename from tests/benchmark/conftest.py
rename to benchmarks/common.py
index 902d5d6eb..e0fe36f14 100644
--- a/tests/benchmark/conftest.py
+++ b/benchmarks/common.py
@@ -1,17 +1,19 @@
-import pytest
 from transformers import AutoTokenizer
 
+import outlines.caching
 from outlines.fsm.guide import RegexGuide
 from outlines.models.transformers import TransformerTokenizer
 
 
-@pytest.fixture
-def tokenizer():
+def clear_outlines_cache():
+    outlines.caching.clear_cache()
+
+
+def setup_tokenizer():
     tokenizer = AutoTokenizer.from_pretrained("gpt2")
     return TransformerTokenizer(tokenizer)
 
 
-@pytest.fixture
 def ensure_numba_compiled(tokenizer):
     RegexGuide("a", tokenizer)
     return True
diff --git a/docs/community/contribute.md b/docs/community/contribute.md
index fb67576e4..b336eacad 100644
--- a/docs/community/contribute.md
+++ b/docs/community/contribute.md
@@ -57,12 +57,38 @@ And run the code style checks:
 pre-commit run --all-files
 ```
 
-When modifying the code related to the index compilation, we kindly ask you to
-post benchmarks before and after your changes. You can run benchmarks using:
+### Benchmarking
 
-```python
-pytest --benchmark-only
+Outlines uses [asv](https://asv.readthedocs.io) for automated benchmark testing. Benchmarks are run automatically before pull requests are merged to prevent performance degradation. 
+
+You can run the benchmark test suite locally with the following command:
+```
+asv run --config benchmarks/asv.conf.json
+```
+
+Run a specific test:
+```
+asv run --config benchmarks/asv.conf.json -b bench_json_schema.JsonSchemaBenchmark.time_json_schema_to_fsm
+```
+
+Profile a specific test: ```
+asv run --config benchmarks/asv.conf.json --profile -b bench_json_schema.JsonSchemaBenchmark.time_json_schema_to_fsm
+```
+
+Compare to `origin/main`
+```
+git fetch origin
+asv continuous origin/main HEAD --config benchmarks/asv.conf.json
+```
+
+#### ASV PR Behavior
+
+- **View ASV Benchmark Results:** Open the workflow, view `BENCHMARK RESULTS` section.
+- Merging is blocked unless benchmarks are run for the latest commit.
+- Benchmarks fail if performance degrades by more than 10% for any individual benchmark.
+- The "Benchmark PR" workflow runs when it's manually dispatched, or if the `run_benchmarks` label is added to the PR, in which case they run for every commit.
+
 ### Contribute to the documentation
diff --git a/tests/benchmark/test_benchmark_numba_compile.py b/tests/benchmark/test_benchmark_numba_compile.py
deleted file mode 100644
index 827d561bd..000000000
--- a/tests/benchmark/test_benchmark_numba_compile.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import importlib
-
-import interegular
-import numba
-
-import outlines
-
-outlines.disable_cache()
-
-
-def test_benchmark_compile_numba(benchmark, tokenizer, mocker):
-    """Compile a basic regex to benchmark the numba compilation time"""
-
-    def setup():
-        from outlines.fsm import regex
-
-        original_njit = numba.njit
-
-        def mock_njit(*args, **kwargs):
-            kwargs["cache"] = False
-            return original_njit(*args, **kwargs)
-
-        mocker.patch("numba.njit", new=mock_njit)
-        importlib.reload(regex)
-
-        regex_pattern, _ = regex.make_deterministic_fsm(
-            interegular.parse_pattern("a").to_fsm().reduce()
-        )
-        return (regex, regex_pattern, tokenizer), {}
-
-    benchmark.pedantic(
-        lambda r, *args: r.create_fsm_index_tokenizer(*args), rounds=2, 
setup=setup - ) From 02d8a845715519434812285301ae28afb21c1adf Mon Sep 17 00:00:00 2001 From: Andrew Lapp Date: Fri, 31 May 2024 19:41:25 -0500 Subject: [PATCH 2/4] ensure workflow fails if benchmark degredation >10% --- .github/workflows/asv_benchmark_pr.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/asv_benchmark_pr.yml b/.github/workflows/asv_benchmark_pr.yml index 09786b72c..90fb47423 100644 --- a/.github/workflows/asv_benchmark_pr.yml +++ b/.github/workflows/asv_benchmark_pr.yml @@ -49,4 +49,9 @@ jobs: --no-stats --interleave-rounds -a repeat=3 || true - name: BENCHMARK RESULTS - run: asv compare --factor=1.1 --no-stats --split upstream/main HEAD + run: | + asv compare --factor=1.1 --no-stats --split upstream/main HEAD | tee ${{ env.BENCHMARKS_OUTPUT }} + if grep -q "Benchmarks that have got worse" "${{ env.BENCHMARKS_OUTPUT }}"; then + echo "Performance degradation detected!" + exit 1 + fi From d59a2397d8682e4eee8297f4e3c0175600e5489c Mon Sep 17 00:00:00 2001 From: Andrew Lapp Date: Tue, 4 Jun 2024 01:32:32 -0500 Subject: [PATCH 3/4] disable outlines cache localized to the benchmarks scope --- benchmarks/bench_json_schema.py | 18 ++++++------------ benchmarks/bench_numba_compile.py | 11 ++++------- benchmarks/bench_regex_guide.py | 13 +++++-------- benchmarks/common.py | 5 ----- outlines/caching.py | 13 +++++++++++++ tests/test_cache.py | 31 +++++++++++++++++++++++++++++++ 6 files changed, 59 insertions(+), 32 deletions(-) diff --git a/benchmarks/bench_json_schema.py b/benchmarks/bench_json_schema.py index daa77510b..8d1ceeb24 100644 --- a/benchmarks/bench_json_schema.py +++ b/benchmarks/bench_json_schema.py @@ -1,15 +1,8 @@ -import outlines +from outlines.caching import cache_disabled +from outlines.fsm.guide import RegexGuide +from outlines.fsm.json_schema import build_regex_from_schema -outlines.disable_cache() - -from outlines.fsm.guide import RegexGuide # noqa: E402 -from outlines.fsm.json_schema import 
build_regex_from_schema # noqa: E402 - -from .common import ( # noqa: E402 - clear_outlines_cache, - ensure_numba_compiled, - setup_tokenizer, -) +from .common import ensure_numba_compiled, setup_tokenizer # noqa: E402 simple_schema = """{ "$defs": { @@ -74,14 +67,15 @@ class JsonSchemaBenchmark: params = schemas.keys() def setup(self, schema_name): - clear_outlines_cache() self.tokenizer = setup_tokenizer() self.schema = schemas[schema_name] ensure_numba_compiled(self.tokenizer) + @cache_disabled() def time_json_schema_to_regex(self, schema_name): build_regex_from_schema(self.schema) + @cache_disabled() def time_json_schema_to_fsm(self, schema_name): regex = build_regex_from_schema(self.schema) RegexGuide(regex, self.tokenizer) diff --git a/benchmarks/bench_numba_compile.py b/benchmarks/bench_numba_compile.py index c0e9d87c4..2713707e5 100644 --- a/benchmarks/bench_numba_compile.py +++ b/benchmarks/bench_numba_compile.py @@ -3,18 +3,14 @@ import interegular import numba -import outlines +from outlines.caching import cache_disabled +from outlines.fsm import regex -from .common import clear_outlines_cache, setup_tokenizer - -outlines.disable_cache() +from .common import setup_tokenizer class NumbaCompileBenchmark: def setup(self): - clear_outlines_cache() - from outlines.fsm import regex - self.tokenizer = setup_tokenizer() self.regex = regex original_njit = numba.njit @@ -33,5 +29,6 @@ def mock_njit(*args, **kwargs): def teardown(self): numba.njit = self.original_njit + @cache_disabled() def time_compile_numba(self): self.regex.create_fsm_index_tokenizer(self.regex_pattern, self.tokenizer) diff --git a/benchmarks/bench_regex_guide.py b/benchmarks/bench_regex_guide.py index efaea9e1f..099f94df2 100644 --- a/benchmarks/bench_regex_guide.py +++ b/benchmarks/bench_regex_guide.py @@ -1,10 +1,7 @@ -import outlines +from outlines.caching import cache_disabled +from outlines.fsm.guide import RegexGuide -from .common import clear_outlines_cache, ensure_numba_compiled, 
setup_tokenizer - -outlines.disable_cache() - -from outlines.fsm.guide import RegexGuide # noqa: E402 +from .common import ensure_numba_compiled, setup_tokenizer regex_samples = { "email": r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?", @@ -23,11 +20,11 @@ class RegexGuideBenchmark: params = regex_samples.keys() def setup(self, pattern_name): - clear_outlines_cache() self.tokenizer = setup_tokenizer() ensure_numba_compiled(self.tokenizer) self.pattern = regex_samples[pattern_name] + @cache_disabled() def time_regex_to_guide(self, pattern_name): RegexGuide(self.pattern, self.tokenizer) @@ -36,10 +33,10 @@ class MemoryRegexGuideBenchmark: params = ["simple_phone", "complex_span_constrained_relation_extraction"] def setup(self, pattern_name): - clear_outlines_cache() self.tokenizer = setup_tokenizer() ensure_numba_compiled(self.tokenizer) self.pattern = regex_samples[pattern_name] + @cache_disabled() def peakmem_regex_to_guide(self, pattern_name): RegexGuide(self.pattern, self.tokenizer) diff --git a/benchmarks/common.py b/benchmarks/common.py index e0fe36f14..7d999ea9b 100644 --- a/benchmarks/common.py +++ b/benchmarks/common.py @@ -1,14 +1,9 @@ from transformers import AutoTokenizer -import outlines.caching from outlines.fsm.guide import RegexGuide from outlines.models.transformers import TransformerTokenizer -def clear_outlines_cache(): - outlines.caching.clear_cache() - - def setup_tokenizer(): tokenizer = AutoTokenizer.from_pretrained("gpt2") return TransformerTokenizer(tokenizer) diff --git a/outlines/caching.py b/outlines/caching.py index 52d66af74..95392c7e8 100644 --- a/outlines/caching.py +++ b/outlines/caching.py @@ -1,4 +1,5 @@ import asyncio +import contextlib import functools import os from typing import Callable, Optional @@ -164,3 +165,15 @@ def clear_cache(): """Erase the cache completely.""" memory = get_cache() memory.clear() + + +@contextlib.contextmanager 
+def cache_disabled(): + # outlines.caching._caching_enabled + global _caching_enabled + original_state = _caching_enabled + _caching_enabled = False + try: + yield + finally: + _caching_enabled = original_state diff --git a/tests/test_cache.py b/tests/test_cache.py index 5a2de778e..eb4ec406e 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -1,5 +1,6 @@ import os import tempfile +import unittest import diskcache import pytest @@ -157,3 +158,33 @@ def foo(): # assert with version upgrade, old cache is invalidated and new cache is used a, b = foo() + + +def test_cache_disabled_decorator(test_cache): + """Ensure cache can be disabled in a local scope""" + + from outlines.caching import cache_disabled + + mock = unittest.mock.MagicMock() + + @test_cache + def fn(): + mock() + return 1 + + # first call isn't cached + fn() + assert mock.call_count == 1 + + # second call doesn't run fn, uses cache + fn() + assert mock.call_count == 1 + + # cache_disabled decorator disables cache within scope + with cache_disabled(): + fn() + assert mock.call_count == 2 # called once in cache_disabled scope + + # scope has exited, cache is enabled again + fn() + assert mock.call_count == 2 From 0ea382d28ded1cce90f9e037de5f80ac940c0fe7 Mon Sep 17 00:00:00 2001 From: Andrew Lapp Date: Tue, 4 Jun 2024 01:33:09 -0500 Subject: [PATCH 4/4] use outlines-dev/outlines for asv.conf.json show_commit_url --- benchmarks/asv.conf.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json index 287bff98f..f57db9a0b 100644 --- a/benchmarks/asv.conf.json +++ b/benchmarks/asv.conf.json @@ -11,7 +11,7 @@ "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}", ], "environment_type": "virtualenv", - "show_commit_url": "https://github.com/lapp0/outlines/commit/", + "show_commit_url": "https://github.com/outlines-dev/outlines/commit/", "benchmark_dir": ".", "env_dir": "env", 
"results_dir": "results",