From ee85483c4d444ffb00922961dd1d5ae10c1ba6ac Mon Sep 17 00:00:00 2001 From: Andrew Lapp Date: Fri, 31 May 2024 17:04:34 -0500 Subject: [PATCH 1/4] ASV PR bench workflow, pytest-bench -> ASV, add peakmem tests --- .github/workflows/asv_benchmark_pr.yml | 52 +++++++++++++++++++ .gitignore | 1 + benchmarks/__init__.py | 0 benchmarks/asv.conf.json | 20 +++++++ .../bench_json_schema.py | 43 +++++++-------- benchmarks/bench_numba_compile.py | 37 +++++++++++++ .../bench_regex_guide.py | 39 +++++++++----- .../conftest.py => benchmarks/common.py | 10 ++-- docs/community/contribute.md | 34 ++++++++++-- .../benchmark/test_benchmark_numba_compile.py | 33 ------------ 10 files changed, 191 insertions(+), 78 deletions(-) create mode 100644 .github/workflows/asv_benchmark_pr.yml create mode 100644 benchmarks/__init__.py create mode 100644 benchmarks/asv.conf.json rename tests/benchmark/test_benchmark_json_schema.py => benchmarks/bench_json_schema.py (70%) create mode 100644 benchmarks/bench_numba_compile.py rename tests/benchmark/test_benchmark_regex_fsm.py => benchmarks/bench_regex_guide.py (68%) rename tests/benchmark/conftest.py => benchmarks/common.py (74%) delete mode 100644 tests/benchmark/test_benchmark_numba_compile.py diff --git a/.github/workflows/asv_benchmark_pr.yml b/.github/workflows/asv_benchmark_pr.yml new file mode 100644 index 000000000..09786b72c --- /dev/null +++ b/.github/workflows/asv_benchmark_pr.yml @@ -0,0 +1,52 @@ +name: Benchmark PR + +on: + pull_request: + branches: [main] + workflow_dispatch: +env: + PYTHON_VERSION: "3.10" + WORKING_DIR: ${{ github.workspace }}/benchmarks + BENCHMARKS_OUTPUT: ${{ github.workspace }}/benchmarks_output + +jobs: + benchmark-pr: + runs-on: ubuntu-latest + if: contains(github.event.pull_request.labels.*.name, 'run_benchmarks') || github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_run' + + defaults: + run: + working-directory: ${{ env.WORKING_DIR }} + + steps: + + - name: Checkout repository + uses: 
actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install asv virtualenv lf-asv-formatter + + - name: Create ASV machine config file + run: asv machine --machine gh-runner --yes + + - name: Run Benchmarks - `PR HEAD` vs `main` + run: | + # prepare main branch for comparison + git remote add upstream https://github.com/${{ github.repository }}.git + git fetch upstream main + + # Run benchmarks, allow errors, they will be caught in the next step + asv continuous upstream/main HEAD \ + --no-stats --interleave-rounds -a repeat=3 || true + + - name: BENCHMARK RESULTS + run: asv compare --factor=1.1 --no-stats --split upstream/main HEAD diff --git a/.gitignore b/.gitignore index 9e95a8732..9add6d8c4 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ docs/build .idea/ *.gguf .venv +benchmarks/results diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json new file mode 100644 index 000000000..287bff98f --- /dev/null +++ b/benchmarks/asv.conf.json @@ -0,0 +1,20 @@ +{ + "version": 1, + "project": "Outlines", + "project_url": "https://outlines-dev.github.io/outlines/", + "repo": "..", + "branches": [ + "HEAD" + ], + "build_command": [ + "python -mpip install .[test]", + "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}", + ], + "environment_type": "virtualenv", + "show_commit_url": "https://github.com/lapp0/outlines/commit/", + "benchmark_dir": ".", + "env_dir": "env", + "results_dir": "results", + "html_dir": "html", + "build_cache_size": 8 +} diff --git a/tests/benchmark/test_benchmark_json_schema.py b/benchmarks/bench_json_schema.py similarity index 70% rename from tests/benchmark/test_benchmark_json_schema.py 
rename to benchmarks/bench_json_schema.py index 33f3f5b16..daa77510b 100644 --- a/tests/benchmark/test_benchmark_json_schema.py +++ b/benchmarks/bench_json_schema.py @@ -1,5 +1,3 @@ -import pytest - import outlines outlines.disable_cache() @@ -7,6 +5,12 @@ from outlines.fsm.guide import RegexGuide # noqa: E402 from outlines.fsm.json_schema import build_regex_from_schema # noqa: E402 +from .common import ( # noqa: E402 + clear_outlines_cache, + ensure_numba_compiled, + setup_tokenizer, +) + simple_schema = """{ "$defs": { "Armor": { @@ -63,30 +67,21 @@ "required": ["id", "work", "recording_artists"] }""" - schemas = dict(simple_schema=simple_schema, complex_schema=complex_schema) -@pytest.mark.parametrize("schema_name", schemas.keys()) -def test_benchmark_json_schema_to_regex(benchmark, ensure_numba_compiled, schema_name): - """Benchmark convert json schema to regex""" - schema = schemas[schema_name] - benchmark.pedantic( - build_regex_from_schema, - args=(schema,), - rounds=8, - ) +class JsonSchemaBenchmark: + params = schemas.keys() + + def setup(self, schema_name): + clear_outlines_cache() + self.tokenizer = setup_tokenizer() + self.schema = schemas[schema_name] + ensure_numba_compiled(self.tokenizer) + def time_json_schema_to_regex(self, schema_name): + build_regex_from_schema(self.schema) -@pytest.mark.parametrize("schema_name", schemas.keys()) -def test_benchmark_json_schema_to_fsm( - benchmark, tokenizer, ensure_numba_compiled, schema_name -): - """Benchmark compile json schema as FSM""" - schema = schemas[schema_name] - regex = build_regex_from_schema(schema) - benchmark.pedantic( - RegexGuide, - args=(regex, tokenizer), - rounds=8, - ) + def time_json_schema_to_fsm(self, schema_name): + regex = build_regex_from_schema(self.schema) + RegexGuide(regex, self.tokenizer) diff --git a/benchmarks/bench_numba_compile.py b/benchmarks/bench_numba_compile.py new file mode 100644 index 000000000..c0e9d87c4 --- /dev/null +++ b/benchmarks/bench_numba_compile.py @@ -0,0 
+1,37 @@ +import importlib + +import interegular +import numba + +import outlines + +from .common import clear_outlines_cache, setup_tokenizer + +outlines.disable_cache() + + +class NumbaCompileBenchmark: + def setup(self): + clear_outlines_cache() + from outlines.fsm import regex + + self.tokenizer = setup_tokenizer() + self.regex = regex + original_njit = numba.njit + + def mock_njit(*args, **kwargs): + kwargs["cache"] = False + return original_njit(*args, **kwargs) + + self.original_njit = original_njit + numba.njit = mock_njit + importlib.reload(self.regex) + self.regex_pattern, _ = self.regex.make_deterministic_fsm( + interegular.parse_pattern("a").to_fsm().reduce() + ) + + def teardown(self): + numba.njit = self.original_njit + + def time_compile_numba(self): + self.regex.create_fsm_index_tokenizer(self.regex_pattern, self.tokenizer) diff --git a/tests/benchmark/test_benchmark_regex_fsm.py b/benchmarks/bench_regex_guide.py similarity index 68% rename from tests/benchmark/test_benchmark_regex_fsm.py rename to benchmarks/bench_regex_guide.py index e9e45052a..efaea9e1f 100644 --- a/tests/benchmark/test_benchmark_regex_fsm.py +++ b/benchmarks/bench_regex_guide.py @@ -1,7 +1,7 @@ -import pytest - import outlines +from .common import clear_outlines_cache, ensure_numba_compiled, setup_tokenizer + outlines.disable_cache() from outlines.fsm.guide import RegexGuide # noqa: E402 @@ -19,14 +19,27 @@ } -@pytest.mark.parametrize("regex_name", regex_samples.keys()) -def test_benchmark_regex_to_fsm( - benchmark, tokenizer, ensure_numba_compiled, regex_name -): - """Benchmark converting regex to FSM""" - regex_str = regex_samples[regex_name] - benchmark.pedantic( - RegexGuide, - args=(regex_str, tokenizer), - rounds=8, - ) +class RegexGuideBenchmark: + params = regex_samples.keys() + + def setup(self, pattern_name): + clear_outlines_cache() + self.tokenizer = setup_tokenizer() + ensure_numba_compiled(self.tokenizer) + self.pattern = regex_samples[pattern_name] + + def 
time_regex_to_guide(self, pattern_name):
+        RegexGuide(self.pattern, self.tokenizer)
+
+
+class MemoryRegexGuideBenchmark:
+    params = ["simple_phone", "complex_span_constrained_relation_extraction"]
+
+    def setup(self, pattern_name):
+        clear_outlines_cache()
+        self.tokenizer = setup_tokenizer()
+        ensure_numba_compiled(self.tokenizer)
+        self.pattern = regex_samples[pattern_name]
+
+    def peakmem_regex_to_guide(self, pattern_name):
+        RegexGuide(self.pattern, self.tokenizer)
diff --git a/tests/benchmark/conftest.py b/benchmarks/common.py
similarity index 74%
rename from tests/benchmark/conftest.py
rename to benchmarks/common.py
index 902d5d6eb..e0fe36f14 100644
--- a/tests/benchmark/conftest.py
+++ b/benchmarks/common.py
@@ -1,17 +1,19 @@
-import pytest
 from transformers import AutoTokenizer
 
+import outlines.caching
 from outlines.fsm.guide import RegexGuide
 from outlines.models.transformers import TransformerTokenizer
 
 
-@pytest.fixture
-def tokenizer():
+def clear_outlines_cache():
+    outlines.caching.clear_cache()
+
+
+def setup_tokenizer():
     tokenizer = AutoTokenizer.from_pretrained("gpt2")
     return TransformerTokenizer(tokenizer)
 
 
-@pytest.fixture
 def ensure_numba_compiled(tokenizer):
     RegexGuide("a", tokenizer)
     return True
diff --git a/docs/community/contribute.md b/docs/community/contribute.md
index fb67576e4..b336eacad 100644
--- a/docs/community/contribute.md
+++ b/docs/community/contribute.md
@@ -57,12 +57,38 @@ And run the code style checks:
 pre-commit run --all-files
 ```
 
-When modifying the code related to the index compilation, we kindly ask you to
-post benchmarks before and after your changes. You can run benchmarks using:
+### Benchmarking
 
-```python
-pytest --benchmark-only
+Outlines uses [asv](https://asv.readthedocs.io) for automated benchmark testing. Benchmarks are run automatically before pull requests are merged to prevent performance degradation. 
+
+You can run the benchmark test suite locally with the following command:
+```
+asv run --config benchmarks/asv.conf.json
+```
+
+Run a specific test:
+```
+asv run --config benchmarks/asv.conf.json -b bench_json_schema.JsonSchemaBenchmark.time_json_schema_to_fsm
+```
+
+Profile a specific test: ```
+asv run --config benchmarks/asv.conf.json --profile -b bench_json_schema.JsonSchemaBenchmark.time_json_schema_to_fsm
+```
+
+Compare to `origin/main`
+```
+git fetch origin
+asv continuous origin/main HEAD --config benchmarks/asv.conf.json
+```
+
+#### ASV PR Behavior
+
+- **View ASV Benchmark Results:** Open the workflow, view `BENCHMARK RESULTS` section.
+- Merging is blocked unless benchmarks are run for the latest commit.
+- Benchmarks fail if performance degrades by more than 10% for any individual benchmark.
+- The "Benchmark PR" workflow runs when it's manually dispatched, or if the `run_benchmarks` label is added to the PR, in which case they run for every commit.
+
 ### Contribute to the documentation
diff --git a/tests/benchmark/test_benchmark_numba_compile.py b/tests/benchmark/test_benchmark_numba_compile.py
deleted file mode 100644
index 827d561bd..000000000
--- a/tests/benchmark/test_benchmark_numba_compile.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import importlib
-
-import interegular
-import numba
-
-import outlines
-
-outlines.disable_cache()
-
-
-def test_benchmark_compile_numba(benchmark, tokenizer, mocker):
-    """Compile a basic regex to benchmark the numba compilation time"""
-
-    def setup():
-        from outlines.fsm import regex
-
-        original_njit = numba.njit
-
-        def mock_njit(*args, **kwargs):
-            kwargs["cache"] = False
-            return original_njit(*args, **kwargs)
-
-        mocker.patch("numba.njit", new=mock_njit)
-        importlib.reload(regex)
-
-        regex_pattern, _ = regex.make_deterministic_fsm(
-            interegular.parse_pattern("a").to_fsm().reduce()
-        )
-        return (regex, regex_pattern, tokenizer), {}
-
-    benchmark.pedantic(
-        lambda r, *args: r.create_fsm_index_tokenizer(*args), rounds=2, 
setup=setup - ) From 02d8a845715519434812285301ae28afb21c1adf Mon Sep 17 00:00:00 2001 From: Andrew Lapp Date: Fri, 31 May 2024 19:41:25 -0500 Subject: [PATCH 2/4] ensure workflow fails if benchmark degredation >10% --- .github/workflows/asv_benchmark_pr.yml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/asv_benchmark_pr.yml b/.github/workflows/asv_benchmark_pr.yml index 09786b72c..90fb47423 100644 --- a/.github/workflows/asv_benchmark_pr.yml +++ b/.github/workflows/asv_benchmark_pr.yml @@ -49,4 +49,9 @@ jobs: --no-stats --interleave-rounds -a repeat=3 || true - name: BENCHMARK RESULTS - run: asv compare --factor=1.1 --no-stats --split upstream/main HEAD + run: | + asv compare --factor=1.1 --no-stats --split upstream/main HEAD | tee ${{ env.BENCHMARKS_OUTPUT }} + if grep -q "Benchmarks that have got worse" "${{ env.BENCHMARKS_OUTPUT }}"; then + echo "Performance degradation detected!" + exit 1 + fi From d59a2397d8682e4eee8297f4e3c0175600e5489c Mon Sep 17 00:00:00 2001 From: Andrew Lapp Date: Tue, 4 Jun 2024 01:32:32 -0500 Subject: [PATCH 3/4] disable outlines cache localized to the benchmarks scope --- benchmarks/bench_json_schema.py | 18 ++++++------------ benchmarks/bench_numba_compile.py | 11 ++++------- benchmarks/bench_regex_guide.py | 13 +++++-------- benchmarks/common.py | 5 ----- outlines/caching.py | 13 +++++++++++++ tests/test_cache.py | 31 +++++++++++++++++++++++++++++++ 6 files changed, 59 insertions(+), 32 deletions(-) diff --git a/benchmarks/bench_json_schema.py b/benchmarks/bench_json_schema.py index daa77510b..8d1ceeb24 100644 --- a/benchmarks/bench_json_schema.py +++ b/benchmarks/bench_json_schema.py @@ -1,15 +1,8 @@ -import outlines +from outlines.caching import cache_disabled +from outlines.fsm.guide import RegexGuide +from outlines.fsm.json_schema import build_regex_from_schema -outlines.disable_cache() - -from outlines.fsm.guide import RegexGuide # noqa: E402 -from outlines.fsm.json_schema import 
build_regex_from_schema # noqa: E402 - -from .common import ( # noqa: E402 - clear_outlines_cache, - ensure_numba_compiled, - setup_tokenizer, -) +from .common import ensure_numba_compiled, setup_tokenizer # noqa: E402 simple_schema = """{ "$defs": { @@ -74,14 +67,15 @@ class JsonSchemaBenchmark: params = schemas.keys() def setup(self, schema_name): - clear_outlines_cache() self.tokenizer = setup_tokenizer() self.schema = schemas[schema_name] ensure_numba_compiled(self.tokenizer) + @cache_disabled() def time_json_schema_to_regex(self, schema_name): build_regex_from_schema(self.schema) + @cache_disabled() def time_json_schema_to_fsm(self, schema_name): regex = build_regex_from_schema(self.schema) RegexGuide(regex, self.tokenizer) diff --git a/benchmarks/bench_numba_compile.py b/benchmarks/bench_numba_compile.py index c0e9d87c4..2713707e5 100644 --- a/benchmarks/bench_numba_compile.py +++ b/benchmarks/bench_numba_compile.py @@ -3,18 +3,14 @@ import interegular import numba -import outlines +from outlines.caching import cache_disabled +from outlines.fsm import regex -from .common import clear_outlines_cache, setup_tokenizer - -outlines.disable_cache() +from .common import setup_tokenizer class NumbaCompileBenchmark: def setup(self): - clear_outlines_cache() - from outlines.fsm import regex - self.tokenizer = setup_tokenizer() self.regex = regex original_njit = numba.njit @@ -33,5 +29,6 @@ def mock_njit(*args, **kwargs): def teardown(self): numba.njit = self.original_njit + @cache_disabled() def time_compile_numba(self): self.regex.create_fsm_index_tokenizer(self.regex_pattern, self.tokenizer) diff --git a/benchmarks/bench_regex_guide.py b/benchmarks/bench_regex_guide.py index efaea9e1f..099f94df2 100644 --- a/benchmarks/bench_regex_guide.py +++ b/benchmarks/bench_regex_guide.py @@ -1,10 +1,7 @@ -import outlines +from outlines.caching import cache_disabled +from outlines.fsm.guide import RegexGuide -from .common import clear_outlines_cache, ensure_numba_compiled, 
setup_tokenizer - -outlines.disable_cache() - -from outlines.fsm.guide import RegexGuide # noqa: E402 +from .common import ensure_numba_compiled, setup_tokenizer regex_samples = { "email": r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?", @@ -23,11 +20,11 @@ class RegexGuideBenchmark: params = regex_samples.keys() def setup(self, pattern_name): - clear_outlines_cache() self.tokenizer = setup_tokenizer() ensure_numba_compiled(self.tokenizer) self.pattern = regex_samples[pattern_name] + @cache_disabled() def time_regex_to_guide(self, pattern_name): RegexGuide(self.pattern, self.tokenizer) @@ -36,10 +33,10 @@ class MemoryRegexGuideBenchmark: params = ["simple_phone", "complex_span_constrained_relation_extraction"] def setup(self, pattern_name): - clear_outlines_cache() self.tokenizer = setup_tokenizer() ensure_numba_compiled(self.tokenizer) self.pattern = regex_samples[pattern_name] + @cache_disabled() def peakmem_regex_to_guide(self, pattern_name): RegexGuide(self.pattern, self.tokenizer) diff --git a/benchmarks/common.py b/benchmarks/common.py index e0fe36f14..7d999ea9b 100644 --- a/benchmarks/common.py +++ b/benchmarks/common.py @@ -1,14 +1,9 @@ from transformers import AutoTokenizer -import outlines.caching from outlines.fsm.guide import RegexGuide from outlines.models.transformers import TransformerTokenizer -def clear_outlines_cache(): - outlines.caching.clear_cache() - - def setup_tokenizer(): tokenizer = AutoTokenizer.from_pretrained("gpt2") return TransformerTokenizer(tokenizer) diff --git a/outlines/caching.py b/outlines/caching.py index 52d66af74..95392c7e8 100644 --- a/outlines/caching.py +++ b/outlines/caching.py @@ -1,4 +1,5 @@ import asyncio +import contextlib import functools import os from typing import Callable, Optional @@ -164,3 +165,15 @@ def clear_cache(): """Erase the cache completely.""" memory = get_cache() memory.clear() + + +@contextlib.contextmanager 
+def cache_disabled(): + # outlines.caching._caching_enabled + global _caching_enabled + original_state = _caching_enabled + _caching_enabled = False + try: + yield + finally: + _caching_enabled = original_state diff --git a/tests/test_cache.py b/tests/test_cache.py index 5a2de778e..eb4ec406e 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -1,5 +1,6 @@ import os import tempfile +import unittest import diskcache import pytest @@ -157,3 +158,33 @@ def foo(): # assert with version upgrade, old cache is invalidated and new cache is used a, b = foo() + + +def test_cache_disabled_decorator(test_cache): + """Ensure cache can be disabled in a local scope""" + + from outlines.caching import cache_disabled + + mock = unittest.mock.MagicMock() + + @test_cache + def fn(): + mock() + return 1 + + # first call isn't cached + fn() + assert mock.call_count == 1 + + # second call doesn't run fn, uses cache + fn() + assert mock.call_count == 1 + + # cache_disabled decorator disables cache within scope + with cache_disabled(): + fn() + assert mock.call_count == 2 # called once in cache_disabled scope + + # scope has exited, cache is enabled again + fn() + assert mock.call_count == 2 From 0ea382d28ded1cce90f9e037de5f80ac940c0fe7 Mon Sep 17 00:00:00 2001 From: Andrew Lapp Date: Tue, 4 Jun 2024 01:33:09 -0500 Subject: [PATCH 4/4] use outlines-dev/outlines for asv.conf.json show_commit_url --- benchmarks/asv.conf.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json index 287bff98f..f57db9a0b 100644 --- a/benchmarks/asv.conf.json +++ b/benchmarks/asv.conf.json @@ -11,7 +11,7 @@ "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}", ], "environment_type": "virtualenv", - "show_commit_url": "https://github.com/lapp0/outlines/commit/", + "show_commit_url": "https://github.com/outlines-dev/outlines/commit/", "benchmark_dir": ".", "env_dir": "env", 
"results_dir": "results",