diff --git a/.github/workflows/asv_benchmark_pr.yml b/.github/workflows/asv_benchmark_pr.yml
new file mode 100644
index 000000000..09786b72c
--- /dev/null
+++ b/.github/workflows/asv_benchmark_pr.yml
@@ -0,0 +1,52 @@
+name: Benchmark PR
+
+on:
+  pull_request:
+    branches: [main]
+  workflow_dispatch:
+env:
+  PYTHON_VERSION: "3.10"
+  WORKING_DIR: ${{ github.workspace }}/benchmarks
+  BENCHMARKS_OUTPUT: ${{ github.workspace }}/benchmarks_output
+
+jobs:
+  benchmark-pr:
+    runs-on: ubuntu-latest
+    if: contains(github.event.pull_request.labels.*.name, 'run_benchmarks') || github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_run'
+
+    defaults:
+      run:
+        working-directory: ${{ env.WORKING_DIR }}
+
+    steps:
+
+      - name: Checkout repository
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ env.PYTHON_VERSION }}
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install asv virtualenv lf-asv-formatter
+
+      - name: Create ASV machine config file
+        run: asv machine --machine gh-runner --yes
+
+      - name: Run Benchmarks - `PR HEAD` vs `main`
+        run: |
+          # prepare main branch for comparison
+          git remote add upstream https://github.com/${{ github.repository }}.git
+          git fetch upstream main
+
+          # Run benchmarks, allow errors, they will be caught in the next step
+          asv continuous upstream/main HEAD \
+            --no-stats --interleave-rounds -a repeat=3 || true
+
+      - name: BENCHMARK RESULTS
+        run: asv compare --factor=1.1 --no-stats --split upstream/main HEAD
diff --git a/.gitignore b/.gitignore
index 9e95a8732..9add6d8c4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@ docs/build
 .idea/
 *.gguf
 .venv
+benchmarks/results
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json
new file mode 100644
index 000000000..287bff98f
--- /dev/null
+++ b/benchmarks/asv.conf.json
@@ -0,0 +1,20 @@
+{
+    "version": 1,
+    "project": "Outlines",
+    "project_url": "https://outlines-dev.github.io/outlines/",
+    "repo": "..",
+    "branches": [
+        "HEAD"
+    ],
+    "build_command": [
+        "python -mpip install .[test]",
+        "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}",
+    ],
+    "environment_type": "virtualenv",
+    "show_commit_url": "https://github.com/lapp0/outlines/commit/",
+    "benchmark_dir": ".",
+    "env_dir": "env",
+    "results_dir": "results",
+    "html_dir": "html",
+    "build_cache_size": 8
+}
diff --git a/tests/benchmark/test_benchmark_json_schema.py b/benchmarks/bench_json_schema.py
similarity index 70%
rename from tests/benchmark/test_benchmark_json_schema.py
rename to benchmarks/bench_json_schema.py
index 33f3f5b16..daa77510b 100644
--- a/tests/benchmark/test_benchmark_json_schema.py
+++ b/benchmarks/bench_json_schema.py
@@ -1,5 +1,3 @@
-import pytest
-
 import outlines
 
 outlines.disable_cache()
@@ -7,6 +5,12 @@
 from outlines.fsm.guide import RegexGuide  # noqa: E402
 from outlines.fsm.json_schema import build_regex_from_schema  # noqa: E402
 
+from .common import (  # noqa: E402
+    clear_outlines_cache,
+    ensure_numba_compiled,
+    setup_tokenizer,
+)
+
 simple_schema = """{
     "$defs": {
         "Armor": {
@@ -63,30 +67,21 @@
     "required": ["id", "work", "recording_artists"]
 }"""
 
-
 schemas = dict(simple_schema=simple_schema, complex_schema=complex_schema)
 
 
-@pytest.mark.parametrize("schema_name", schemas.keys())
-def test_benchmark_json_schema_to_regex(benchmark, ensure_numba_compiled, schema_name):
-    """Benchmark convert json schema to regex"""
-    schema = schemas[schema_name]
-    benchmark.pedantic(
-        build_regex_from_schema,
-        args=(schema,),
-        rounds=8,
-    )
+class JsonSchemaBenchmark:
+    params = schemas.keys()
+
+    def setup(self, schema_name):
+        clear_outlines_cache()
+        self.tokenizer = setup_tokenizer()
+        self.schema = schemas[schema_name]
+        ensure_numba_compiled(self.tokenizer)
 
+    def time_json_schema_to_regex(self, schema_name):
+        build_regex_from_schema(self.schema)
 
-@pytest.mark.parametrize("schema_name", schemas.keys())
-def test_benchmark_json_schema_to_fsm(
-    benchmark, tokenizer, ensure_numba_compiled, schema_name
-):
-    """Benchmark compile json schema as FSM"""
-    schema = schemas[schema_name]
-    regex = build_regex_from_schema(schema)
-    benchmark.pedantic(
-        RegexGuide,
-        args=(regex, tokenizer),
-        rounds=8,
-    )
+    def time_json_schema_to_fsm(self, schema_name):
+        regex = build_regex_from_schema(self.schema)
+        RegexGuide(regex, self.tokenizer)
diff --git a/benchmarks/bench_numba_compile.py b/benchmarks/bench_numba_compile.py
new file mode 100644
index 000000000..c0e9d87c4
--- /dev/null
+++ b/benchmarks/bench_numba_compile.py
@@ -0,0 +1,37 @@
+import importlib
+
+import interegular
+import numba
+
+import outlines
+
+from .common import clear_outlines_cache, setup_tokenizer
+
+outlines.disable_cache()
+
+
+class NumbaCompileBenchmark:
+    def setup(self):
+        clear_outlines_cache()
+        from outlines.fsm import regex
+
+        self.tokenizer = setup_tokenizer()
+        self.regex = regex
+        original_njit = numba.njit
+
+        def mock_njit(*args, **kwargs):
+            kwargs["cache"] = False
+            return original_njit(*args, **kwargs)
+
+        self.original_njit = original_njit
+        numba.njit = mock_njit
+        importlib.reload(self.regex)
+        self.regex_pattern, _ = self.regex.make_deterministic_fsm(
+            interegular.parse_pattern("a").to_fsm().reduce()
+        )
+
+    def teardown(self):
+        numba.njit = self.original_njit
+
+    def time_compile_numba(self):
+        self.regex.create_fsm_index_tokenizer(self.regex_pattern, self.tokenizer)
diff --git a/tests/benchmark/test_benchmark_regex_fsm.py b/benchmarks/bench_regex_guide.py
similarity index 68%
rename from tests/benchmark/test_benchmark_regex_fsm.py
rename to benchmarks/bench_regex_guide.py
index e9e45052a..efaea9e1f 100644
--- a/tests/benchmark/test_benchmark_regex_fsm.py
+++ b/benchmarks/bench_regex_guide.py
@@ -1,7 +1,7 @@
-import pytest
-
 import outlines
 
+from .common import clear_outlines_cache, ensure_numba_compiled, setup_tokenizer
+
 outlines.disable_cache()
 
 from outlines.fsm.guide import RegexGuide  # noqa: E402
@@ -19,14 +19,27 @@
 }
 
 
-@pytest.mark.parametrize("regex_name", regex_samples.keys())
-def test_benchmark_regex_to_fsm(
-    benchmark, tokenizer, ensure_numba_compiled, regex_name
-):
-    """Benchmark converting regex to FSM"""
-    regex_str = regex_samples[regex_name]
-    benchmark.pedantic(
-        RegexGuide,
-        args=(regex_str, tokenizer),
-        rounds=8,
-    )
+class RegexGuideBenchmark:
+    params = regex_samples.keys()
+
+    def setup(self, pattern_name):
+        clear_outlines_cache()
+        self.tokenizer = setup_tokenizer()
+        ensure_numba_compiled(self.tokenizer)
+        self.pattern = regex_samples[pattern_name]
+
+    def time_regex_to_guide(self, pattern_name):
+        RegexGuide(self.pattern, self.tokenizer)
+
+
+class MemoryRegexGuideBenchmark:
+    params = ["simple_phone", "complex_span_constrained_relation_extraction"]
+
+    def setup(self, pattern_name):
+        clear_outlines_cache()
+        self.tokenizer = setup_tokenizer()
+        ensure_numba_compiled(self.tokenizer)
+        self.pattern = regex_samples[pattern_name]
+
+    def peakmem_regex_to_guide(self, pattern_name):
+        RegexGuide(self.pattern, self.tokenizer)
diff --git a/tests/benchmark/conftest.py b/benchmarks/common.py
similarity index 74%
rename from tests/benchmark/conftest.py
rename to benchmarks/common.py
index 902d5d6eb..e0fe36f14 100644
--- a/tests/benchmark/conftest.py
+++ b/benchmarks/common.py
@@ -1,17 +1,19 @@
-import pytest
 from transformers import AutoTokenizer
 
+import outlines.caching
 from outlines.fsm.guide import RegexGuide
 from outlines.models.transformers import TransformerTokenizer
 
 
-@pytest.fixture
-def tokenizer():
+def clear_outlines_cache():
+    outlines.caching.clear_cache()
+
+
+def setup_tokenizer():
     tokenizer = AutoTokenizer.from_pretrained("gpt2")
     return TransformerTokenizer(tokenizer)
 
 
-@pytest.fixture
 def ensure_numba_compiled(tokenizer):
     RegexGuide("a", tokenizer)
     return True
diff --git a/docs/community/contribute.md b/docs/community/contribute.md
index fb67576e4..b336eacad 100644
--- a/docs/community/contribute.md
+++ b/docs/community/contribute.md
@@ -57,12 +57,38 @@ And run the code style checks:
 pre-commit run --all-files
 ```
 
-When modifying the code related to the index compilation, we kindly ask you to
-post benchmarks before and after your changes. You can run benchmarks using:
+### Benchmarking
 
-```python
-pytest --benchmark-only
+Outlines uses [asv](https://asv.readthedocs.io) for automated benchmark testing. Benchmarks are run automatically before pull requests are merged to prevent performance degradation.
+
+You can run the benchmark test suite locally with the following command:
+```
+asv run --config benchmarks/asv.conf.json
+```
+
+Run a specific benchmark:
+```
+asv run --config benchmarks/asv.conf.json -b bench_json_schema.JsonSchemaBenchmark.time_json_schema_to_fsm
+```
+
+Profile a specific benchmark:
 ```
+asv run --config benchmarks/asv.conf.json --profile -b bench_json_schema.JsonSchemaBenchmark.time_json_schema_to_fsm
+```
+
+Compare against `origin/main`:
+```
+git fetch origin
+asv continuous origin/main HEAD --config benchmarks/asv.conf.json
+```
+
+#### ASV PR Behavior
+
+- **View ASV Benchmark Results:** Open the "Benchmark PR" workflow run and view the `BENCHMARK RESULTS` step.
+- Merging is blocked unless benchmarks have been run for the latest commit.
+- The benchmark run fails if any individual benchmark degrades by more than 10%.
+- The "Benchmark PR" workflow runs when it is manually dispatched; once the `run_benchmarks` label is added to a PR, it runs for every commit.
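+
+#### Writing benchmarks
+
+Benchmark files live in `benchmarks/` and follow asv's conventions: `setup` runs before
+each measurement, `params` parameterizes a benchmark, and methods prefixed with `time_`
+or `peakmem_` measure run time and peak memory respectively. Below is a minimal sketch
+of a new benchmark; the class and method names are illustrative only:
+
+```python
+from outlines.fsm.guide import RegexGuide
+
+from .common import clear_outlines_cache, ensure_numba_compiled, setup_tokenizer
+
+
+class ExampleRegexBenchmark:
+    # Each method below is run separately for every entry in `params`.
+    params = ["[0-9]{4}", "[a-z]{10}"]
+
+    def setup(self, pattern):
+        # Runs before every measurement and is excluded from the result.
+        clear_outlines_cache()
+        self.tokenizer = setup_tokenizer()
+        ensure_numba_compiled(self.tokenizer)
+
+    def time_build_guide(self, pattern):
+        RegexGuide(pattern, self.tokenizer)
+
+    def peakmem_build_guide(self, pattern):
+        RegexGuide(pattern, self.tokenizer)
+```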
+
 ### Contribute to the documentation
diff --git a/tests/benchmark/test_benchmark_numba_compile.py b/tests/benchmark/test_benchmark_numba_compile.py
deleted file mode 100644
index 827d561bd..000000000
--- a/tests/benchmark/test_benchmark_numba_compile.py
+++ /dev/null
@@ -1,33 +0,0 @@
-import importlib
-
-import interegular
-import numba
-
-import outlines
-
-outlines.disable_cache()
-
-
-def test_benchmark_compile_numba(benchmark, tokenizer, mocker):
-    """Compile a basic regex to benchmark the numba compilation time"""
-
-    def setup():
-        from outlines.fsm import regex
-
-        original_njit = numba.njit
-
-        def mock_njit(*args, **kwargs):
-            kwargs["cache"] = False
-            return original_njit(*args, **kwargs)
-
-        mocker.patch("numba.njit", new=mock_njit)
-        importlib.reload(regex)
-
-        regex_pattern, _ = regex.make_deterministic_fsm(
-            interegular.parse_pattern("a").to_fsm().reduce()
-        )
-        return (regex, regex_pattern, tokenizer), {}
-
-    benchmark.pedantic(
-        lambda r, *args: r.create_fsm_index_tokenizer(*args), rounds=2, setup=setup
-    )