diff --git a/.github/workflows/asv_benchmark_pr.yml b/.github/workflows/asv_benchmark_pr.yml new file mode 100644 index 000000000..90fb47423 --- /dev/null +++ b/.github/workflows/asv_benchmark_pr.yml @@ -0,0 +1,57 @@ +name: Benchmark PR + +on: + pull_request: + branches: [main] + workflow_dispatch: +env: + PYTHON_VERSION: "3.10" + WORKING_DIR: ${{ github.workspace }}/benchmarks + BENCHMARKS_OUTPUT: ${{ github.workspace }}/benchmarks_output + +jobs: + benchmark-pr: + runs-on: ubuntu-latest + if: contains(github.event.pull_request.labels.*.name, 'run_benchmarks') || github.event_name == 'workflow_dispatch' || github.event_name == 'workflow_run' + + defaults: + run: + working-directory: ${{ env.WORKING_DIR }} + + steps: + + - name: Checkout repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: ${{ env.PYTHON_VERSION }} + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install asv virtualenv lf-asv-formatter + + - name: Create ASV machine config file + run: asv machine --machine gh-runner --yes + + - name: Run Benchmarks - `PR HEAD` vs `main` + run: | + # prepare main branch for comparison + git remote add upstream https://github.com/${{ github.repository }}.git + git fetch upstream main + + # Run benchmarks, allow errors, they will be caught in the next step + asv continuous upstream/main HEAD \ + --no-stats --interleave-rounds -a repeat=3 || true + + - name: BENCHMARK RESULTS + run: | + asv compare --factor=1.1 --no-stats --split upstream/main HEAD | tee ${{ env.BENCHMARKS_OUTPUT }} + if grep -q "Benchmarks that have got worse" "${{ env.BENCHMARKS_OUTPUT }}"; then + echo "Performance degradation detected!" + exit 1 + fi diff --git a/.gitignore b/.gitignore index 9e95a8732..9add6d8c4 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ docs/build .idea/ *.gguf .venv +benchmarks/results diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/benchmarks/asv.conf.json b/benchmarks/asv.conf.json new file mode 100644 index 000000000..f57db9a0b --- /dev/null +++ b/benchmarks/asv.conf.json @@ -0,0 +1,20 @@ +{ + "version": 1, + "project": "Outlines", + "project_url": "https://outlines-dev.github.io/outlines/", + "repo": "..", + "branches": [ + "HEAD" + ], + "build_command": [ + "python -mpip install .[test]", + "PIP_NO_BUILD_ISOLATION=false python -mpip wheel --no-deps --no-index -w {build_cache_dir} {build_dir}", + ], + "environment_type": "virtualenv", + "show_commit_url": "https://github.com/outlines-dev/outlines/commit/", + "benchmark_dir": ".", + "env_dir": "env", + "results_dir": "results", + "html_dir": "html", + "build_cache_size": 8 +} diff --git a/tests/benchmark/test_benchmark_json_schema.py b/benchmarks/bench_json_schema.py similarity index 63% rename from tests/benchmark/test_benchmark_json_schema.py rename to benchmarks/bench_json_schema.py index 33f3f5b16..8d1ceeb24 100644 --- a/tests/benchmark/test_benchmark_json_schema.py +++ b/benchmarks/bench_json_schema.py @@ -1,11 +1,8 @@ -import pytest +from outlines.caching import cache_disabled +from outlines.fsm.guide import RegexGuide +from outlines.fsm.json_schema import build_regex_from_schema -import outlines - -outlines.disable_cache() - -from outlines.fsm.guide import RegexGuide # noqa: E402 -from outlines.fsm.json_schema import build_regex_from_schema # noqa: E402 +from .common import ensure_numba_compiled, setup_tokenizer # noqa: E402 simple_schema = """{ "$defs": { @@ -63,30 +60,22 @@ "required": ["id", "work", "recording_artists"] }""" - schemas = dict(simple_schema=simple_schema, complex_schema=complex_schema) -@pytest.mark.parametrize("schema_name", schemas.keys()) -def test_benchmark_json_schema_to_regex(benchmark, ensure_numba_compiled, schema_name): - """Benchmark convert json schema to regex""" - schema = schemas[schema_name] - benchmark.pedantic( - build_regex_from_schema, - args=(schema,), - rounds=8, - ) +class JsonSchemaBenchmark: + params = schemas.keys() + + def setup(self, schema_name): + self.tokenizer = setup_tokenizer() + self.schema = schemas[schema_name] + ensure_numba_compiled(self.tokenizer) + @cache_disabled() + def time_json_schema_to_regex(self, schema_name): + build_regex_from_schema(self.schema) -@pytest.mark.parametrize("schema_name", schemas.keys()) -def test_benchmark_json_schema_to_fsm( - benchmark, tokenizer, ensure_numba_compiled, schema_name -): - """Benchmark compile json schema as FSM""" - schema = schemas[schema_name] - regex = build_regex_from_schema(schema) - benchmark.pedantic( - RegexGuide, - args=(regex, tokenizer), - rounds=8, - ) + @cache_disabled() + def time_json_schema_to_fsm(self, schema_name): + regex = build_regex_from_schema(self.schema) + RegexGuide(regex, self.tokenizer) diff --git a/benchmarks/bench_numba_compile.py b/benchmarks/bench_numba_compile.py new file mode 100644 index 000000000..2713707e5 --- /dev/null +++ b/benchmarks/bench_numba_compile.py @@ -0,0 +1,34 @@ +import importlib + +import interegular +import numba + +from outlines.caching import cache_disabled +from outlines.fsm import regex + +from .common import setup_tokenizer + + +class NumbaCompileBenchmark: + def setup(self): + self.tokenizer = setup_tokenizer() + self.regex = regex + original_njit = numba.njit + + def mock_njit(*args, **kwargs): + kwargs["cache"] = False + return original_njit(*args, **kwargs) + + self.original_njit = original_njit + numba.njit = mock_njit + importlib.reload(self.regex) + self.regex_pattern, _ = self.regex.make_deterministic_fsm( + interegular.parse_pattern("a").to_fsm().reduce() + ) + + def teardown(self): + numba.njit = self.original_njit + + @cache_disabled() + def time_compile_numba(self): + self.regex.create_fsm_index_tokenizer(self.regex_pattern, self.tokenizer) diff --git a/tests/benchmark/test_benchmark_regex_fsm.py b/benchmarks/bench_regex_guide.py similarity index 66% rename from tests/benchmark/test_benchmark_regex_fsm.py rename to benchmarks/bench_regex_guide.py index e9e45052a..099f94df2 100644 --- a/tests/benchmark/test_benchmark_regex_fsm.py +++ b/benchmarks/bench_regex_guide.py @@ -1,10 +1,7 @@ -import pytest +from outlines.caching import cache_disabled +from outlines.fsm.guide import RegexGuide -import outlines - -outlines.disable_cache() - -from outlines.fsm.guide import RegexGuide # noqa: E402 +from .common import ensure_numba_compiled, setup_tokenizer regex_samples = { "email": r"[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?", @@ -19,14 +16,27 @@ } -@pytest.mark.parametrize("regex_name", regex_samples.keys()) -def test_benchmark_regex_to_fsm( - benchmark, tokenizer, ensure_numba_compiled, regex_name -): - """Benchmark converting regex to FSM""" - regex_str = regex_samples[regex_name] - benchmark.pedantic( - RegexGuide, - args=(regex_str, tokenizer), - rounds=8, - ) +class RegexGuideBenchmark: + params = regex_samples.keys() + + def setup(self, pattern_name): + self.tokenizer = setup_tokenizer() + ensure_numba_compiled(self.tokenizer) + self.pattern = regex_samples[pattern_name] + + @cache_disabled() + def time_regex_to_guide(self, pattern_name): + RegexGuide(self.pattern, self.tokenizer) + + +class MemoryRegexGuideBenchmark: + params = ["simple_phone", "complex_span_constrained_relation_extraction"] + + def setup(self, pattern_name): + self.tokenizer = setup_tokenizer() + ensure_numba_compiled(self.tokenizer) + self.pattern = regex_samples[pattern_name] + + @cache_disabled() + def peakmem_regex_to_guide(self, pattern_name): + RegexGuide(self.pattern, self.tokenizer) diff --git a/tests/benchmark/conftest.py b/benchmarks/common.py similarity index 83% rename from tests/benchmark/conftest.py rename to benchmarks/common.py index 902d5d6eb..7d999ea9b 100644 --- a/tests/benchmark/conftest.py +++ b/benchmarks/common.py @@ -1,17 +1,14 @@ -import pytest from transformers import AutoTokenizer from outlines.fsm.guide import RegexGuide from outlines.models.transformers import TransformerTokenizer -@pytest.fixture -def tokenizer(): +def setup_tokenizer(): tokenizer = AutoTokenizer.from_pretrained("gpt2") return TransformerTokenizer(tokenizer) -@pytest.fixture def ensure_numba_compiled(tokenizer): RegexGuide("a", tokenizer) return True diff --git a/docs/community/contribute.md b/docs/community/contribute.md index fb67576e4..b336eacad 100644 --- a/docs/community/contribute.md +++ b/docs/community/contribute.md @@ -57,12 +57,38 @@ And run the code style checks: pre-commit run --all-files ``` -When modifying the code related to the index compilation, we kindly ask you to -post benchmarks before and after your changes. You can run benchmarks using: +### Benchmarking -```python -pytest --benchmark-only +Outlines uses [asv](https://asv.readthedocs.io) for automated benchmark testing. Benchmarks are run automatically before pull requests are merged to prevent performance degredation. + +You can run the benchmark test suite locally with the following command: +``` +asv run --config benchmarks/asv.conf.json +``` + +Run a specific test: +``` +asv run --config benchmarks/asv.conf.json -b bench_json_schema.JsonSchemaBenchmark.time_json_schema_to_fsm +``` + +Profile a specific test: ``` +asv run --config benchmarks/asv.conf.json --profile -b bench_json_schema.JsonSchemaBenchmark.time_json_schema_to_fsm +``` + +Compare to `origin/main` +``` +get fetch origin +asv continuous origin/main HEAD --config benchmarks/asv.conf.json +``` + +#### ASV PR Behavior + +- **View ASV Benchmark Results:** Open the workflow, view `BENCHMARK RESULTS` section. +- Merging is blocked unless benchmarks are run for the latest commit. +- Benchmarks fail if performance degrades by more than 10% for any individual benchmark. +- The "Benchmark PR" workflow runs when its manually dispatched, or if the `run_benchmarks` label is added to the PR they run for every commit. + ### Contribute to the documentation diff --git a/outlines/caching.py b/outlines/caching.py index 52d66af74..95392c7e8 100644 --- a/outlines/caching.py +++ b/outlines/caching.py @@ -1,4 +1,5 @@ import asyncio +import contextlib import functools import os from typing import Callable, Optional @@ -164,3 +165,15 @@ def clear_cache(): """Erase the cache completely.""" memory = get_cache() memory.clear() + + +@contextlib.contextmanager +def cache_disabled(): + # outlines.caching._caching_enabled + global _caching_enabled + original_state = _caching_enabled + _caching_enabled = False + try: + yield + finally: + _caching_enabled = original_state diff --git a/tests/benchmark/test_benchmark_numba_compile.py b/tests/benchmark/test_benchmark_numba_compile.py deleted file mode 100644 index 827d561bd..000000000 --- a/tests/benchmark/test_benchmark_numba_compile.py +++ /dev/null @@ -1,33 +0,0 @@ -import importlib - -import interegular -import numba - -import outlines - -outlines.disable_cache() - - -def test_benchmark_compile_numba(benchmark, tokenizer, mocker): - """Compile a basic regex to benchmark the numba compilation time""" - - def setup(): - from outlines.fsm import regex - - original_njit = numba.njit - - def mock_njit(*args, **kwargs): - kwargs["cache"] = False - return original_njit(*args, **kwargs) - - mocker.patch("numba.njit", new=mock_njit) - importlib.reload(regex) - - regex_pattern, _ = regex.make_deterministic_fsm( - interegular.parse_pattern("a").to_fsm().reduce() - ) - return (regex, regex_pattern, tokenizer), {} - - benchmark.pedantic( - lambda r, *args: r.create_fsm_index_tokenizer(*args), rounds=2, setup=setup - ) diff --git a/tests/test_cache.py b/tests/test_cache.py index 5a2de778e..eb4ec406e 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -1,5 +1,6 @@ import os import tempfile +import unittest import diskcache import pytest @@ -157,3 +158,33 @@ def foo(): # assert with version upgrade, old cache is invalidated and new cache is used a, b = foo() + + +def test_cache_disabled_decorator(test_cache): + """Ensure cache can be disabled in a local scope""" + + from outlines.caching import cache_disabled + + mock = unittest.mock.MagicMock() + + @test_cache + def fn(): + mock() + return 1 + + # first call isn't cached + fn() + assert mock.call_count == 1 + + # second call doesn't run fn, uses cache + fn() + assert mock.call_count == 1 + + # cache_disabled decorator disables cache within scope + with cache_disabled(): + fn() + assert mock.call_count == 2 # called once in cache_disabled scope + + # scope has exited, cache is enabled again + fn() + assert mock.call_count == 2