Merge pull request #3962 from tybug/shrinker-ir
Migrate most shrinker functions to the ir
Zac-HD authored May 29, 2024
2 parents a30c0ef + 3814b54 commit 822e39d
Showing 34 changed files with 970 additions and 1,325 deletions.
7 changes: 7 additions & 0 deletions hypothesis-python/RELEASE.rst
@@ -0,0 +1,7 @@
RELEASE_TYPE: minor

This release migrates the shrinker to our new internal representation, called the IR layer (:pull:`3962`). This improves the shrinker's performance in the majority of cases. For example, on the Hypothesis test suite, shrinking is a median of 1.38x faster.

It is possible that this release regresses performance when shrinking certain strategies. If you encounter strategies which reliably shrink more slowly than they used to (or shrink slowly at all), please open an issue!

You can read more about the IR layer at :issue:`3921`.
14 changes: 14 additions & 0 deletions hypothesis-python/benchmark/README.md
@@ -0,0 +1,14 @@
This directory contains code for benchmarking Hypothesis' shrinking. This was written for [pull/3962](https://github.com/HypothesisWorks/hypothesis/pull/3962) and is a manual process at the moment, though we may eventually integrate it more closely with CI for automated benchmarking.

To run a benchmark:

* Add the contents of `conftest.py` to the bottom of `hypothesis-python/tests/conftest.py`
* In `hypothesis-python/tests/common/debug.py`, change `derandomize=True` to `derandomize=False` (if you are running more than one trial)
* Run the tests: `pytest hypothesis-python/tests/`
* Note that the benchmarking script does not currently support xdist, so do not use `-n 8` or similar.

When pytest finishes, the output will contain a dictionary of the benchmarking results. Add that as a new entry in `data.json`. Repeat for however many trials you want; n=5 seems reasonable.

Also repeat for both your baseline ("old") and your comparison ("new") code.

Then run `python graph.py` to generate a graph comparing the old and new results.
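
As a sketch of the bookkeeping step above, the snippet below appends one pasted results dictionary to `data.json`. The `merge_trial` helper is hypothetical and not part of the benchmark scripts, but the `{"old": [...], "new": [...]}` layout matches the `data.json` added in this PR.

```python
# Hypothetical helper for recording one benchmark trial; not part of the
# benchmark scripts in this directory.
import json
from pathlib import Path


def merge_trial(results: dict, kind: str, path: str = "data.json") -> None:
    """Append one trial (test name -> list of shrink call counts) under "old" or "new"."""
    assert kind in ("old", "new")
    data_path = Path(path)
    data = json.loads(data_path.read_text())
    data[kind].append(results)
    data_path.write_text(json.dumps(data, indent=2))
```

Usage: paste the dictionary printed at the end of the pytest run as `results`, then call `merge_trial(results, "old")` or `merge_trial(results, "new")` depending on which code you just benchmarked.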
66 changes: 66 additions & 0 deletions hypothesis-python/benchmark/conftest.py
@@ -0,0 +1,66 @@
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

import inspect
import json
from collections import defaultdict

import pytest
from _pytest.monkeypatch import MonkeyPatch

# we'd like to support xdist here for parallelism, but a session-scope fixture won't
# be enough: https://github.com/pytest-dev/pytest-xdist/issues/271. need a lockfile
# or equivalent.
shrink_calls = defaultdict(list)


def pytest_collection_modifyitems(config, items):
skip = pytest.mark.skip(reason="Does not call minimal()")
for item in items:
# is this perfect? no. but it is cheap!
if " minimal(" in inspect.getsource(item.obj):
continue
item.add_marker(skip)


@pytest.fixture(scope="function", autouse=True)
def _benchmark_shrinks():
from hypothesis.internal.conjecture.shrinker import Shrinker

monkeypatch = MonkeyPatch()

def record_shrink_calls(calls):
name = None
for frame in inspect.stack():
if frame.function.startswith("test_"):
name = f"{frame.filename.split('/')[-1]}::{frame.function}"
# some minimal calls happen at collection-time outside of a test context
# (maybe something we should fix/look into)
if name is None:
return

shrink_calls[name].append(calls)

old_shrink = Shrinker.shrink

def shrink(self, *args, **kwargs):
v = old_shrink(self, *args, **kwargs)
record_shrink_calls(self.engine.call_count - self.initial_calls)
return v

monkeypatch.setattr(Shrinker, "shrink", shrink)
yield

# start teardown
Shrinker.shrink = old_shrink


def pytest_sessionfinish(session, exitstatus):
print(f"\nshrinker profiling:\n{json.dumps(shrink_calls)}")
4 changes: 4 additions & 0 deletions hypothesis-python/benchmark/data.json
@@ -0,0 +1,4 @@
{
"old": [],
"new": []
}
114 changes: 114 additions & 0 deletions hypothesis-python/benchmark/graph.py
@@ -0,0 +1,114 @@
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

import json
import statistics
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

data_path = Path(__file__).parent / "data.json"
with open(data_path) as f:
data = json.loads(f.read())

old_runs = data["old"]
new_runs = data["new"]
all_runs = old_runs + new_runs

# every run should involve the same functions
names = set()
for run in all_runs:
names.add(frozenset(run.keys()))

intersection = frozenset.intersection(*names)
diff = frozenset.union(*[intersection.symmetric_difference(n) for n in names])

print(f"skipping these tests which were not present in all runs: {', '.join(diff)}")
names = list(intersection)

# the similar invariant for number of minimal calls per run is not true: functions
# may make a variable number of minimal() calls.
# it would be nice to compare identically just the ones which don't vary, to get
# a very fine grained comparison instead of averaging.
# sizes = []
# for run in all_runs:
# sizes.append(tuple(len(value) for value in run.values()))
# assert len(set(sizes)) == 1

new_names = []
for name in names:
if all(all(x == 0 for x in run[name]) for run in all_runs):
print(f"no shrinks for {name}, skipping")
continue
new_names.append(name)
names = new_names


# name : average calls
old_values = {}
new_values = {}
for name in names:

# mean across the different minimal() calls in a single test function, then
# median across the n iterations we ran that for to reduce error
old_vals = [statistics.mean(run[name]) for run in old_runs]
new_vals = [statistics.mean(run[name]) for run in new_runs]
old_values[name] = statistics.median(old_vals)
new_values[name] = statistics.median(new_vals)

# name : (absolute difference, times difference)
diffs = {}
for name in names:
old = old_values[name]
new = new_values[name]
diff = old - new
diff_times = (old - new) / old
if 0 < diff_times < 1:
diff_times = (1 / (1 - diff_times)) - 1
diffs[name] = (diff, diff_times)

print(f"{name} {int(diff)} ({int(old)} -> {int(new)}, {round(diff_times, 1)}✕)")

diffs = dict(sorted(diffs.items(), key=lambda kv: kv[1][0]))
diffs_value = [v[0] for v in diffs.values()]
diffs_percentage = [v[1] for v in diffs.values()]

print(
f"mean: {int(statistics.mean(diffs_value))}, median: {int(statistics.median(diffs_value))}"
)


# https://stackoverflow.com/a/65824524
def align_axes(ax1, ax2):
ax1_ylims = ax1.axes.get_ylim()
ax1_yratio = ax1_ylims[0] / ax1_ylims[1]

ax2_ylims = ax2.axes.get_ylim()
ax2_yratio = ax2_ylims[0] / ax2_ylims[1]

if ax1_yratio < ax2_yratio:
ax2.set_ylim(bottom=ax2_ylims[1] * ax1_yratio)
else:
ax1.set_ylim(bottom=ax1_ylims[1] * ax2_yratio)


ax1 = sns.barplot(diffs_value, color="b", alpha=0.7, label="shrink call change")
ax2 = plt.twinx()
sns.barplot(diffs_percentage, color="r", alpha=0.7, label=r"n✕ change", ax=ax2)

ax1.set_title("old shrinks - new shrinks (aka shrinks saved, higher is better)")
ax1.set_xticks([])
align_axes(ax1, ax2)
legend = ax1.legend(labels=["shrink call change", "n✕ change"])
legend.legend_handles[0].set_color("b")
legend.legend_handles[1].set_color("r")

plt.show()
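
As a worked example of the `n✕` convention used in `graph.py` above (an assumption about the intent of the `0 < diff_times < 1` branch): a fractional reduction in shrink calls is converted into a speedup factor minus one, so halving the call count reports as 1.0✕ and cutting it to a third reports as 2.0✕.

```python
# Worked example of the "n✕ change" computation in graph.py, with made-up numbers.
old, new = 300, 100
diff_times = (old - new) / old                 # 0.666..., the fractional reduction
if 0 < diff_times < 1:
    diff_times = (1 / (1 - diff_times)) - 1    # equals old/new - 1 = 2.0
print(f"{old - new} ({old} -> {new}, {round(diff_times, 1)}✕)")  # "200 (300 -> 100, 2.0✕)"
```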
39 changes: 28 additions & 11 deletions hypothesis-python/src/hypothesis/internal/conjecture/data.py
@@ -779,10 +779,6 @@ def end(self, i: int) -> int:
"""Equivalent to self[i].end."""
return self.endpoints[i]

def bounds(self, i: int) -> Tuple[int, int]:
"""Equivalent to self[i].bounds."""
return (self.start(i), self.end(i))

def all_bounds(self) -> Iterable[Tuple[int, int]]:
"""Equivalent to [(b.start, b.end) for b in self]."""
prev = 0
@@ -970,7 +966,12 @@ class IRNode:
was_forced: bool = attr.ib()
index: Optional[int] = attr.ib(default=None)

def copy(self, *, with_value: IRType) -> "IRNode":
def copy(
self,
*,
with_value: Optional[IRType] = None,
with_kwargs: Optional[IRKWargsType] = None,
) -> "IRNode":
# we may want to allow this combination in the future, but for now it's
# a footgun.
assert not self.was_forced, "modifying a forced node doesn't make sense"
@@ -979,8 +980,8 @@ def copy(self, *, with_value: IRType) -> "IRNode":
# after copying.
return IRNode(
ir_type=self.ir_type,
value=with_value,
kwargs=self.kwargs,
value=self.value if with_value is None else with_value,
kwargs=self.kwargs if with_kwargs is None else with_kwargs,
was_forced=self.was_forced,
)

@@ -1071,9 +1072,17 @@ def __repr__(self):

def ir_value_permitted(value, ir_type, kwargs):
if ir_type == "integer":
if kwargs["min_value"] is not None and value < kwargs["min_value"]:
min_value = kwargs["min_value"]
max_value = kwargs["max_value"]
shrink_towards = kwargs["shrink_towards"]
if min_value is not None and value < min_value:
return False
if kwargs["max_value"] is not None and value > kwargs["max_value"]:
if max_value is not None and value > max_value:
return False

if (max_value is None or min_value is None) and (
value - shrink_towards
).bit_length() >= 128:
return False

return True
@@ -1144,14 +1153,22 @@ class ConjectureResult:
status: Status = attr.ib()
interesting_origin: Optional[InterestingOrigin] = attr.ib()
buffer: bytes = attr.ib()
blocks: Blocks = attr.ib()
# some ConjectureDatas pass through the ir and some pass through buffers.
# the ir does not drive its result through the buffer, which means blocks/examples
# may differ (I think for forced values?) even when the buffer is the same.
# I don't *think* anything was relying on anything but .buffer for result equality,
# though that assumption may be leaning on flakiness detection invariants.
#
# If we consider blocks or examples in equality checks, multiple semantically equal
# results get stored in e.g. the pareto front.
blocks: Blocks = attr.ib(eq=False)
output: str = attr.ib()
extra_information: Optional[ExtraInformation] = attr.ib()
has_discards: bool = attr.ib()
target_observations: TargetObservations = attr.ib()
tags: FrozenSet[StructuralCoverageTag] = attr.ib()
forced_indices: FrozenSet[int] = attr.ib(repr=False)
examples: Examples = attr.ib(repr=False)
examples: Examples = attr.ib(repr=False, eq=False)
arg_slices: Set[Tuple[int, int]] = attr.ib(repr=False)
slice_comments: Dict[Tuple[int, int], str] = attr.ib(repr=False)
invalid_at: Optional[InvalidAt] = attr.ib(repr=False)
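
A minimal attrs sketch (not Hypothesis code) of what `eq=False` buys in the `ConjectureResult` change above: fields marked `eq=False` are excluded from the generated `__eq__`, so two results whose buffers match compare equal even if their blocks or examples differ.

```python
import attr


@attr.s(frozen=True)
class FakeResult:
    # buffer participates in equality; blocks is ignored, mirroring the
    # eq=False annotations added to ConjectureResult in this diff.
    buffer: bytes = attr.ib()
    blocks: tuple = attr.ib(eq=False)


assert FakeResult(buffer=b"\x00", blocks=(1,)) == FakeResult(buffer=b"\x00", blocks=(2,))
```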
25 changes: 21 additions & 4 deletions hypothesis-python/src/hypothesis/internal/conjecture/engine.py
@@ -342,7 +342,7 @@ def _cache_key_ir(
for node in nodes + extension
)

def _cache(self, data: Union[ConjectureData, ConjectureResult]) -> None:
def _cache(self, data: ConjectureData) -> None:
result = data.as_result()
# when we shrink, we try out of bounds things, which can lead to the same
# data.buffer having multiple outcomes. eg data.buffer=b'' is Status.OVERRUN
@@ -357,8 +357,25 @@ def _cache(self, data: Union[ConjectureData, ConjectureResult]) -> None:
# write to the buffer cache here as we move more things to the ir cache.
if data.invalid_at is None:
self.__data_cache[data.buffer] = result
key = self._cache_key_ir(data=data)
self.__data_cache_ir[key] = result

# interesting buffer-based data can mislead the shrinker if we cache them.
#
# @given(st.integers())
# def f(n):
# assert n < 100
#
# may generate two counterexamples, n=101 and n=m > 101, in that order,
# where the buffer corresponding to n is large due to eg failed probes.
# We shrink m and eventually try n=101, but it is cached to a large buffer
# and so the best we can do is n=102, a non-ideal shrink.
#
# We can cache ir-based buffers fine, which always correspond to the
# smallest buffer via forced=. The overhead here is small because almost
# all interesting data are ir-based via the shrinker (and that overhead
# will tend towards zero as we move generation to the ir).
if data.ir_tree_nodes is not None or data.status < Status.INTERESTING:
key = self._cache_key_ir(data=data)
self.__data_cache_ir[key] = result

def cached_test_function_ir(
self, nodes: List[IRNode]
@@ -1218,7 +1235,7 @@ def shrink_interesting_examples(self) -> None:
self.interesting_examples.values(), key=lambda d: sort_key(d.buffer)
):
assert prev_data.status == Status.INTERESTING
data = self.new_conjecture_data_for_buffer(prev_data.buffer)
data = self.new_conjecture_data_ir(prev_data.examples.ir_tree_nodes)
self.test_function(data)
if data.status != Status.INTERESTING:
self.exit_with(ExitReason.flaky)