From eb809269a3b92d07f8411f617b034a059fa1a2fa Mon Sep 17 00:00:00 2001 From: Tony Meyer Date: Wed, 18 Dec 2024 10:36:56 +1300 Subject: [PATCH] refactor: cache signature structure in ops.testing state classes (#1499) The Scenario code that limits the number of positional arguments (needed in Python 3.8 since the dataclasses there do not have `KW_ONLY`) is quite expensive. We have a method of providing this functionality in 3.8, but it is unfortunately expensive, and therefore a regression from older versions of Scenario. I considered 4 options here: * Manually write out the dozen or so `__init__` signatures (as we had at one point in the [original PR](https://github.com/canonical/ops-scenario/pull/137)). I'd rather not for the same reasons we made the decision at that time. * Figure out a way to have a syntax that allows using regular `dataclasses.KW_ONLY` with modern Python and only use our custom code on older Python. I spent some time looking into this, and couldn't figure out a way to do it where the code still seemed readable and maintainable enough. * Cache the most expensive work (what's in this PR). * Don't bother doing anything now, and eagerly wait for the ability to drop 3.8 support. A roughly 5% performance regression felt worth fixing as long as the change is fairly straightforward (although this change only gets about half of that back). The most expensive parts of the `__new__` method are the ones that work with `inspect`: `inspect.signature` in particular, but also getting the default value of the parameters. If we assume that no-one is run-time altering the signature of the class (I believe no-one should be) then the values of these never actually change, but we are currently calculating them every time an instance of the class is created. This PR changes that to cache those three values the first time they are needed. There's one additional performance tweak in the branch that doesn't make a significant difference, but is trivial to do: when checking if the YAML files exist, skip the filesystem `exists()` call if we just created an empty temporary directory a few lines above, since we know that it will never exist. A drive-by: I happened to notice while working on this branch Harness referring to `options.yaml`, which does not exist (any more?) as far as I know, so a docs tweak to address that. Timing (best of 3): |Suite|main|branch| |-|-|-| |operator unit tests (no concurrency)|165.25s|161.89s| |traefik scenario tests|45.49|44.30s| |kafka-k8s-operator scenario tests|4.48s|4.38s| Refs #1434 --- ops/_private/harness.py | 4 +-- testing/src/scenario/_runtime.py | 4 ++- testing/src/scenario/state.py | 43 ++++++++++++++++++++++++-------- 3 files changed, 37 insertions(+), 14 deletions(-) diff --git a/ops/_private/harness.py b/ops/_private/harness.py index 61c2ee0a3..92fdeec5a 100644 --- a/ops/_private/harness.py +++ b/ops/_private/harness.py @@ -2271,13 +2271,13 @@ def _config_set(self, key: str, value: Union[str, int, float, bool]): declared_type = option.get('type') if not declared_type: raise RuntimeError( - f'Incorrectly formatted `options.yaml`, option {key} ' + f'Incorrectly formatted config in YAML, option {key} ' 'is expected to declare a `type`.' ) if declared_type not in self._supported_types: raise RuntimeError( - 'Incorrectly formatted `options.yaml`: `type` needs to be one ' + 'Incorrectly formatted config in YAML: `type` needs to be one ' 'of [{}], not {}.'.format(', '.join(self._supported_types), declared_type) ) diff --git a/testing/src/scenario/_runtime.py b/testing/src/scenario/_runtime.py index 2dbd683b2..8be33bb59 100644 --- a/testing/src/scenario/_runtime.py +++ b/testing/src/scenario/_runtime.py @@ -224,7 +224,9 @@ def _virtual_charm_root(self): actions_yaml = virtual_charm_root / "actions.yaml" metadata_files_present: Dict[Path, Optional[str]] = { - file: file.read_text() if file.exists() else None + file: file.read_text() + if charm_virtual_root_is_custom and file.exists() + else None for file in (metadata_yaml, config_yaml, actions_yaml) } diff --git a/testing/src/scenario/state.py b/testing/src/scenario/state.py index ad9595175..d68bfc660 100644 --- a/testing/src/scenario/state.py +++ b/testing/src/scenario/state.py @@ -122,26 +122,43 @@ class _MaxPositionalArgs: _max_positional_args = n - def __new__(cls, *args: Any, **kwargs: Any): + @classmethod + def _annotate_class(cls): + """Record information about which parameters are positional vs. keyword-only.""" + if hasattr(cls, "_init_parameters"): + # We don't support dynamically changing the signature of a + # class, so we assume here it's the same as previously. + # In addition, the class and the function that provides it + # are private, so we generally don't expect anyone to be + # doing anything radical with these. + return # inspect.signature guarantees the order of parameters is as # declared, which aligns with dataclasses. Simpler ways of # getting the arguments (like __annotations__) do not have that # guarantee, although in practice it is the case. - parameters = inspect.signature(cls.__init__).parameters - required_args = [ + cls._init_parameters = parameters = inspect.signature( + cls.__init__ + ).parameters + cls._init_kw_only = { + name + for name in tuple(parameters)[cls._max_positional_args :] + if not name.startswith("_") + } + cls._init_required_args = [ name for name in tuple(parameters) - if parameters[name].default is inspect.Parameter.empty - and name not in kwargs - and name != "self" + if name != "self" + and parameters[name].default is inspect.Parameter.empty + ] + + def __new__(cls, *args: Any, **kwargs: Any): + cls._annotate_class() + required_args = [ + name for name in cls._init_required_args if name not in kwargs ] n_posargs = len(args) max_n_posargs = cls._max_positional_args - kw_only = { - name - for name in tuple(parameters)[max_n_posargs:] - if not name.startswith("_") - } + kw_only = cls._init_kw_only if n_posargs > max_n_posargs: raise TypeError( f"{cls.__name__} takes {max_n_posargs} positional " @@ -180,6 +197,10 @@ def __reduce__(self): return _MaxPositionalArgs +# A lot of JujuLogLine objects are created, so we want them to be fast and light. +# Dataclasses define __slots__, so are small, and a namedtuple is actually +# slower to create than a dataclass. A plain dictionary (or TypedDict) would be +# about twice as fast, but less convenient to use. @dataclasses.dataclass(frozen=True) class JujuLogLine: """An entry in the Juju debug-log."""