From 40d069d0c88b065b4776a0220cca96e06c2028bb Mon Sep 17 00:00:00 2001
From: JD Bothma
Date: Wed, 21 Aug 2024 12:13:03 +0100
Subject: [PATCH 1/2] Little command to benchmark matchers

---
 nomenklatura/cli.py            |  9 +++++++++
 nomenklatura/matching/bench.py | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 47 insertions(+)
 create mode 100644 nomenklatura/matching/bench.py

diff --git a/nomenklatura/cli.py b/nomenklatura/cli.py
index c99575bc..51b2b3e8 100644
--- a/nomenklatura/cli.py
+++ b/nomenklatura/cli.py
@@ -23,6 +23,7 @@
 from nomenklatura.stream import StreamEntity
 from nomenklatura.xref import xref as run_xref
 from nomenklatura.tui import dedupe_ui
+from nomenklatura.matching.bench import bench_matcher
 
 INDEX_SEGMENT = "xref-index"
 
@@ -308,5 +309,13 @@ def statements_aggregate(
             write_entity(outfh, entity)
 
 
+@cli.command("bench", help="Benchmark a matching algorithm")
+@click.argument("name", type=str)
+@click.argument("pairs_file", type=InPath)
+@click.option("-n", "--number", type=int, default=1000)
+def bench(name: str, pairs_file: Path, number: int = 1000) -> None:
+    bench_matcher(name, pairs_file, number)
+
+
 if __name__ == "__main__":
     cli()
diff --git a/nomenklatura/matching/bench.py b/nomenklatura/matching/bench.py
new file mode 100644
index 00000000..678ac5a5
--- /dev/null
+++ b/nomenklatura/matching/bench.py
@@ -0,0 +1,38 @@
+import datetime
+from timeit import timeit
+from itertools import cycle
+import logging
+
+from nomenklatura.matching import get_algorithm
+from nomenklatura.matching.pairs import read_pairs
+from nomenklatura.util import PathLike
+
+
+log = logging.getLogger(__name__)
+
+
+def bench_matcher(name: str, pairs_file: PathLike, number: int) -> None:
+    """Time `number` calls to the named algorithm's `compare` and log the total.
+
+    The pairs read from `pairs_file` are held in memory and cycled through,
+    so `number` may exceed the number of pairs in the file.
+
+    Raises:
+        ValueError: if `get_algorithm` returns None for `name`.
+    """
+    log.info("Loading pairs from %s", pairs_file)
+    pairs = list(read_pairs(pairs_file))
+    log.info("Read %d pairs", len(pairs))
+    matcher = get_algorithm(name)
+    if matcher is None:
+        raise ValueError(f"No matcher named {name}")
+    log.info("Loaded %s", matcher.NAME)
+    infinite_pairs = cycle(pairs)
+
+    def compare_one_pair():
+        pair = next(infinite_pairs)
+        matcher.compare(pair.left, pair.right)
+
+    log.info("Running benchmark for %d iterations", number)
+    seconds = timeit(compare_one_pair, number=number)
+    log.info("Total time %s", datetime.timedelta(seconds=seconds))

From c3e540f5e8f84e1f6a68800e9d9c3fd40be6db1d Mon Sep 17 00:00:00 2001
From: JD Bothma
Date: Wed, 21 Aug 2024 12:44:23 +0100
Subject: [PATCH 2/2] Fix missing type

---
 nomenklatura/matching/bench.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nomenklatura/matching/bench.py b/nomenklatura/matching/bench.py
index 678ac5a5..fc87d154 100644
--- a/nomenklatura/matching/bench.py
+++ b/nomenklatura/matching/bench.py
@@ -29,7 +29,7 @@ def bench_matcher(name: str, pairs_file: PathLike, number: int) -> None:
     log.info("Loaded %s", matcher.NAME)
     infinite_pairs = cycle(pairs)
 
-    def compare_one_pair():
+    def compare_one_pair() -> None:
         pair = next(infinite_pairs)
         matcher.compare(pair.left, pair.right)
 