From c3d619932eb4e3e0ae93b0dded3041d39d0c36e2 Mon Sep 17 00:00:00 2001 From: MrTatsugoro Date: Mon, 5 Jun 2023 23:03:53 +0200 Subject: [PATCH 1/8] Add requirement.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2bd69c8 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +networkx==3.1 From 15003bbefbf6a481efada2e2840826f001f474ea Mon Sep 17 00:00:00 2001 From: MrTatsugoro Date: Mon, 5 Jun 2023 23:01:18 +0200 Subject: [PATCH 2/8] Fix timing.py The previous implementation ensured side-effect-free execution by having the input list deepcopied as part of the benched function. This affects timings too much, so I moved that to the setup and used timeit.repeat(repeat=x, number=1) instead of timeit.timeit(number=x) to ensure the setup runs before each iteration. No side-effects and we bench the actual code instead of the deepcopy. I also simplified some of the code to remove redundant stuff. + some PEP8. --- timing.py | 233 ++++++++++++++++++++++++++---------------------------- 1 file changed, 110 insertions(+), 123 deletions(-) diff --git a/timing.py b/timing.py index 7e2e039..f76feb2 100644 --- a/timing.py +++ b/timing.py @@ -1,9 +1,12 @@ +import argparse import core -import timeit +import json +from pathlib import Path import random +from statistics import mean import sys -import argparse import time +import timeit # ================= @@ -11,16 +14,12 @@ # ================= class Benchmark: - def __init__(self): - with open('core.py') as f: - self.setup = f.read() - - self.setup += """\nfrom copy import deepcopy\n""" - + self.lsts = [] + self.info = '' + self.setup = 'from copy import deepcopy\nlists = deepcopy(lsts)' self.load() self.build_info() - self.extend_setup() def load(self): raise NotImplementedError @@ -34,107 +33,109 @@ def build_info(self): if len(lst) > max: max = len(lst) num += 1 - self.info = "{} lists, average size {}, max size {}".format(num, size//num, max) - - def extend_setup(self): - self.setup += '' - raise NotImplementedError + self.info = f'{num} lists, average size {size // num}, max size {max}' class Niklas(Benchmark): - def __init__(self, filename): self.filename = filename super().__init__() def load(self): - self.lsts = [] - with open(self.filename, "r") as f: - for line in f: - lst = [int(x) for x in line.split()] - self.lsts.append(lst) + self.lsts = [ + [int(x) for x in line.split()] + for line in Path(self.filename).read_text().splitlines() + ] def build_info(self): super().build_info() - self.info += '\n(from file: {})'.format(self.filename) - - def extend_setup(self): - self.setup += """ - -lsts = [] -size = 0 -num = 0 -max = 0 -for line in open("{0}", "r"): - lst = [int(x) for x in line.split()] - size += len(lst) - if len(lst) > max: max = len(lst) - num += 1 - lsts.append(lst) -""".format(self.filename) + self.info += f'\n(from file: {self.filename})' class Sven(Benchmark): - def load(self): - import json - with open('./lists/sven_list.txt') as f: - self.lsts = json.loads(f.read()) - - def extend_setup(self): - self.setup += """ -import json -with open('./lists/sven_list.txt') as f: - lsts = json.loads(f.read()) -""" + self.lsts = json.loads(Path('./lists/sven_list.txt').read_text()) class Agf(Benchmark): - def load(self): - import random - tenk = range(10000) - self.lsts = [random.sample(tenk, random.randint(0, 500)) for _ in range(2000)] - - def extend_setup(self): - self.setup += """\nlsts = {}""".format(repr(self.lsts)) 
+ self.lsts = [ + random.sample(range(10000), random.randint(0, 500)) + for _ in range(2000) + ] # ====================================== # Function for building Nik's test lists # ====================================== -def build_timing_list(filename, - class_count=50, - class_size=1000, - list_count_per_class=10, - large_list_sizes = (100, 1000), - small_list_sizes = (0, 100), - large_list_probability = 0.5): - - large_list_sizes = list(range(*large_list_sizes)) - small_list_sizes = list(range(*small_list_sizes)) - with open(filename, "w") as f: - lists = [] - classes = [list(range(class_size*i, class_size*(i+1))) for i in range(class_count)] - for c in classes: - # distribute each class across ~300 lists - for i in range(list_count_per_class): - lst = [] - if random.random() < large_list_probability: - size = random.choice(large_list_sizes) - else: - size = random.choice(small_list_sizes) - nums = set(c) - for j in range(size): - x = random.choice(list(nums)) - lst.append(x) - nums.remove(x) - random.shuffle(lst) - lists.append(lst) - random.shuffle(lists) - for lst in lists: - f.write(" ".join(str(x) for x in lst) + "\n") +def build_timing_list( + filename, + class_count=50, + class_size=1000, + list_count_per_class=10, + large_list_sizes=(100, 1000), + small_list_sizes=(0, 100), + large_list_probability=0.5 +): + large_list_sizes = list(range(*large_list_sizes)) + small_list_sizes = list(range(*small_list_sizes)) + with open(filename, "w") as f: + lists = [] + classes = [ + list(range(class_size * i, class_size * (i + 1))) + for i in range(class_count) + ] + for c in classes: + # distribute each class across ~300 lists + for i in range(list_count_per_class): + lst = [] + if random.random() < large_list_probability: + size = random.choice(large_list_sizes) + else: + size = random.choice(small_list_sizes) + nums = set(c) + for j in range(size): + x = random.choice(list(nums)) + lst.append(x) + nums.remove(x) + random.shuffle(lst) + lists.append(lst) + random.shuffle(lists) + for lst in lists: + f.write(" ".join(str(x) for x in lst) + "\n") + + +def build_all_timing_lists(): + print('building test list (for Nik test) ... ', end='') + sys.stdout.flush() + param = dict(class_count=50, + class_size=1000, + list_count_per_class=100, + large_list_sizes=(100, 1000), + small_list_sizes=(0, 100), + large_list_probability=0.5, + filename='./lists/timing_1.txt') + build_timing_list(**param) + + param = dict(class_count=15, + class_size=1000, + list_count_per_class=300, + large_list_sizes=(100, 1000), + small_list_sizes=(0, 100), + large_list_probability=0.5, + filename='./lists/timing_2.txt') + build_timing_list(**param) + + param = dict(class_count=15, + class_size=1000, + list_count_per_class=300, + large_list_sizes=(100, 1000), + small_list_sizes=(0, 100), + large_list_probability=0.1, + filename='./lists/timing_3.txt') + build_timing_list(**param) + print('done') # =============== @@ -144,8 +145,7 @@ def build_timing_list(filename, def timing(bench, number): print('\nTiming with: >> {} << Benchmark'.format(bench.__class__.__name__)) print('Info: {}'.format(bench.info)) - setup = bench.setup - + print('-- Press Ctrl-C to skip a test --\n') times = [] @@ -154,9 +154,21 @@ def timing(bench, number): print('timing: {} '.format(value.__doc__), end='') sys.stdout.flush() try: - t = timeit.timeit("{}(deepcopy(lsts))".format(name), - setup=setup, - number=number) + # We pass number to repeat and leave number to 1. + # This ensures the setup is repeated before every + # iteration. 
We put `bench.lsts` into the execution + # namespace. The setup does the deepcopy into `lists` + # (and this deepcopy doesn't count in timings). The + # benched execution uses the deepcopied list. This way, + # if any function has side effects, they don't impact + # other runs. + t = mean(timeit.repeat( + f'{name}(lists)', + setup=bench.setup, + number=1, + repeat=number, + globals={'lsts': bench.lsts, **core.__dict__} + )) except KeyboardInterrupt: print(' skipped.') try: @@ -164,14 +176,17 @@ def timing(bench, number): except KeyboardInterrupt: print('Two fast Ctrl-C - exiting') sys.exit(0) - + else: times.append((t, value.__doc__)) print(' -- {:0.4f} -- '.format(t)) print('\nTiming Results:') - for t,name in sorted(times): - print('{:0.3f} -- {}'.format(t,name)) + times = sorted(times) + best_t, best_name = times[0] + for t, name in times: + factor = t / best_t + print(f'{t:0.3f} ({factor:.2g}x) -- {name}') if __name__ == '__main__': @@ -182,36 +197,8 @@ def timing(bench, number): args = parser.parse_args() if args.new: - print('building test list (for Nik test) ... ', end='') - sys.stdout.flush() - param = dict(class_count = 50, - class_size = 1000, - list_count_per_class = 100, - large_list_sizes = (100, 1000), - small_list_sizes = (0, 100), - large_list_probability = 0.5, - filename = './lists/timing_1.txt') - build_timing_list(**param) - - param = dict(class_count = 15, - class_size = 1000, - list_count_per_class = 300, - large_list_sizes = (100, 1000), - small_list_sizes = (0, 100), - large_list_probability = 0.5, - filename = './lists/timing_2.txt') - build_timing_list(**param) - - param = dict(class_count = 15, - class_size = 1000, - list_count_per_class = 300, - large_list_sizes = (100, 1000), - small_list_sizes = (0, 100), - large_list_probability = 0.1, - filename = './lists/timing_3.txt') - build_timing_list(**param) - print('done') - + build_all_timing_lists() + timing(Niklas('./lists/timing_1.txt'), number=3) timing(Niklas('./lists/timing_2.txt'), number=3) timing(Niklas('./lists/timing_3.txt'), number=3) From 5336c27ac6247f45971ef8dbf65c2c9a82c6fd3c Mon Sep 17 00:00:00 2001 From: MrTatsugoro Date: Mon, 5 Jun 2023 23:03:10 +0200 Subject: [PATCH 3/8] Fix test.py Don't assume the return value is a list (this fixes a couple of tests where the tested code wasn't at fault). Some tests still fail but that may be due to the tested code. --- core.py | 12 +++++++----- test.py | 4 ++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/core.py b/core.py index e670237..677a404 100644 --- a/core.py +++ b/core.py @@ -100,10 +100,13 @@ def pairs(lst): prev = item yield item, first + def kat_merge(lsts): """katrielalex""" g = networkx.Graph() for sub_list in lsts: + if not sub_list: + continue for edge in pairs(sub_list): g.add_edge(*edge) @@ -206,7 +209,6 @@ def che_merge(lsts): def locatebin(bins, n): """Find the bin where list n has ended up: Follow bin references until we find a bin that has not moved. 
- """ while bins[n] != n: n = bins[n] @@ -218,7 +220,7 @@ def ale_merge(data): bins = list(range(len(data))) # Initialize each bin[n] == n nums = dict() - data = [set(m) for m in data ] # Convert to sets + data = [set(m) for m in data] # Convert to sets for r, row in enumerate(data): for num in row: if num not in nums: @@ -233,11 +235,11 @@ def ale_merge(data): if dest > r: dest, r = r, dest # always merge into the smallest bin - data[dest].update(data[r]) + data[dest].update(data[r]) data[r] = None # Update our indices to reflect the move bins[r] = dest - r = dest + r = dest # Filter out the empty bins have = [ m for m in data if m ] @@ -245,7 +247,7 @@ def ale_merge(data): return have -def nik_rew_merge_skip(lsts): +def nik_rew_merge(lsts): """Nik's rewrite""" sets = list(map(set,lsts)) results = [] diff --git a/test.py b/test.py index 5c442a2..43ba1b2 100644 --- a/test.py +++ b/test.py @@ -9,12 +9,12 @@ class MergeTestCase(unittest.TestCase): def setUp(self): with open('./lists/test_list.txt') as f: self.lsts = json.loads(f.read()) - self.merged = self.merge_func(deepcopy(self.lsts)) + self.merged = list(self.merge_func(deepcopy(self.lsts))) def test_disjoint(self): """Check disjoint-ness of merged results""" from itertools import combinations - for a,b in combinations(self.merged, 2): + for a, b in combinations(self.merged, 2): self.assertTrue(a.isdisjoint(b)) def test_coverage(self): # Credit to katrielalex From b2a94eca8c928ca98e4c1d4ec4e6d596b3666eee Mon Sep 17 00:00:00 2001 From: MrTatsugoro Date: Mon, 5 Jun 2023 23:04:09 +0200 Subject: [PATCH 4/8] Add profiler script. --- README | 8 +++++- profiler.py | 66 ++++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 1 + 3 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 profiler.py diff --git a/README b/README index 36983dc..59d7133 100644 --- a/README +++ b/README @@ -33,6 +33,13 @@ All the merge functions are in core.py. After that it will be auto loaded and everything will be taken care of on its own. +Profile a function/list combo +----------------------------- + +$ python3 profiler.py -f alexis -l timing_1 + +This will open a snakeviz visualisation of the execution profile in your browser at the end. + Test all the functions ---------------------- @@ -47,4 +54,3 @@ $ python3 timing.py - Ctrl-C to skip a test. - Two fast Ctrl-C to exit. - diff --git a/profiler.py b/profiler.py new file mode 100644 index 0000000..095db0f --- /dev/null +++ b/profiler.py @@ -0,0 +1,66 @@ +from argparse import ArgumentParser +import cProfile +import json +from pathlib import Path +import random +import subprocess + +import core +from timing import build_all_timing_lists + + +if __name__ == '__main__': + parser = ArgumentParser( + prog='Algorithm profiler', + description='Profile any given function from "core.py", ' + 'with any dataset from "lists/".', + ) + parser.add_argument( + '-f', '--function', dest='function', action='store', required=True, type=str, + help='Name of the function to profile. You can give the function\'s full name ' + '("ale_merge"), the function\'s abbreviated name ("ale") or the function\'s ' + 'display name ("alexis", as found in the docstring).' + ) + parser.add_argument( + '-l', '--list', dest='list', action='store', required=True, type=str, + help='Name of the list to profile with (e.g.: "timing_1" or "timing_1.txt").' + ) + parser.add_argument( + '-n', '--new', dest='new', action='store_true', default=False, + help='Rebuild all "timing_*.txt" test lists.' 
+ ) + args = parser.parse_args() + if args.new: + build_all_timing_lists() + + func_name: str = args.function.strip() + func = getattr(core, func_name, getattr(core, f'{func_name}_merge', None)) + if not func: + for obj in core.__dict__.values(): + if func_name == getattr(obj, '__doc__', None): + func = obj + break + assert callable(func), f'Object {func.__name__} is not a function.' + + list_name: str = args.list.strip() + if not list_name.endswith('.txt'): + list_name = f'{list_name}.txt' + list_path = Path('.', 'lists', list_name) + if list_name in ('sven_list.txt', 'test_list.txt'): + lists = json.loads(list_path.read_text()) + elif list_name == 'agf_list.txt': + lists = [ + random.sample(range(10000), random.randint(0, 500)) + for _ in range(2000) + ] + else: + with open(list_path, 'r') as f: + lists = [[int(x) for x in line.split()] for line in f] + assert len(lists) > 0, 'It would be better if the dataset had some data.' + + prof_file_name = f'{func.__name__}_{list_name}.prof' + cProfile.runctx('func(lists)', globals(), locals(), prof_file_name) + try: + subprocess.run(['snakeviz', prof_file_name]) + except KeyboardInterrupt: + exit() diff --git a/requirements.txt b/requirements.txt index 2bd69c8..b81d898 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ networkx==3.1 +snakeviz==2.2.0 From bf4f40d4d4fbcf02dd7f05ef09c9a986892fd2b4 Mon Sep 17 00:00:00 2001 From: MrTatsugoro Date: Mon, 5 Jun 2023 22:55:16 +0200 Subject: [PATCH 5/8] Add takeshi_merge version --- core.py | 48 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/core.py b/core.py index 677a404..bfae726 100644 --- a/core.py +++ b/core.py @@ -3,11 +3,12 @@ # # Evey function with a name ending with '_merge' will be auto loaded - -import networkx +from collections import deque import heapq from itertools import chain -from collections import deque +from typing import Iterable, Iterator, TypeVar + +import networkx def rik_merge(lsts): @@ -266,3 +267,44 @@ def nik_rew_merge(lsts): else: results.append(first) return results + + +T = TypeVar('T') + + +def takeshi_merge(lists: Iterator[Iterable[T]]) -> list[set[T]]: + """takeshi""" + bins: dict[T: set[T]] = dict() + bin_refs: dict[T: T] = dict() + for lst in lists: + if not lst: + continue + + # Gather the bin refs of all items in the list that we have + # already seen. + encountered_items_bin_refs = { + bin_refs[item] + for item in lst + if item in bin_refs + } + if len(encountered_items_bin_refs) >= 1: + # Some of the items in `lst` have already been seen in a + # previous iteration. They are therefore already attached + # to a bin. Select any of their corresponding bin ref. + bin_ref = encountered_items_bin_refs.pop() + # If the previously-seen items were not all attached to the + # same bin, their respective bins need to be merged into + # the selected one. + if len(encountered_items_bin_refs) > 0: + to_merge_bins = [bins.pop(ref) for ref in encountered_items_bin_refs] + bins[bin_ref].update(chain(*to_merge_bins)) + bin_refs.update({item: bin_ref for item in chain(*to_merge_bins)}) + bins[bin_ref].update(lst) + else: + # None of the items in `lst` have already been seen in a + # previous iteration. Therefore, we can safely pick any + # item as our new bin ref and create the corresponding bin. 
+ bin_ref = next(iter(lst)) + bins[bin_ref] = set(lst) + bin_refs.update({item: bin_ref for item in lst}) + return list(bins.values()) From d69d63f127ff72ffe43685acf8b1a1853dc4fb9f Mon Sep 17 00:00:00 2001 From: MrTatsugoro Date: Tue, 6 Jun 2023 01:18:52 +0200 Subject: [PATCH 6/8] faster setup --- timing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/timing.py b/timing.py index f76feb2..892a2fb 100644 --- a/timing.py +++ b/timing.py @@ -17,7 +17,7 @@ class Benchmark: def __init__(self): self.lsts = [] self.info = '' - self.setup = 'from copy import deepcopy\nlists = deepcopy(lsts)' + self.setup = 'lists = [[item for item in lst] for lst in lsts]' self.load() self.build_info() From f12f841deffb50f43552f5a124793fa80d218166 Mon Sep 17 00:00:00 2001 From: MrTatsugoro Date: Tue, 6 Jun 2023 01:31:10 +0200 Subject: [PATCH 7/8] Optimization dict comps were heavier than I thought. --- core.py | 6 ++++-- timing.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/core.py b/core.py index bfae726..1fd01e6 100644 --- a/core.py +++ b/core.py @@ -298,7 +298,8 @@ def takeshi_merge(lists: Iterator[Iterable[T]]) -> list[set[T]]: if len(encountered_items_bin_refs) > 0: to_merge_bins = [bins.pop(ref) for ref in encountered_items_bin_refs] bins[bin_ref].update(chain(*to_merge_bins)) - bin_refs.update({item: bin_ref for item in chain(*to_merge_bins)}) + for item in chain(*to_merge_bins): + bin_refs[item] = bin_ref bins[bin_ref].update(lst) else: # None of the items in `lst` have already been seen in a @@ -306,5 +307,6 @@ def takeshi_merge(lists: Iterator[Iterable[T]]) -> list[set[T]]: # item as our new bin ref and create the corresponding bin. bin_ref = next(iter(lst)) bins[bin_ref] = set(lst) - bin_refs.update({item: bin_ref for item in lst}) + for item in lst: + bin_refs[item] = bin_ref return list(bins.values()) diff --git a/timing.py b/timing.py index 892a2fb..339165e 100644 --- a/timing.py +++ b/timing.py @@ -186,7 +186,8 @@ def timing(bench, number): best_t, best_name = times[0] for t, name in times: factor = t / best_t - print(f'{t:0.3f} ({factor:.2g}x) -- {name}') + fmt = '.2g' if factor < 99 else '.0f' + print(f'{t:0.3f} ({factor:{fmt}}x) -- {name}') if __name__ == '__main__': From 59bf3fc8c4751c54e98346934fba06585d9edb73 Mon Sep 17 00:00:00 2001 From: MrTatsugoro Date: Wed, 7 Jun 2023 21:22:27 +0200 Subject: [PATCH 8/8] fix typing --- core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core.py b/core.py index 1fd01e6..d636447 100644 --- a/core.py +++ b/core.py @@ -6,7 +6,7 @@ from collections import deque import heapq from itertools import chain -from typing import Iterable, Iterator, TypeVar +from typing import Iterable, TypeVar import networkx @@ -272,7 +272,7 @@ def nik_rew_merge(lsts): T = TypeVar('T') -def takeshi_merge(lists: Iterator[Iterable[T]]) -> list[set[T]]: +def takeshi_merge(lists: Iterable[Iterable[T]]) -> list[set[T]]: """takeshi""" bins: dict[T: set[T]] = dict() bin_refs: dict[T: T] = dict()
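
For reference, the timing approach described in PATCH 2 can be reproduced in isolation. In the sketch below, dummy_merge, the sample data and the repeat count are made-up placeholders (they are not part of core.py or of these patches); it only illustrates how timeit.repeat(number=1, repeat=n) re-runs the setup, and therefore the copy of the input, before every measured call, so a merge function's side effects cannot leak from one run into the next while the copy itself stays out of the measured time.

    from statistics import mean
    import timeit

    def dummy_merge(lists):
        """Trivial stand-in for a *_merge function; note that it empties its input."""
        return [set(lists.pop()) for _ in range(len(lists))]

    lsts = [[1, 2], [2, 3], [10, 11], [11, 12], [20]]

    # number=1 makes every measured call its own timing run, and the setup
    # (which rebuilds `lists` from the pristine `lsts`) runs before each one,
    # so the copy never counts towards the measured time.
    runs = timeit.repeat(
        'dummy_merge(lists)',
        setup='from copy import deepcopy\nlists = deepcopy(lsts)',
        number=1,
        repeat=5,
        globals={'lsts': lsts, 'dummy_merge': dummy_merge},
    )
    print(f'mean of 5 runs: {mean(runs):.6f}s')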
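
Similarly, the PATCH 7 observation that filling bin_refs with a plain loop beats building an intermediate dict for dict.update() can be sanity-checked with a rough micro-benchmark along these lines (the sizes, repeat counts and the two helper functions below are arbitrary stand-ins, not code from core.py):

    import timeit

    items = list(range(1000))
    bin_ref = 0

    def fill_with_comprehension():
        refs = {}
        # Builds a temporary dict, then copies it into refs.
        refs.update({item: bin_ref for item in items})
        return refs

    def fill_with_loop():
        refs = {}
        # Writes straight into refs, no temporary dict.
        for item in items:
            refs[item] = bin_ref
        return refs

    print('dict comp + update:', min(timeit.repeat(fill_with_comprehension, number=1000)))
    print('plain loop:        ', min(timeit.repeat(fill_with_loop, number=1000)))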