From c3d619932eb4e3e0ae93b0dded3041d39d0c36e2 Mon Sep 17 00:00:00 2001 From: MrTatsugoro Date: Mon, 5 Jun 2023 23:03:53 +0200 Subject: [PATCH 1/8] Add requirement.txt --- requirements.txt | 1 + 1 file changed, 1 insertion(+) create mode 100644 requirements.txt diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..2bd69c8 --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +networkx==3.1 From 15003bbefbf6a481efada2e2840826f001f474ea Mon Sep 17 00:00:00 2001 From: MrTatsugoro Date: Mon, 5 Jun 2023 23:01:18 +0200 Subject: [PATCH 2/8] Fix timing.py The previous implementation ensured side-effect-free execution by having the input list deepcopied as part of the benched function. This affects timings too much, so I moved that to the setup and used timeit.repeat(repeat=x, number=1) instead of timeit.timeit(number=x) to ensure the setup runs before each iteration. No side-effects and we bench the actual code instead of the deepcopy. I also simplified some of the code to remove redundant stuff. + some PEP8. --- timing.py | 233 ++++++++++++++++++++++++++---------------------------- 1 file changed, 110 insertions(+), 123 deletions(-) diff --git a/timing.py b/timing.py index 7e2e039..f76feb2 100644 --- a/timing.py +++ b/timing.py @@ -1,9 +1,12 @@ +import argparse import core -import timeit +import json +from pathlib import Path import random +from statistics import mean import sys -import argparse import time +import timeit # ================= @@ -11,16 +14,12 @@ # ================= class Benchmark: - def __init__(self): - with open('core.py') as f: - self.setup = f.read() - - self.setup += """\nfrom copy import deepcopy\n""" - + self.lsts = [] + self.info = '' + self.setup = 'from copy import deepcopy\nlists = deepcopy(lsts)' self.load() self.build_info() - self.extend_setup() def load(self): raise NotImplementedError @@ -34,107 +33,109 @@ def build_info(self): if len(lst) > max: max = len(lst) num += 1 - self.info = "{} lists, average size {}, max size {}".format(num, size//num, max) - - def extend_setup(self): - self.setup += '' - raise NotImplementedError + self.info = f'{num} lists, average size {size // num}, max size {max}' class Niklas(Benchmark): - def __init__(self, filename): self.filename = filename super().__init__() def load(self): - self.lsts = [] - with open(self.filename, "r") as f: - for line in f: - lst = [int(x) for x in line.split()] - self.lsts.append(lst) + self.lsts = [ + [int(x) for x in line.split()] + for line in Path(self.filename).read_text().splitlines() + ] def build_info(self): super().build_info() - self.info += '\n(from file: {})'.format(self.filename) - - def extend_setup(self): - self.setup += """ - -lsts = [] -size = 0 -num = 0 -max = 0 -for line in open("{0}", "r"): - lst = [int(x) for x in line.split()] - size += len(lst) - if len(lst) > max: max = len(lst) - num += 1 - lsts.append(lst) -""".format(self.filename) + self.info += f'\n(from file: {self.filename})' class Sven(Benchmark): - def load(self): - import json - with open('./lists/sven_list.txt') as f: - self.lsts = json.loads(f.read()) - - def extend_setup(self): - self.setup += """ -import json -with open('./lists/sven_list.txt') as f: - lsts = json.loads(f.read()) -""" + self.lsts = json.loads(Path('./lists/sven_list.txt').read_text()) class Agf(Benchmark): - def load(self): - import random - tenk = range(10000) - self.lsts = [random.sample(tenk, random.randint(0, 500)) for _ in range(2000)] - - def extend_setup(self): - self.setup += """\nlsts = {}""".format(repr(self.lsts)) 
+ self.lsts = [ + random.sample(range(10000), random.randint(0, 500)) + for _ in range(2000) + ] # ====================================== # Function for building Nik's test lists # ====================================== -def build_timing_list(filename, - class_count=50, - class_size=1000, - list_count_per_class=10, - large_list_sizes = (100, 1000), - small_list_sizes = (0, 100), - large_list_probability = 0.5): - - large_list_sizes = list(range(*large_list_sizes)) - small_list_sizes = list(range(*small_list_sizes)) - with open(filename, "w") as f: - lists = [] - classes = [list(range(class_size*i, class_size*(i+1))) for i in range(class_count)] - for c in classes: - # distribute each class across ~300 lists - for i in range(list_count_per_class): - lst = [] - if random.random() < large_list_probability: - size = random.choice(large_list_sizes) - else: - size = random.choice(small_list_sizes) - nums = set(c) - for j in range(size): - x = random.choice(list(nums)) - lst.append(x) - nums.remove(x) - random.shuffle(lst) - lists.append(lst) - random.shuffle(lists) - for lst in lists: - f.write(" ".join(str(x) for x in lst) + "\n") +def build_timing_list( + filename, + class_count=50, + class_size=1000, + list_count_per_class=10, + large_list_sizes=(100, 1000), + small_list_sizes=(0, 100), + large_list_probability=0.5 +): + large_list_sizes = list(range(*large_list_sizes)) + small_list_sizes = list(range(*small_list_sizes)) + with open(filename, "w") as f: + lists = [] + classes = [ + list(range(class_size * i, class_size * (i + 1))) + for i in range(class_count) + ] + for c in classes: + # distribute each class across ~300 lists + for i in range(list_count_per_class): + lst = [] + if random.random() < large_list_probability: + size = random.choice(large_list_sizes) + else: + size = random.choice(small_list_sizes) + nums = set(c) + for j in range(size): + x = random.choice(list(nums)) + lst.append(x) + nums.remove(x) + random.shuffle(lst) + lists.append(lst) + random.shuffle(lists) + for lst in lists: + f.write(" ".join(str(x) for x in lst) + "\n") + + +def build_all_timing_lists(): + print('building test list (for Nik test) ... ', end='') + sys.stdout.flush() + param = dict(class_count=50, + class_size=1000, + list_count_per_class=100, + large_list_sizes=(100, 1000), + small_list_sizes=(0, 100), + large_list_probability=0.5, + filename='./lists/timing_1.txt') + build_timing_list(**param) + + param = dict(class_count=15, + class_size=1000, + list_count_per_class=300, + large_list_sizes=(100, 1000), + small_list_sizes=(0, 100), + large_list_probability=0.5, + filename='./lists/timing_2.txt') + build_timing_list(**param) + + param = dict(class_count=15, + class_size=1000, + list_count_per_class=300, + large_list_sizes=(100, 1000), + small_list_sizes=(0, 100), + large_list_probability=0.1, + filename='./lists/timing_3.txt') + build_timing_list(**param) + print('done') # =============== @@ -144,8 +145,7 @@ def build_timing_list(filename, def timing(bench, number): print('\nTiming with: >> {} << Benchmark'.format(bench.__class__.__name__)) print('Info: {}'.format(bench.info)) - setup = bench.setup - + print('-- Press Ctrl-C to skip a test --\n') times = [] @@ -154,9 +154,21 @@ def timing(bench, number): print('timing: {} '.format(value.__doc__), end='') sys.stdout.flush() try: - t = timeit.timeit("{}(deepcopy(lsts))".format(name), - setup=setup, - number=number) + # We pass number to repeat and leave number to 1. + # This ensures the setup is repeated before every + # iteration. 
We put `bench.lsts` into the execution + # namespace. The setup does the deepcopy into `lists` + # (and this deepcopy doesn't count in timings). The + # benched execution uses the deepcopied list. This way, + # if any function has side effects, they don't impact + # other runs. + t = mean(timeit.repeat( + f'{name}(lists)', + setup=bench.setup, + number=1, + repeat=number, + globals={'lsts': bench.lsts, **core.__dict__} + )) except KeyboardInterrupt: print(' skipped.') try: @@ -164,14 +176,17 @@ def timing(bench, number): except KeyboardInterrupt: print('Two fast Ctrl-C - exiting') sys.exit(0) - + else: times.append((t, value.__doc__)) print(' -- {:0.4f} -- '.format(t)) print('\nTiming Results:') - for t,name in sorted(times): - print('{:0.3f} -- {}'.format(t,name)) + times = sorted(times) + best_t, best_name = times[0] + for t, name in times: + factor = t / best_t + print(f'{t:0.3f} ({factor:.2g}x) -- {name}') if __name__ == '__main__': @@ -182,36 +197,8 @@ def timing(bench, number): args = parser.parse_args() if args.new: - print('building test list (for Nik test) ... ', end='') - sys.stdout.flush() - param = dict(class_count = 50, - class_size = 1000, - list_count_per_class = 100, - large_list_sizes = (100, 1000), - small_list_sizes = (0, 100), - large_list_probability = 0.5, - filename = './lists/timing_1.txt') - build_timing_list(**param) - - param = dict(class_count = 15, - class_size = 1000, - list_count_per_class = 300, - large_list_sizes = (100, 1000), - small_list_sizes = (0, 100), - large_list_probability = 0.5, - filename = './lists/timing_2.txt') - build_timing_list(**param) - - param = dict(class_count = 15, - class_size = 1000, - list_count_per_class = 300, - large_list_sizes = (100, 1000), - small_list_sizes = (0, 100), - large_list_probability = 0.1, - filename = './lists/timing_3.txt') - build_timing_list(**param) - print('done') - + build_all_timing_lists() + timing(Niklas('./lists/timing_1.txt'), number=3) timing(Niklas('./lists/timing_2.txt'), number=3) timing(Niklas('./lists/timing_3.txt'), number=3) From 5336c27ac6247f45971ef8dbf65c2c9a82c6fd3c Mon Sep 17 00:00:00 2001 From: MrTatsugoro Date: Mon, 5 Jun 2023 23:03:10 +0200 Subject: [PATCH 3/8] Fix test.py Don't assume the return value is a list (this fixes a couple of tests where the tested code wasn't at fault). Some tests still fail but that may be due to the tested code. --- core.py | 12 +++++++----- test.py | 4 ++-- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/core.py b/core.py index e670237..677a404 100644 --- a/core.py +++ b/core.py @@ -100,10 +100,13 @@ def pairs(lst): prev = item yield item, first + def kat_merge(lsts): """katrielalex""" g = networkx.Graph() for sub_list in lsts: + if not sub_list: + continue for edge in pairs(sub_list): g.add_edge(*edge) @@ -206,7 +209,6 @@ def che_merge(lsts): def locatebin(bins, n): """Find the bin where list n has ended up: Follow bin references until we find a bin that has not moved. 
- """ while bins[n] != n: n = bins[n] @@ -218,7 +220,7 @@ def ale_merge(data): bins = list(range(len(data))) # Initialize each bin[n] == n nums = dict() - data = [set(m) for m in data ] # Convert to sets + data = [set(m) for m in data] # Convert to sets for r, row in enumerate(data): for num in row: if num not in nums: @@ -233,11 +235,11 @@ def ale_merge(data): if dest > r: dest, r = r, dest # always merge into the smallest bin - data[dest].update(data[r]) + data[dest].update(data[r]) data[r] = None # Update our indices to reflect the move bins[r] = dest - r = dest + r = dest # Filter out the empty bins have = [ m for m in data if m ] @@ -245,7 +247,7 @@ def ale_merge(data): return have -def nik_rew_merge_skip(lsts): +def nik_rew_merge(lsts): """Nik's rewrite""" sets = list(map(set,lsts)) results = [] diff --git a/test.py b/test.py index 5c442a2..43ba1b2 100644 --- a/test.py +++ b/test.py @@ -9,12 +9,12 @@ class MergeTestCase(unittest.TestCase): def setUp(self): with open('./lists/test_list.txt') as f: self.lsts = json.loads(f.read()) - self.merged = self.merge_func(deepcopy(self.lsts)) + self.merged = list(self.merge_func(deepcopy(self.lsts))) def test_disjoint(self): """Check disjoint-ness of merged results""" from itertools import combinations - for a,b in combinations(self.merged, 2): + for a, b in combinations(self.merged, 2): self.assertTrue(a.isdisjoint(b)) def test_coverage(self): # Credit to katrielalex From b2a94eca8c928ca98e4c1d4ec4e6d596b3666eee Mon Sep 17 00:00:00 2001 From: MrTatsugoro Date: Mon, 5 Jun 2023 23:04:09 +0200 Subject: [PATCH 4/8] Add profiler script. --- README | 8 +++++- profiler.py | 66 ++++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 1 + 3 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 profiler.py diff --git a/README b/README index 36983dc..59d7133 100644 --- a/README +++ b/README @@ -33,6 +33,13 @@ All the merge functions are in core.py. After that it will be auto loaded and everything will be taken care of on its own. +Profile a function/list combo +----------------------------- + +$ python3 profiler.py -f alexis -l timing_1 + +This will open a snakeviz visualisation of the execution profile in your browser at the end. + Test all the functions ---------------------- @@ -47,4 +54,3 @@ $ python3 timing.py - Ctrl-C to skip a test. - Two fast Ctrl-C to exit. - diff --git a/profiler.py b/profiler.py new file mode 100644 index 0000000..095db0f --- /dev/null +++ b/profiler.py @@ -0,0 +1,66 @@ +from argparse import ArgumentParser +import cProfile +import json +from pathlib import Path +import random +import subprocess + +import core +from timing import build_all_timing_lists + + +if __name__ == '__main__': + parser = ArgumentParser( + prog='Algorithm profiler', + description='Profile any given function from "core.py", ' + 'with any dataset from "lists/".', + ) + parser.add_argument( + '-f', '--function', dest='function', action='store', required=True, type=str, + help='Name of the function to profile. You can give the function\'s full name ' + '("ale_merge"), the function\'s abbreviated name ("ale") or the function\'s ' + 'display name ("alexis", as found in the docstring).' + ) + parser.add_argument( + '-l', '--list', dest='list', action='store', required=True, type=str, + help='Name of the list to profile with (e.g.: "timing_1" or "timing_1.txt").' + ) + parser.add_argument( + '-n', '--new', dest='new', action='store_true', default=False, + help='Rebuild all "timing_*.txt" test lists.' 
+ ) + args = parser.parse_args() + if args.new: + build_all_timing_lists() + + func_name: str = args.function.strip() + func = getattr(core, func_name, getattr(core, f'{func_name}_merge', None)) + if not func: + for obj in core.__dict__.values(): + if func_name == getattr(obj, '__doc__', None): + func = obj + break + assert callable(func), f'Object {func.__name__} is not a function.' + + list_name: str = args.list.strip() + if not list_name.endswith('.txt'): + list_name = f'{list_name}.txt' + list_path = Path('.', 'lists', list_name) + if list_name in ('sven_list.txt', 'test_list.txt'): + lists = json.loads(list_path.read_text()) + elif list_name == 'agf_list.txt': + lists = [ + random.sample(range(10000), random.randint(0, 500)) + for _ in range(2000) + ] + else: + with open(list_path, 'r') as f: + lists = [[int(x) for x in line.split()] for line in f] + assert len(lists) > 0, 'It would be better if the dataset had some data.' + + prof_file_name = f'{func.__name__}_{list_name}.prof' + cProfile.runctx('func(lists)', globals(), locals(), prof_file_name) + try: + subprocess.run(['snakeviz', prof_file_name]) + except KeyboardInterrupt: + exit() diff --git a/requirements.txt b/requirements.txt index 2bd69c8..b81d898 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ networkx==3.1 +snakeviz==2.2.0 From bf4f40d4d4fbcf02dd7f05ef09c9a986892fd2b4 Mon Sep 17 00:00:00 2001 From: MrTatsugoro Date: Mon, 5 Jun 2023 22:55:16 +0200 Subject: [PATCH 5/8] Add takeshi_merge version --- core.py | 48 +++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/core.py b/core.py index 677a404..bfae726 100644 --- a/core.py +++ b/core.py @@ -3,11 +3,12 @@ # # Evey function with a name ending with '_merge' will be auto loaded - -import networkx +from collections import deque import heapq from itertools import chain -from collections import deque +from typing import Iterable, Iterator, TypeVar + +import networkx def rik_merge(lsts): @@ -266,3 +267,44 @@ def nik_rew_merge(lsts): else: results.append(first) return results + + +T = TypeVar('T') + + +def takeshi_merge(lists: Iterator[Iterable[T]]) -> list[set[T]]: + """takeshi""" + bins: dict[T: set[T]] = dict() + bin_refs: dict[T: T] = dict() + for lst in lists: + if not lst: + continue + + # Gather the bin refs of all items in the list that we have + # already seen. + encountered_items_bin_refs = { + bin_refs[item] + for item in lst + if item in bin_refs + } + if len(encountered_items_bin_refs) >= 1: + # Some of the items in `lst` have already been seen in a + # previous iteration. They are therefore already attached + # to a bin. Select any of their corresponding bin ref. + bin_ref = encountered_items_bin_refs.pop() + # If the previously-seen items were not all attached to the + # same bin, their respective bins need to be merged into + # the selected one. + if len(encountered_items_bin_refs) > 0: + to_merge_bins = [bins.pop(ref) for ref in encountered_items_bin_refs] + bins[bin_ref].update(chain(*to_merge_bins)) + bin_refs.update({item: bin_ref for item in chain(*to_merge_bins)}) + bins[bin_ref].update(lst) + else: + # None of the items in `lst` have already been seen in a + # previous iteration. Therefore, we can safely pick any + # item as our new bin ref and create the corresponding bin. 
+ bin_ref = next(iter(lst)) + bins[bin_ref] = set(lst) + bin_refs.update({item: bin_ref for item in lst}) + return list(bins.values()) From d69d63f127ff72ffe43685acf8b1a1853dc4fb9f Mon Sep 17 00:00:00 2001 From: MrTatsugoro Date: Tue, 6 Jun 2023 01:18:52 +0200 Subject: [PATCH 6/8] faster setup --- timing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/timing.py b/timing.py index f76feb2..892a2fb 100644 --- a/timing.py +++ b/timing.py @@ -17,7 +17,7 @@ class Benchmark: def __init__(self): self.lsts = [] self.info = '' - self.setup = 'from copy import deepcopy\nlists = deepcopy(lsts)' + self.setup = 'lists = [[item for item in lst] for lst in lsts]' self.load() self.build_info() From f12f841deffb50f43552f5a124793fa80d218166 Mon Sep 17 00:00:00 2001 From: MrTatsugoro Date: Tue, 6 Jun 2023 01:31:10 +0200 Subject: [PATCH 7/8] Optimization dict comps were heavier than I thought. --- core.py | 6 ++++-- timing.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/core.py b/core.py index bfae726..1fd01e6 100644 --- a/core.py +++ b/core.py @@ -298,7 +298,8 @@ def takeshi_merge(lists: Iterator[Iterable[T]]) -> list[set[T]]: if len(encountered_items_bin_refs) > 0: to_merge_bins = [bins.pop(ref) for ref in encountered_items_bin_refs] bins[bin_ref].update(chain(*to_merge_bins)) - bin_refs.update({item: bin_ref for item in chain(*to_merge_bins)}) + for item in chain(*to_merge_bins): + bin_refs[item] = bin_ref bins[bin_ref].update(lst) else: # None of the items in `lst` have already been seen in a @@ -306,5 +307,6 @@ def takeshi_merge(lists: Iterator[Iterable[T]]) -> list[set[T]]: # item as our new bin ref and create the corresponding bin. bin_ref = next(iter(lst)) bins[bin_ref] = set(lst) - bin_refs.update({item: bin_ref for item in lst}) + for item in lst: + bin_refs[item] = bin_ref return list(bins.values()) diff --git a/timing.py b/timing.py index 892a2fb..339165e 100644 --- a/timing.py +++ b/timing.py @@ -186,7 +186,8 @@ def timing(bench, number): best_t, best_name = times[0] for t, name in times: factor = t / best_t - print(f'{t:0.3f} ({factor:.2g}x) -- {name}') + fmt = '.2g' if factor < 99 else '.0f' + print(f'{t:0.3f} ({factor:{fmt}}x) -- {name}') if __name__ == '__main__': From 59bf3fc8c4751c54e98346934fba06585d9edb73 Mon Sep 17 00:00:00 2001 From: MrTatsugoro Date: Wed, 7 Jun 2023 21:22:27 +0200 Subject: [PATCH 8/8] fix typing --- core.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core.py b/core.py index 1fd01e6..d636447 100644 --- a/core.py +++ b/core.py @@ -6,7 +6,7 @@ from collections import deque import heapq from itertools import chain -from typing import Iterable, Iterator, TypeVar +from typing import Iterable, TypeVar import networkx @@ -272,7 +272,7 @@ def nik_rew_merge(lsts): T = TypeVar('T') -def takeshi_merge(lists: Iterator[Iterable[T]]) -> list[set[T]]: +def takeshi_merge(lists: Iterable[Iterable[T]]) -> list[set[T]]: """takeshi""" bins: dict[T: set[T]] = dict() bin_refs: dict[T: T] = dict()
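
For reference, the timing approach described in PATCH 2 can be reproduced in isolation. In the sketch below, dummy_merge, the sample data and the repeat count are made-up placeholders (they are not part of core.py or of these patches); it only illustrates how timeit.repeat(number=1, repeat=n) re-runs the setup, and therefore the copy of the input, before every measured call, so a merge function's side effects cannot leak from one run into the next while the copy itself stays out of the measured time.

    from statistics import mean
    import timeit

    def dummy_merge(lists):
        """Trivial stand-in for a *_merge function; note that it empties its input."""
        return [set(lists.pop()) for _ in range(len(lists))]

    lsts = [[1, 2], [2, 3], [10, 11], [11, 12], [20]]

    # number=1 makes every measured call its own timing run, and the setup
    # (which rebuilds `lists` from the pristine `lsts`) runs before each one,
    # so the copy never counts towards the measured time.
    runs = timeit.repeat(
        'dummy_merge(lists)',
        setup='from copy import deepcopy\nlists = deepcopy(lsts)',
        number=1,
        repeat=5,
        globals={'lsts': lsts, 'dummy_merge': dummy_merge},
    )
    print(f'mean of 5 runs: {mean(runs):.6f}s')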
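
Similarly, the PATCH 7 observation that filling bin_refs with a plain loop beats building an intermediate dict for dict.update() can be sanity-checked with a rough micro-benchmark along these lines (the sizes, repeat counts and the two helper functions below are arbitrary stand-ins, not code from core.py):

    import timeit

    items = list(range(1000))
    bin_ref = 0

    def fill_with_comprehension():
        refs = {}
        # Builds a temporary dict, then copies it into refs.
        refs.update({item: bin_ref for item in items})
        return refs

    def fill_with_loop():
        refs = {}
        # Writes straight into refs, no temporary dict.
        for item in items:
            refs[item] = bin_ref
        return refs

    print('dict comp + update:', min(timeit.repeat(fill_with_comprehension, number=1000)))
    print('plain loop:        ', min(timeit.repeat(fill_with_loop, number=1000)))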