Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion README
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@ All the merge functions are in core.py.

After that it will be auto loaded and everything will be taken care of on its own.

Profile a function/list combo
-----------------------------

$ python3 profiler.py -f alexis -l timing_1

This will open a snakeviz visualisation of the execution profile in your browser at the end.

Test all the functions
----------------------

Expand All @@ -47,4 +54,3 @@ $ python3 timing.py

- Ctrl-C to skip a test.
- Press Ctrl-C twice in quick succession to exit.

62 changes: 54 additions & 8 deletions core.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,12 @@
#
# Every function with a name ending in '_merge' will be auto-loaded


import networkx
from collections import deque
import heapq
from itertools import chain
from collections import deque
from typing import Iterable, TypeVar

import networkx


def rik_merge(lsts):
Expand Down Expand Up @@ -100,10 +101,13 @@ def pairs(lst):
prev = item
yield item, first


def kat_merge(lsts):
"""katrielalex"""
g = networkx.Graph()
for sub_list in lsts:
if not sub_list:
continue
for edge in pairs(sub_list):
g.add_edge(*edge)

Expand Down Expand Up @@ -206,7 +210,6 @@ def che_merge(lsts):
def locatebin(bins, n):
"""Find the bin where list n has ended up: Follow bin references until
we find a bin that has not moved.

"""
while bins[n] != n:
n = bins[n]
Expand All @@ -218,7 +221,7 @@ def ale_merge(data):
bins = list(range(len(data))) # Initialize each bin[n] == n
nums = dict()

data = [set(m) for m in data ] # Convert to sets
data = [set(m) for m in data] # Convert to sets
for r, row in enumerate(data):
for num in row:
if num not in nums:
Expand All @@ -233,19 +236,19 @@ def ale_merge(data):
if dest > r:
dest, r = r, dest # always merge into the smallest bin

data[dest].update(data[r])
data[dest].update(data[r])
data[r] = None
# Update our indices to reflect the move
bins[r] = dest
r = dest
r = dest

# Filter out the empty bins
have = [ m for m in data if m ]
#print len(have), "groups in result" #removed this line
return have


def nik_rew_merge_skip(lsts):
def nik_rew_merge(lsts):
"""Nik's rewrite"""
sets = list(map(set,lsts))
results = []
Expand All @@ -264,3 +267,46 @@ def nik_rew_merge_skip(lsts):
else:
results.append(first)
return results


T = TypeVar('T')


def takeshi_merge(lists: Iterable[Iterable[T]]) -> list[set[T]]:
    """takeshi

    Merge the input collections into pairwise-disjoint sets: any two
    inputs that share at least one item end up in the same output set.

    :param lists: collections of hashable items; empty ones are skipped.
    :return: the merged, pairwise-disjoint sets.
    """
    # bin ref (representative item) -> set of every item merged into that bin.
    # Fixed annotation: ``dict[T: set[T]]`` subscripted dict with a slice;
    # ``dict[T, set[T]]`` is the intended key/value form.
    bins: dict[T, set[T]] = dict()
    # item -> bin ref of the bin currently holding that item.
    bin_refs: dict[T, T] = dict()
    for lst in lists:
        # Materialize once: the signature admits any Iterable, and the body
        # below iterates ``lst`` several times (a one-shot iterator would
        # otherwise be silently exhausted after the first pass).
        lst = list(lst)
        if not lst:
            continue

        # Gather the bin refs of all items in the list that we have
        # already seen.
        encountered_items_bin_refs = {
            bin_refs[item]
            for item in lst
            if item in bin_refs
        }
        if len(encountered_items_bin_refs) >= 1:
            # Some of the items in `lst` have already been seen in a
            # previous iteration. They are therefore already attached
            # to a bin. Select any of their corresponding bin ref.
            bin_ref = encountered_items_bin_refs.pop()
            # If the previously-seen items were not all attached to the
            # same bin, their respective bins need to be merged into
            # the selected one.
            if len(encountered_items_bin_refs) > 0:
                to_merge_bins = [bins.pop(ref) for ref in encountered_items_bin_refs]
                bins[bin_ref].update(chain(*to_merge_bins))
                for item in chain(*to_merge_bins):
                    bin_refs[item] = bin_ref
            bins[bin_ref].update(lst)
            # BUG FIX: items of ``lst`` never seen before must also be
            # registered in ``bin_refs``; otherwise a later list made only
            # of such items would open a second bin containing them again,
            # breaking the disjointness of the result.
            for item in lst:
                bin_refs[item] = bin_ref
        else:
            # None of the items in `lst` have already been seen in a
            # previous iteration. Therefore, we can safely pick any
            # item as our new bin ref and create the corresponding bin.
            bin_ref = lst[0]
            bins[bin_ref] = set(lst)
            for item in lst:
                bin_refs[item] = bin_ref
    return list(bins.values())
66 changes: 66 additions & 0 deletions profiler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
from argparse import ArgumentParser
import cProfile
import json
from pathlib import Path
import random
import subprocess

import core
from timing import build_all_timing_lists


if __name__ == '__main__':
    parser = ArgumentParser(
        prog='Algorithm profiler',
        description='Profile any given function from "core.py", '
                    'with any dataset from "lists/".',
    )
    parser.add_argument(
        '-f', '--function', dest='function', action='store', required=True, type=str,
        help='Name of the function to profile. You can give the function\'s full name '
             '("ale_merge"), the function\'s abbreviated name ("ale") or the function\'s '
             'display name ("alexis", as found in the docstring).'
    )
    parser.add_argument(
        '-l', '--list', dest='list', action='store', required=True, type=str,
        help='Name of the list to profile with (e.g.: "timing_1" or "timing_1.txt").'
    )
    parser.add_argument(
        '-n', '--new', dest='new', action='store_true', default=False,
        help='Rebuild all "timing_*.txt" test lists.'
    )
    args = parser.parse_args()
    if args.new:
        build_all_timing_lists()

    func_name: str = args.function.strip()
    # Resolve by full name ("ale_merge") first, then abbreviated name ("ale").
    func = getattr(core, func_name, getattr(core, f'{func_name}_merge', None))
    if func is None:
        # Finally, resolve by display name: each merge function's docstring
        # holds its author's name (e.g. "alexis").
        for obj in core.__dict__.values():
            if func_name == getattr(obj, '__doc__', None):
                func = obj
                break
    # parser.error() instead of assert: asserts are stripped under
    # "python -O", and the previous message raised AttributeError when
    # func was None (None has no __name__) instead of reporting the error.
    if not callable(func):
        parser.error(f'No function matching "{func_name}" found in core.py.')

    list_name: str = args.list.strip()
    if not list_name.endswith('.txt'):
        list_name = f'{list_name}.txt'
    list_path = Path('.', 'lists', list_name)
    if list_name in ('sven_list.txt', 'test_list.txt'):
        # These two fixtures are stored as JSON.
        lists = json.loads(list_path.read_text())
    elif list_name == 'agf_list.txt':
        # agf's dataset is random by construction: regenerate it each run.
        lists = [
            random.sample(range(10000), random.randint(0, 500))
            for _ in range(2000)
        ]
    else:
        # Remaining datasets are whitespace-separated ints, one list per line.
        with open(list_path, 'r') as f:
            lists = [[int(x) for x in line.split()] for line in f]
    if not lists:
        parser.error('It would be better if the dataset had some data.')

    prof_file_name = f'{func.__name__}_{list_name}.prof'
    cProfile.runctx('func(lists)', globals(), locals(), prof_file_name)
    try:
        # Opens the interactive visualisation in the browser; blocks until
        # the user stops snakeviz.
        subprocess.run(['snakeviz', prof_file_name])
    except KeyboardInterrupt:
        # raise SystemExit rather than exit(): the latter is a `site`
        # convenience and may be absent (e.g. under "python -S").
        raise SystemExit
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
networkx==3.1
snakeviz==2.2.0
4 changes: 2 additions & 2 deletions test.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,12 @@ class MergeTestCase(unittest.TestCase):
def setUp(self):
    """Prepare one test: load the fixture lists from disk and run the
    merge function under test on a private copy of them."""
    with open('./lists/test_list.txt') as fixture:
        self.lsts = json.load(fixture)
    # deepcopy shields the fixture from in-place mutation by the merge
    # function; list() normalizes implementations that return iterators.
    self.merged = list(self.merge_func(deepcopy(self.lsts)))

def test_disjoint(self):
    """Check disjoint-ness of merged results"""
    from itertools import combinations
    # Every unordered pair of result sets must share no element.
    for left, right in combinations(self.merged, 2):
        self.assertTrue(left.isdisjoint(right))

def test_coverage(self): # Credit to katrielalex
Expand Down
Loading