From 46df017e6083f5feb32676905151c8d1018d96b5 Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Tue, 13 Aug 2024 17:39:34 +0200 Subject: [PATCH 01/17] Commented out njit optimizations for debugging --- omamer/main.py | 4 ++++ omamer/merge_search.py | 9 +++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/omamer/main.py b/omamer/main.py index cc7a6ca..ef7bd16 100644 --- a/omamer/main.py +++ b/omamer/main.py @@ -304,3 +304,7 @@ def get_thread_count(): args.func(args) else: parser.print_usage() + + +if __name__ == "__main__": + main() diff --git a/omamer/merge_search.py b/omamer/merge_search.py index f0fb765..e9f942a 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -159,7 +159,7 @@ def get_fam_level_offsets(fam_ent, level_arr): ## search functions -@numba.njit +#@numba.njit def custom_unique1d(ar): """ adapted from np._unique1d for numba @@ -176,7 +176,7 @@ def custom_unique1d(ar): return aux[mask], perm[mask], np.diff(idx) -@numba.njit +#@numba.njit def parse_seq(s, DIGITS_AA_LOOKUP, n_kmers, k, trans, x_flag): """ get the sequence unique k-mers and non ambiguous locations (when truly unique) @@ -196,7 +196,7 @@ def parse_seq(s, DIGITS_AA_LOOKUP, n_kmers, k, trans, x_flag): return custom_unique1d(r) -@numba.njit +#@numba.njit def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff): """ Perform the kmer search, using the index. @@ -766,4 +766,5 @@ def func( c[int(choice)] if choice_score != 0.0 else 0 ) - return numba.jit(func, parallel=True, nopython=True, nogil=True) + return func + #return numba.jit(func, parallel=True, nopython=True, nogil=True) From 45404330debb36ace242eb12c753eff6bf850e29 Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Wed, 21 Aug 2024 15:41:18 +0200 Subject: [PATCH 02/17] Linear-time k-mer encoding --- omamer/merge_search.py | 55 ++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/omamer/merge_search.py b/omamer/merge_search.py index e9f942a..39b729f 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -20,7 +20,7 @@ You should have received a copy of the GNU Lesser General Public License along with OMAmer. If not, see . """ -from Rmath4 import pbinom +from Rmath4 import pbinom, phyper from numba.tests.support import captured_stdout from property_manager import lazy_property, cached_property from time import time @@ -56,6 +56,10 @@ def binom_neglogccdf(x, n, p): # return -1.0 * np.log(sc.bdtrc(np.float64(x - 1), n, p)) return -1.0 * pbinom(x - 1, n, p, 0, 1) + # pbinom(n - 1, m, m/N, 0, 0) + # phyper(n - 1, m, N - m, m, 0, 0) + #return -1.0 * phyper(x - 1, n, n/p - n, n, 0, 1) + # ---- # family result sorting @@ -159,7 +163,7 @@ def get_fam_level_offsets(fam_ent, level_arr): ## search functions -#@numba.njit +@numba.njit def custom_unique1d(ar): """ adapted from np._unique1d for numba @@ -176,27 +180,48 @@ def custom_unique1d(ar): return aux[mask], perm[mask], np.diff(idx) -#@numba.njit +@numba.njit def parse_seq(s, DIGITS_AA_LOOKUP, n_kmers, k, trans, x_flag): """ get the sequence unique k-mers and non ambiguous locations (when truly unique) """ s_norm = DIGITS_AA_LOOKUP[s] r = np.zeros(n_kmers, dtype=np.uint32) # max kmer 7 - for i in numba.prange(n_kmers): - # numba can't do np.dot with non-float - for j in numba.prange(k): - r[i] += trans[j] * s_norm[i + j] - - # does k-mer contain any X? 
- x_seen = np.any(s_norm[i : i + k] == DIGITS_AA_LOOKUP[88]) # 88==b'X' - # if yes, replace it by the x_flag + + # compute the code of the first k-mer + for j in range(k): + r[0] += trans[j] * s_norm[j] + + # does k-mer contain any X? + x_seen = np.any(s_norm[0:k] == DIGITS_AA_LOOKUP[88]) + # if yes, replace it by the x_flag + r[0] = r[0] if not x_seen else x_flag + + # codes for other k-mers + for i in range(1, n_kmers): + if not x_seen: + # if the previous k-mer was valid, + # recompute the current code from the previous one + + # remove the first character from the code + shared = r[i-1] - (trans[0] * s_norm[i - 1]) + + # trans[-2] is the alphabet size + r[i] = shared * trans[-2] + trans[-1] * s_norm[i + k - 1] + + else: + # if the previous k-mer has Xs, + # just compute the code from scratch + for j in range(k): + r[i] += trans[j] * s_norm[i + j] + + x_seen = np.any(s_norm[i: i + k] == DIGITS_AA_LOOKUP[88]) r[i] = r[i] if not x_seen else x_flag return custom_unique1d(r) -#@numba.njit +@numba.njit def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff): """ Perform the kmer search, using the index. @@ -663,7 +688,7 @@ def func( correction_factor = np.log(len(ref_fam_prob)) for i in numba.prange(len(qres)): qres["pvalue"][i] = min( - MAX_LOGP, + float(MAX_LOGP), max( 0.0, ( @@ -766,5 +791,5 @@ def func( c[int(choice)] if choice_score != 0.0 else 0 ) - return func - #return numba.jit(func, parallel=True, nopython=True, nogil=True) + return numba.jit(func, parallel=True, nopython=True, nogil=True) + #return func From 1a087000b14398fdd15bfb2eec1b030ae7948d9f Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Wed, 21 Aug 2024 16:32:09 +0200 Subject: [PATCH 03/17] Got rid of sorting k-mers in parse_seq --- omamer/merge_search.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/omamer/merge_search.py b/omamer/merge_search.py index 39b729f..4bc548d 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -22,6 +22,7 @@ """ from Rmath4 import pbinom, phyper from numba.tests.support import captured_stdout +from numba.typed import List from property_manager import lazy_property, cached_property from time import time import numba @@ -180,6 +181,24 @@ def custom_unique1d(ar): return aux[mask], perm[mask], np.diff(idx) +@numba.njit +def unique1d_linear(array): + """ + Find a set of unique elements in linear time + """ + unique_list = List() + index_list = List() + seen = set() + + for i in range(len(array)): + if array[i] not in seen: + seen.add(array[i]) + unique_list.append(array[i]) + index_list.append(i) + + return np.asarray(unique_list), np.asarray(index_list), None + + @numba.njit def parse_seq(s, DIGITS_AA_LOOKUP, n_kmers, k, trans, x_flag): """ @@ -218,7 +237,7 @@ def parse_seq(s, DIGITS_AA_LOOKUP, n_kmers, k, trans, x_flag): x_seen = np.any(s_norm[i: i + k] == DIGITS_AA_LOOKUP[88]) r[i] = r[i] if not x_seen else x_flag - return custom_unique1d(r) + return unique1d_linear(r) @numba.njit From bf741f4e48fa412fdbad20c08403ed91c60fe23c Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Mon, 26 Aug 2024 15:19:08 +0200 Subject: [PATCH 04/17] First implementation of the quickselect algorithm for filtering results --- omamer/merge_search.py | 244 +++++++++++++++++++++++++++++++++-------- 1 file changed, 199 insertions(+), 45 deletions(-) diff --git a/omamer/merge_search.py b/omamer/merge_search.py index 4bc548d..9121717 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -22,7 +22,7 @@ """ from 
Rmath4 import pbinom, phyper from numba.tests.support import captured_stdout -from numba.typed import List +from numba.typed import List, Dict from property_manager import lazy_property, cached_property from time import time import numba @@ -85,6 +85,52 @@ def fam_res_compare(x1, x2): return 0 +@numba.njit +def fam_res_less(x1, x2): + """ + Same as fam_res_compare, but operates as '<' + """ + if x1["normcount"] != x2["normcount"]: + return x1["normcount"] < x2["normcount"] + + if x1["overlap"] != x2["overlap"]: + return x1["overlap"] < x2["overlap"] + + if x1["pvalue"] != x2["pvalue"]: + return x1["pvalue"] < x2["pvalue"] + + return False + + +@numba.njit +def fam_res_le(x1, x2): + """ + Same as fam_res_compare, but operates as '<=' + """ + if not fam_res_less(x1, x2): + return x1["normcount"] == x2["normcount"] and \ + x1["overlap"] == x2["overlap"] and \ + x1["pvalue"] == x2["pvalue"] + + return True + + +@numba.njit +def fam_res_greater(x1, x2): + """ + Same as fam_res_compare, but operates as '>' + """ + return not fam_res_le(x1, x2) + + +@numba.njit +def fam_res_ge(x1, x2): + """ + Same as fam_res_compare, but operates as '>=' + """ + return not fam_res_less(x1, x2) + + @numba.njit def family_result_argsort(x, ii): """ @@ -120,17 +166,98 @@ def family_result_argsort(x, ii): @numba.njit -def family_result_sort(x): +def family_result_sort(x, k): """ Sort the family results according to normcount, overlap and pvalue (to break ties). this uses a quicksort implementation as np.argsort does not support struct type in numba. """ - # perform the family result sorting. - idx = family_result_argsort(x, list(range(len(x)))) - y = np.zeros_like(x) - for i in numba.prange(len(x)): - y[i] = x[idx[i]] - return y + + if len(x) <= k: + #idx = family_result_argsort(x, list(range(len(x)))) + #y = np.zeros_like(x) + #for i in numba.prange(len(x)): + # y[i] = x[idx[i]] + return x + + _ = _select(x, k, 0, len(x) - 1) + return x[:k] + + +@numba.njit +def _swap(data, i, j): + # Manually swap each field. I don't know how to do it more + # generally under @numba.njit, as numba seems to have problems + # with indexing structured arrays + temp = np.zeros(1, dtype=data.dtype)[0] + temp['id'] = data[i]['id'] + temp['pvalue'] = data[i]['pvalue'] + temp['count'] = data[i]['count'] + temp['normcount'] = data[i]['normcount'] + temp['overlap'] = data[i]['overlap'] + + data[i]['id'] = data[j]['id'] + data[i]['pvalue'] = data[j]['pvalue'] + data[i]['count'] = data[j]['count'] + data[i]['normcount'] = data[j]['normcount'] + data[i]['overlap'] = data[j]['overlap'] + + data[j]['id'] = temp['id'] + data[j]['pvalue'] = temp['pvalue'] + data[j]['count'] = temp['count'] + data[j]['normcount'] = temp['normcount'] + data[j]['overlap'] = temp['overlap'] + + +@numba.njit +def _partition(A, low, high, mid): + if mid == -1: + mid = (low + high) >> 1 + # NOTE: the pattern of swaps below for the pivot choice and the + # partitioning gives good results (i.e. regular O(n log n)) + # on sorted, reverse-sorted, and uniform arrays. Subtle changes + # risk breaking this property. 
+ + # Use median of three {low, middle, high} as the pivot + if fam_res_greater(A[mid], A[low]): + _swap(A, mid, low) + if fam_res_greater(A[high], A[mid]): + _swap(A, high, mid) + if fam_res_greater(A[mid], A[low]): + _swap(A, low, mid) + + pivot = np.zeros(1, dtype=A.dtype)[0] + pivot['id'] = A[mid]['id'] + pivot['pvalue'] = A[mid]['pvalue'] + pivot['count'] = A[mid]['count'] + pivot['normcount'] = A[mid]['normcount'] + pivot['overlap'] = A[mid]['overlap'] + + _swap(A, high, mid) + + i = low + for j in range(low, high): + if fam_res_greater(A[j], pivot): + _swap(A, i, j) + i += 1 + + _swap(A, i, high) + return i + + +@numba.njit +def _select(array, k, low, high): + """ + Select the k'th largest element + """ + i = _partition(array, low, high, -1) + while i != k: + if i < k: + low = i + 1 + i = _partition(array, low, high, -1) + else: + high = i - 1 + i = _partition(array, low, high, -1) + return array[k] # ---- @@ -196,7 +323,7 @@ def unique1d_linear(array): unique_list.append(array[i]) index_list.append(i) - return np.asarray(unique_list), np.asarray(index_list), None + return np.asarray(unique_list), np.asarray(index_list, dtype=np.uint32), None @numba.njit @@ -245,12 +372,12 @@ def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff): """ Perform the kmer search, using the index. """ - hog_counts = np.zeros(hog_tab.size, dtype=np.uint16) - fam_counts = np.zeros(fam_tab.size, dtype=np.uint16) + hog_counts = Dict.empty(key_type=numba.uint32, value_type=numba.int64) + fam_counts = Dict.empty(key_type=numba.uint32, value_type=numba.int64) # store lowest and higher k-mer locations - fam_lowloc = np.full(fam_tab.size, -1, dtype=np.int32) - fam_highloc = np.full(fam_tab.size, -1, dtype=np.int32) + fam_lowloc = Dict.empty(key_type=numba.uint32, value_type=numba.uint32) + fam_highloc = Dict.empty(key_type=numba.uint32, value_type=numba.uint32) # iterate unique k-mers for m in range(r1.shape[0]): @@ -260,30 +387,39 @@ def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff): # to ignore k-mers with X if kmer == x_flag: continue - else: - pass # get mapping to HOGs - x = table_idx[kmer : kmer + 2] - hogs = table_buff[x[0] : x[1]] + x = table_idx[kmer: kmer + 2] + hogs = table_buff[x[0]: x[1]] fams = hog_tab["FamOff"][hogs] - hog_counts[hogs] += np.uint16(1) - fam_counts[fams] += np.uint16(1) - # store lowest and highest locations + for hog in hogs: + if hog in hog_counts: + hog_counts[hog] += 1 + else: + hog_counts[hog] = 1 + for fam_off in fams: - # initiate first location - if fam_lowloc[fam_off] == -1: - fam_lowloc[fam_off] = loc - fam_highloc[fam_off] = loc + if fam_off in fam_counts: + fam_counts[fam_off] += 1 + else: + fam_counts[fam_off] = 1 - # update either lower or higher boundary - elif loc < fam_lowloc[fam_off]: + if fam_off in fam_lowloc: + if loc < fam_lowloc[fam_off]: + fam_lowloc[fam_off] = loc + else: + # initiate first location fam_lowloc[fam_off] = loc - elif loc > fam_highloc[fam_off]: + + if fam_off in fam_highloc: + if loc > fam_highloc[fam_off]: + fam_highloc[fam_off] = loc + else: + # initiate first location fam_highloc[fam_off] = loc - return (hog_counts, fam_counts, fam_lowloc, fam_highloc) + return hog_counts, fam_counts, fam_lowloc, fam_highloc ## functions to cumulate HOG k-mer counts @@ -421,6 +557,24 @@ def get_closest_taxa_from_ref(q2hog_off, ref_taxoff, tax_tab, hog_tab, chog_buff return q2closest_taxon +@numba.njit +def unzip_dict(data: numba.typed.Dict): + keys = np.zeros(len(data)) + values = np.zeros_like(keys) + for 
i, key in enumerate(data): + keys[i] = key + values[i] = data[key] + return keys, values + + +@numba.njit +def dict_slice(data: numba.typed.Dict, start: np.uint32, end: np.uint32) -> np.ndarray: + result = np.zeros(end - start, dtype=np.uint32) + for j in range(start, end): + result[j - start] = data.get(j, np.uint32(0)) + return result + + class MergeSearch(object): def __init__(self, ki, include_extant_genes=False): assert ki.db.db.mode == "r", "Database must be opened in read mode." @@ -693,15 +847,15 @@ def func( continue # search using kmers - (hog_counts, fam_counts, fam_lowloc, fam_highloc) = search_seq_kmers( + hog_counts, fam_counts, fam_lowloc, fam_highloc = search_seq_kmers( r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff ) # identify families of interest. note: repeat(zeros_like..) numba hack - idx = np.argwhere(fam_counts > 0).flatten() - qres = np.repeat(np.zeros_like(family_results[zz, 0]), len(idx)) - qres["id"][:] = idx - qres["count"][:] = fam_counts[idx] + count_keys, counts_values = unzip_dict(fam_counts) + qres = np.repeat(np.zeros_like(family_results[zz, 0]), len(count_keys)) + qres["id"][:] = count_keys + qres["count"][:] = counts_values # 1. compute p-value for each family. note: in negative log units correction_factor = np.log(len(ref_fam_prob)) @@ -732,9 +886,12 @@ def func( continue # - b. filter on sequence coverage - qres["overlap"][:] = ( - fam_highloc[qres["id"]] - fam_lowloc[qres["id"]] + k - ) / query_len + overlap = np.zeros(len(qres)) + for i in range(len(qres)): + family_id = qres["id"][i] + overlap[i] = (fam_highloc[family_id] - fam_lowloc[family_id] + k) / query_len + + qres["overlap"][:] = overlap qres = qres[(qres["overlap"] >= (25 / query_len))] if len(qres) == 0: @@ -751,18 +908,14 @@ def func( # 4. Store results # - a. sort by normcount, then overlap, then p-value for tie-breaking - qres = family_result_sort(qres) + qres = family_result_sort(qres, top_n_fams) # - b. store results family_results["id"][zz, :top_n_fams] = qres["id"][:top_n_fams] + 1 family_results["pvalue"][zz, :top_n_fams] = qres["pvalue"][:top_n_fams] family_results["count"][zz, :top_n_fams] = qres["count"][:top_n_fams] - family_results["normcount"][zz, :top_n_fams] = qres["normcount"][ - :top_n_fams - ] - family_results["overlap"][zz, :top_n_fams] = qres["overlap"][ - :top_n_fams - ] + family_results["normcount"][zz, :top_n_fams] = qres["normcount"][:top_n_fams] + family_results["overlap"][zz, :top_n_fams] = qres["overlap"][:top_n_fams] # 5. 
Place within families
             for i in numba.prange(min(len(qres), top_n_fams)):
@@ -779,7 +932,8 @@ def func(
                 fam_level_offsets = get_fam_level_offsets(entry, level_arr)

                 # cumulation of counts
-                c = hog_counts[hog_s:hog_e].copy()
+                c = dict_slice(hog_counts, hog_s, hog_e)
+
                 cumulate_counts_1fam(c, fam_level_offsets, fam_hog2parent)

                 # new expected count, but using old cumulation
@@ -788,7 +942,7 @@ def func(
                     r1.size,
                     fam_level_offsets,
                     fam_hog2parent,
-                    hog_counts[hog_s:hog_e],
+                    dict_slice(hog_counts, hog_s, hog_e),
                     ref_hog_prob[hog_s:hog_e],
                 )

From ff4d96827a747ca9297964633ba51a73bbb63b99 Mon Sep 17 00:00:00 2001
From: Nikolai Romashchenko
Date: Mon, 26 Aug 2024 16:28:55 +0200
Subject: [PATCH 05/17] EPIK-like algorithm for family placement,
 parallelization off

---
 omamer/merge_search.py | 94 +++++++++++++++++++++++++-----------------
 1 file changed, 56 insertions(+), 38 deletions(-)

diff --git a/omamer/merge_search.py b/omamer/merge_search.py
index 9121717..6a3940e 100644
--- a/omamer/merge_search.py
+++ b/omamer/merge_search.py
@@ -172,11 +172,13 @@ def family_result_sort(x, k):
     this uses a quicksort implementation as np.argsort does not support struct type in numba.
     """
+
+    idx = family_result_argsort(x, list(range(len(x))))
+    y = np.zeros_like(x)
+    for i in numba.prange(len(x)):
+        y[i] = x[idx[i]]
+    return y
+
     if len(x) <= k:
         return x

     _ = _select(x, k, 0, len(x) - 1)
     return x[:k]
@@ -370,16 +370,25 @@ def parse_seq(s, DIGITS_AA_LOOKUP, n_kmers, k, trans, x_flag):


 @numba.njit
-def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff):
+def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff,
+                     hog_counts, fam_counts, fam_lowloc, fam_highloc,
+                     deinit_fam_list, deinit_hog_list):
     """
     Perform the kmer search, using the index.
""" - hog_counts = Dict.empty(key_type=numba.uint32, value_type=numba.int64) - fam_counts = Dict.empty(key_type=numba.uint32, value_type=numba.int64) - # store lowest and higher k-mer locations - fam_lowloc = Dict.empty(key_type=numba.uint32, value_type=numba.uint32) - fam_highloc = Dict.empty(key_type=numba.uint32, value_type=numba.uint32) + # reinitialize counters for the indexes modified + # by the previous call + for family in deinit_fam_list: + fam_counts[family] = 0 + fam_lowloc[family] = -1 + fam_highloc[family] = -1 + + for hog in deinit_hog_list: + hog_counts[hog] = 0 + + deinit_fam_list = List.empty_list(numba.int32) + deinit_hog_list = List.empty_list(numba.int32) # iterate unique k-mers for m in range(r1.shape[0]): @@ -394,32 +405,29 @@ def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff): fams = hog_tab["FamOff"][hogs] for hog in hogs: - if hog in hog_counts: - hog_counts[hog] += 1 - else: - hog_counts[hog] = 1 + if not hog_counts[hog]: + deinit_hog_list.append(hog) + + hog_counts[hog] += 1 for fam_off in fams: - if fam_off in fam_counts: - fam_counts[fam_off] += 1 - else: - fam_counts[fam_off] = 1 + if not fam_counts[fam_off]: + deinit_fam_list.append(fam_off) - if fam_off in fam_lowloc: - if loc < fam_lowloc[fam_off]: - fam_lowloc[fam_off] = loc - else: - # initiate first location + fam_counts[fam_off] += 1 + + # initiate first location + if fam_lowloc[fam_off] == -1: fam_lowloc[fam_off] = loc + fam_highloc[fam_off] = loc - if fam_off in fam_highloc: - if loc > fam_highloc[fam_off]: - fam_highloc[fam_off] = loc - else: - # initiate first location + # update either lower or higher boundary + elif loc < fam_lowloc[fam_off]: + fam_lowloc[fam_off] = loc + elif loc > fam_highloc[fam_off]: fam_highloc[fam_off] = loc - return hog_counts, fam_counts, fam_lowloc, fam_highloc + return deinit_fam_list, deinit_hog_list ## functions to cumulate HOG k-mer counts @@ -827,6 +835,13 @@ def func( # flags to ignore k-mers containing X x_flag = table_idx.size - 1 + hog_counts = np.zeros(hog_tab.size, dtype=np.uint16) + fam_counts = np.zeros(fam_tab.size, dtype=np.uint16) + fam_lowloc = np.full(fam_tab.size, -1, dtype=np.int32) + fam_highloc = np.full(fam_tab.size, -1, dtype=np.int32) + deinit_fam_list = List.empty_list(numba.int32) + deinit_hog_list = List.empty_list(numba.int32) + for zz in numba.prange(len(seqs_idx) - 1): # load seq s = seqs[seqs_idx[zz] : np.int64(seqs_idx[zz + 1] - 1)] @@ -847,15 +862,18 @@ def func( continue # search using kmers - hog_counts, fam_counts, fam_lowloc, fam_highloc = search_seq_kmers( - r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff + deinit_fam_list, deinit_hog_list = search_seq_kmers( + r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff, + hog_counts, fam_counts, fam_lowloc, fam_highloc, + deinit_fam_list, deinit_hog_list ) # identify families of interest. note: repeat(zeros_like..) numba hack - count_keys, counts_values = unzip_dict(fam_counts) - qres = np.repeat(np.zeros_like(family_results[zz, 0]), len(count_keys)) - qres["id"][:] = count_keys - qres["count"][:] = counts_values + idx = np.argwhere(fam_counts > 0).flatten() + #idx = deinit_fam_list + qres = np.repeat(np.zeros_like(family_results[zz, 0]), len(idx)) + qres["id"][:] = idx + qres["count"][:] = fam_counts[idx] # 1. compute p-value for each family. 
note: in negative log units correction_factor = np.log(len(ref_fam_prob)) @@ -932,7 +950,7 @@ def func( fam_level_offsets = get_fam_level_offsets(entry, level_arr) # cumulation of counts - c = dict_slice(hog_counts, hog_s, hog_e) + c = hog_counts[hog_s:hog_e].copy() cumulate_counts_1fam(c, fam_level_offsets, fam_hog2parent) @@ -942,7 +960,7 @@ def func( r1.size, fam_level_offsets, fam_hog2parent, - dict_slice(hog_counts, hog_s, hog_e), + hog_counts[hog_s:hog_e], ref_hog_prob[hog_s:hog_e], ) @@ -964,5 +982,5 @@ def func( c[int(choice)] if choice_score != 0.0 else 0 ) - return numba.jit(func, parallel=True, nopython=True, nogil=True) + return numba.jit(func, parallel=False, nopython=True, nogil=True) #return func From 1624f1d041dd3464988fdcfb4f764cffd4a8bb1d Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Wed, 28 Aug 2024 11:25:47 +0200 Subject: [PATCH 06/17] Changed the order of family filtering: start with overlap --- omamer/merge_search.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/omamer/merge_search.py b/omamer/merge_search.py index 6a3940e..fc5b7b2 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -875,6 +875,19 @@ def func( qres["id"][:] = idx qres["count"][:] = fam_counts[idx] + # 2. Filtering + # - b. filter on sequence coverage + overlap = np.zeros(len(qres)) + for i in range(len(qres)): + family_id = qres["id"][i] + overlap[i] = (fam_highloc[family_id] - fam_lowloc[family_id] + k) / query_len + + qres["overlap"][:] = overlap + + qres = qres[(qres["overlap"] >= (25 / query_len))] + if len(qres) == 0: + continue + # 1. compute p-value for each family. note: in negative log units correction_factor = np.log(len(ref_fam_prob)) for i in numba.prange(len(qres)): @@ -893,25 +906,12 @@ def func( ), ) - # 2. Filtering # - a. filter to significant families (on p-value) alpha = -1.0 * np.log(alpha_cutoff) qres = qres[qres["pvalue"] >= alpha] # filter out 0 neg log p. alpha > 0 is normal. alpha = 0 is edge case. qres = qres if alpha > 0 else qres[qres["pvalue"] > 0] - if len(qres) == 0: - continue - - # - b. filter on sequence coverage - overlap = np.zeros(len(qres)) - for i in range(len(qres)): - family_id = qres["id"][i] - overlap[i] = (fam_highloc[family_id] - fam_lowloc[family_id] + k) / query_len - - qres["overlap"][:] = overlap - - qres = qres[(qres["overlap"] >= (25 / query_len))] if len(qres) == 0: continue From cddd0d30a9034eb420a8b5c876b469a176b88a32 Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Thu, 29 Aug 2024 10:03:52 +0200 Subject: [PATCH 07/17] Modified quickselect to base it on indexes not data --- omamer/merge_search.py | 106 ++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 66 deletions(-) diff --git a/omamer/merge_search.py b/omamer/merge_search.py index fc5b7b2..6f2349d 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -171,95 +171,69 @@ def family_result_sort(x, k): Sort the family results according to normcount, overlap and pvalue (to break ties). this uses a quicksort implementation as np.argsort does not support struct type in numba. """ - - idx = family_result_argsort(x, list(range(len(x)))) - y = np.zeros_like(x) - for i in numba.prange(len(x)): - y[i] = x[idx[i]] - return y - - if len(x) <= k: - return x - - _ = _select(x, k, 0, len(x) - 1) - return x[:k] + idx = np.arange(len(x)) + _ = _select(x, idx, k, 0, len(x) - 1) + return x[idx[:k]] @numba.njit -def _swap(data, i, j): - # Manually swap each field. 
I don't know how to do it more - # generally under @numba.njit, as numba seems to have problems - # with indexing structured arrays - temp = np.zeros(1, dtype=data.dtype)[0] - temp['id'] = data[i]['id'] - temp['pvalue'] = data[i]['pvalue'] - temp['count'] = data[i]['count'] - temp['normcount'] = data[i]['normcount'] - temp['overlap'] = data[i]['overlap'] - - data[i]['id'] = data[j]['id'] - data[i]['pvalue'] = data[j]['pvalue'] - data[i]['count'] = data[j]['count'] - data[i]['normcount'] = data[j]['normcount'] - data[i]['overlap'] = data[j]['overlap'] - - data[j]['id'] = temp['id'] - data[j]['pvalue'] = temp['pvalue'] - data[j]['count'] = temp['count'] - data[j]['normcount'] = temp['normcount'] - data[j]['overlap'] = temp['overlap'] +def _swap(array, i, j): + tmp = array[i] + array[i] = array[j] + array[j] = tmp @numba.njit -def _partition(A, low, high, mid): - if mid == -1: - mid = (low + high) >> 1 - # NOTE: the pattern of swaps below for the pivot choice and the - # partitioning gives good results (i.e. regular O(n log n)) - # on sorted, reverse-sorted, and uniform arrays. Subtle changes - # risk breaking this property. +def _partition(x, idx, low, high): + """ + Index-based version of the partition algorithm of + quicksort. Juggles indexes of the idx array that + indexes the x array, considering the range [low, high]. + """ + mid = (low + high) >> 1 # Use median of three {low, middle, high} as the pivot - if fam_res_greater(A[mid], A[low]): - _swap(A, mid, low) - if fam_res_greater(A[high], A[mid]): - _swap(A, high, mid) - if fam_res_greater(A[mid], A[low]): - _swap(A, low, mid) - - pivot = np.zeros(1, dtype=A.dtype)[0] - pivot['id'] = A[mid]['id'] - pivot['pvalue'] = A[mid]['pvalue'] - pivot['count'] = A[mid]['count'] - pivot['normcount'] = A[mid]['normcount'] - pivot['overlap'] = A[mid]['overlap'] - - _swap(A, high, mid) - + if fam_res_greater(x[idx[mid]], x[idx[low]]): + _swap(idx, mid, low) + if fam_res_greater(x[idx[high]], x[idx[mid]]): + _swap(idx, high, mid) + if fam_res_greater(x[idx[mid]], x[idx[low]]): + _swap(idx, low, mid) + + pivot = x[idx[mid]] + # Put the pivot in the end of the array + _swap(idx, mid, high) + + # Collect elements that are > pivot in the beginning i = low for j in range(low, high): - if fam_res_greater(A[j], pivot): - _swap(A, i, j) + if fam_res_greater(x[idx[j]], pivot): + _swap(idx, i, j) i += 1 - _swap(A, i, high) + # All indexes of idx in [0, i) are good now. 
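+    # (each of these indexes points at an element that compares greater
+    # than the pivot under the normcount/overlap/pvalue ordering)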
+ # Let's place the pivot back where it should be + _swap(idx, i, high) return i @numba.njit -def _select(array, k, low, high): +def _select(x, idx, k, low, high): """ - Select the k'th largest element + Select the k'th largest element of the x array """ - i = _partition(array, low, high, -1) + if k >= len(x): + return len(x) + + i = _partition(x, idx, low, high) while i != k: if i < k: low = i + 1 - i = _partition(array, low, high, -1) + i = _partition(x, idx, low, high) else: high = i - 1 - i = _partition(array, low, high, -1) - return array[k] + i = _partition(x, idx, low, high) + return idx[k] # ---- From d41a63da027b0723bb3778f6929adb2be78da9e8 Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Thu, 29 Aug 2024 10:33:30 +0200 Subject: [PATCH 08/17] Added in-jit time measurements of placement steps elapsed time --- omamer/_clock.py | 8 ++++++++ omamer/main.py | 5 +++++ omamer/merge_search.py | 44 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 omamer/_clock.py diff --git a/omamer/_clock.py b/omamer/_clock.py new file mode 100644 index 0000000..8d57dc6 --- /dev/null +++ b/omamer/_clock.py @@ -0,0 +1,8 @@ +""" CPU-time returning clock() function which works from within njit-ted code """ +import ctypes +from ctypes.util import find_library + +__LIB = find_library("c") + +clock = ctypes.CDLL(__LIB).clock +clock.argtypes = [] diff --git a/omamer/main.py b/omamer/main.py index cc7a6ca..d7aa561 100644 --- a/omamer/main.py +++ b/omamer/main.py @@ -265,6 +265,7 @@ def get_thread_count(): help="Show metadata about an omamer database.", description="Show metadata about an existing omamer database", ) + info_parser.set_defaults(func=info_db) info_parser.add_argument( "-d", @@ -304,3 +305,7 @@ def get_thread_count(): args.func(args) else: parser.print_usage() + + +if __name__ == "__main__": + main() diff --git a/omamer/merge_search.py b/omamer/merge_search.py index f0fb765..2b4b502 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -38,6 +38,7 @@ is_taxon_implied, get_children, ) +from ._clock import clock # maximum neglogp to set @@ -629,7 +630,18 @@ def func( # flags to ignore k-mers containing X x_flag = table_idx.size - 1 + parse_time = 0.0 + search_time = 0.0 + filter_time = 0.0 + pvalue_time = 0.0 + place_time = 0.0 + sort_time = 0.0 + total_time = 0.0 + for zz in numba.prange(len(seqs_idx) - 1): + + t0 = clock() + # load seq s = seqs[seqs_idx[zz] : np.int64(seqs_idx[zz + 1] - 1)] query_len = s.shape[0] @@ -648,11 +660,19 @@ def func( elif r1[0] == x_flag: continue + t1 = clock() + parse_time += t1 - t0 + t0 = clock() + # search using kmers (hog_counts, fam_counts, fam_lowloc, fam_highloc) = search_seq_kmers( r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff ) + t1 = clock() + search_time += t1 - t0 + t0 = clock() + # identify families of interest. note: repeat(zeros_like..) numba hack idx = np.argwhere(fam_counts > 0).flatten() qres = np.repeat(np.zeros_like(family_results[zz, 0]), len(idx)) @@ -677,6 +697,10 @@ def func( ), ) + t1 = clock() + pvalue_time += t1 - t0 + t0 = clock() + # 2. Filtering # - a. filter to significant families (on p-value) alpha = -1.0 * np.log(alpha_cutoff) @@ -705,6 +729,10 @@ def func( len(r1) - top_fam_expect_counts ) + t1 = clock() + filter_time += t1 - t0 + t0 = clock() + # 4. Store results # - a. 
sort by normcount, then overlap, then p-value for tie-breaking qres = family_result_sort(qres) @@ -720,6 +748,10 @@ def func( :top_n_fams ] + t1 = clock() + sort_time += t1 - t0 + t0 = clock() + # 5. Place within families for i in numba.prange(min(len(qres), top_n_fams)): entry = fam_tab[qres["id"][i]] @@ -766,4 +798,14 @@ def func( c[int(choice)] if choice_score != 0.0 else 0 ) - return numba.jit(func, parallel=True, nopython=True, nogil=True) + t1 = clock() + place_time += t1 - t0 + + print("Parse time\t", parse_time) + print("Search time\t", search_time) + print("Filter time\t", filter_time) + print("Pvalue time\t", pvalue_time) + print("Place time\t", place_time) + print("Sort time\t", sort_time) + + return numba.jit(func, parallel=False, nopython=True, nogil=True) From 8cf9763cfb9c22ca57428a5574c91ec072d5c874 Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Thu, 29 Aug 2024 10:51:14 +0200 Subject: [PATCH 09/17] Improved time measurements --- omamer/_clock.py | 14 ++++++++++---- omamer/merge_search.py | 19 ++++++++++++------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/omamer/_clock.py b/omamer/_clock.py index 8d57dc6..e02e1bf 100644 --- a/omamer/_clock.py +++ b/omamer/_clock.py @@ -1,8 +1,14 @@ -""" CPU-time returning clock() function which works from within njit-ted code """ import ctypes -from ctypes.util import find_library -__LIB = find_library("c") +# Access the _PyTime_AsSecondsDouble and _PyTime_GetSystemClock functions from pythonapi +clock = ctypes.pythonapi._PyTime_GetSystemClock +as_seconds = ctypes.pythonapi._PyTime_AsSecondsDouble -clock = ctypes.CDLL(__LIB).clock +# Set the argument types and return types of the functions clock.argtypes = [] +clock.restype = ctypes.c_int64 + +as_seconds.argtypes = [ctypes.c_int64] +as_seconds.restype = ctypes.c_double + + diff --git a/omamer/merge_search.py b/omamer/merge_search.py index 2b4b502..006f476 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -38,7 +38,7 @@ is_taxon_implied, get_children, ) -from ._clock import clock +from ._clock import clock, as_seconds # maximum neglogp to set @@ -801,11 +801,16 @@ def func( t1 = clock() place_time += t1 - t0 - print("Parse time\t", parse_time) - print("Search time\t", search_time) - print("Filter time\t", filter_time) - print("Pvalue time\t", pvalue_time) - print("Place time\t", place_time) - print("Sort time\t", sort_time) + total_time = parse_time + search_time + filter_time + pvalue_time + place_time + sort_time + + print() + print("Parse time\t", as_seconds(parse_time)) + print("Search time\t", as_seconds(search_time)) + print("Filter time\t", as_seconds(filter_time)) + print("Pvalue time\t", as_seconds(pvalue_time)) + print("Place time\t", as_seconds(place_time)) + print("Sort time\t", as_seconds(sort_time)) + print("Batch total\t", as_seconds(total_time)) + print() return numba.jit(func, parallel=False, nopython=True, nogil=True) From e15c65e7444f65076571e622690d212f79e50885 Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Thu, 29 Aug 2024 11:43:22 +0200 Subject: [PATCH 10/17] Reimplemented list-based query search with arrays --- omamer/merge_search.py | 48 ++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/omamer/merge_search.py b/omamer/merge_search.py index 457383c..d6cd66d 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -347,23 +347,23 @@ def parse_seq(s, DIGITS_AA_LOOKUP, n_kmers, k, trans, x_flag): @numba.njit def search_seq_kmers(r1, p1, hog_tab, 
fam_tab, x_flag, table_idx, table_buff, hog_counts, fam_counts, fam_lowloc, fam_highloc, - deinit_fam_list, deinit_hog_list): + hit_fams, num_hit_fams, hit_hogs, num_hit_hogs): """ Perform the kmer search, using the index. """ - - # reinitialize counters for the indexes modified - # by the previous call - for family in deinit_fam_list: + # Reinitialize counters modified by the previous call + for i in range(num_hit_fams): + family = hit_fams[i] fam_counts[family] = 0 fam_lowloc[family] = -1 fam_highloc[family] = -1 - for hog in deinit_hog_list: + for i in range(num_hit_hogs): + hog = hit_hogs[i] hog_counts[hog] = 0 - deinit_fam_list = List.empty_list(numba.int32) - deinit_hog_list = List.empty_list(numba.int32) + num_hit_fams = 0 + num_hit_hogs = 0 # iterate unique k-mers for m in range(r1.shape[0]): @@ -381,13 +381,15 @@ def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff, for hog in hogs: if not hog_counts[hog]: - deinit_hog_list.append(hog) + hit_hogs[num_hit_hogs] = hog + num_hit_hogs += 1 hog_counts[hog] += 1 for fam_off in fams: if not fam_counts[fam_off]: - deinit_fam_list.append(fam_off) + hit_fams[num_hit_fams] = fam_off + num_hit_fams += 1 fam_counts[fam_off] += 1 @@ -402,7 +404,7 @@ def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff, elif loc > fam_highloc[fam_off]: fam_highloc[fam_off] = loc - return deinit_fam_list, deinit_hog_list + return hit_fams, num_hit_fams, hit_hogs, num_hit_hogs ## functions to cumulate HOG k-mer counts @@ -557,7 +559,6 @@ def dict_slice(data: numba.typed.Dict, start: np.uint32, end: np.uint32) -> np.n result[j - start] = data.get(j, np.uint32(0)) return result - class MergeSearch(object): def __init__(self, ki, include_extant_genes=False): assert ki.db.db.mode == "r", "Database must be opened in read mode." @@ -814,8 +815,10 @@ def func( fam_counts = np.zeros(fam_tab.size, dtype=np.uint16) fam_lowloc = np.full(fam_tab.size, -1, dtype=np.int32) fam_highloc = np.full(fam_tab.size, -1, dtype=np.int32) - deinit_fam_list = List.empty_list(numba.int32) - deinit_hog_list = List.empty_list(numba.int32) + hit_fams = np.zeros(fam_tab.size, dtype=np.int32) + hit_hogs = np.zeros(hog_tab.size, dtype=np.int32) + num_hit_fams = 0 + num_hit_hogs = 0 parse_time = 0.0 search_time = 0.0 @@ -852,23 +855,22 @@ def func( t0 = clock() # search using kmers - deinit_fam_list, deinit_hog_list = search_seq_kmers( + hit_fams, num_hit_fams, hit_hogs, num_hit_hogs = search_seq_kmers( r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff, hog_counts, fam_counts, fam_lowloc, fam_highloc, - deinit_fam_list, deinit_hog_list + hit_fams, num_hit_fams, hit_hogs, num_hit_hogs ) - t1 = clock() - search_time += t1 - t0 - t0 = clock() - - # identify families of interest. note: repeat(zeros_like..) numba hack - idx = np.argwhere(fam_counts > 0).flatten() - #idx = deinit_fam_list + # Identify families of interest + idx = hit_fams[:num_hit_fams] qres = np.repeat(np.zeros_like(family_results[zz, 0]), len(idx)) qres["id"][:] = idx qres["count"][:] = fam_counts[idx] + t1 = clock() + search_time += t1 - t0 + t0 = clock() + # 2. Filtering # - b. 
filter on sequence coverage overlap = np.zeros(len(qres)) From 53c897e20ecdd590997844c2199d1b391c974372 Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Thu, 29 Aug 2024 12:39:43 +0200 Subject: [PATCH 11/17] Parallel version of search_seq_kmers based on hit families --- omamer/merge_search.py | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/omamer/merge_search.py b/omamer/merge_search.py index d6cd66d..1f2b292 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -404,7 +404,7 @@ def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff, elif loc > fam_highloc[fam_off]: fam_highloc[fam_off] = loc - return hit_fams, num_hit_fams, hit_hogs, num_hit_hogs + return num_hit_fams, num_hit_hogs ## functions to cumulate HOG k-mer counts @@ -811,14 +811,15 @@ def func( # flags to ignore k-mers containing X x_flag = table_idx.size - 1 - hog_counts = np.zeros(hog_tab.size, dtype=np.uint16) - fam_counts = np.zeros(fam_tab.size, dtype=np.uint16) - fam_lowloc = np.full(fam_tab.size, -1, dtype=np.int32) - fam_highloc = np.full(fam_tab.size, -1, dtype=np.int32) - hit_fams = np.zeros(fam_tab.size, dtype=np.int32) - hit_hogs = np.zeros(hog_tab.size, dtype=np.int32) - num_hit_fams = 0 - num_hit_hogs = 0 + num_threads = numba.get_num_threads() + thread_hog_counts = np.zeros((num_threads, hog_tab.size), dtype=np.uint16) + thread_fam_counts = np.zeros((num_threads, fam_tab.size), dtype=np.uint16) + thread_fam_lowloc = np.full((num_threads, fam_tab.size), -1, dtype=np.int32) + thread_fam_highloc = np.full((num_threads, fam_tab.size), -1, dtype=np.int32) + thread_hit_fams = np.zeros((num_threads, fam_tab.size), dtype=np.int32) + thread_hit_hogs = np.zeros((num_threads, hog_tab.size), dtype=np.int32) + thread_num_hit_fams = np.zeros(num_threads, dtype=np.uint32) + thread_num_hit_hogs = np.zeros(num_threads, dtype=np.uint32) parse_time = 0.0 search_time = 0.0 @@ -829,7 +830,6 @@ def func( total_time = 0.0 for zz in numba.prange(len(seqs_idx) - 1): - t0 = clock() # load seq @@ -854,13 +854,26 @@ def func( parse_time += t1 - t0 t0 = clock() + # Get thread-local data structures for search + hit_fams = thread_hit_fams[numba.get_thread_id()] + hit_hogs = thread_hit_hogs[numba.get_thread_id()] + hog_counts = thread_hog_counts[numba.get_thread_id()] + fam_counts = thread_fam_counts[numba.get_thread_id()] + fam_lowloc = thread_fam_lowloc[numba.get_thread_id()] + fam_highloc = thread_fam_highloc[numba.get_thread_id()] + num_hit_fams = thread_num_hit_fams[numba.get_thread_id()] + num_hit_hogs = thread_num_hit_hogs[numba.get_thread_id()] + # search using kmers - hit_fams, num_hit_fams, hit_hogs, num_hit_hogs = search_seq_kmers( + num_hit_fams, num_hit_hogs = search_seq_kmers( r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff, hog_counts, fam_counts, fam_lowloc, fam_highloc, hit_fams, num_hit_fams, hit_hogs, num_hit_hogs ) + thread_num_hit_fams[numba.get_thread_id()] = num_hit_fams + thread_num_hit_hogs[numba.get_thread_id()] = num_hit_hogs + # Identify families of interest idx = hit_fams[:num_hit_fams] qres = np.repeat(np.zeros_like(family_results[zz, 0]), len(idx)) @@ -1010,4 +1023,4 @@ def func( print("Batch total\t", as_seconds(total_time)) print() - return numba.jit(func, parallel=False, nopython=True, nogil=True) + return numba.jit(func, parallel=True, nopython=True, nogil=True) From 6c05d11ac4a01e521540de7e4395bcc1d57c8561 Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Fri, 30 Aug 2024 14:58:11 +0200 
Subject: [PATCH 12/17] Chernoff filtering --- omamer/merge_search.py | 50 +++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/omamer/merge_search.py b/omamer/merge_search.py index 1f2b292..9020894 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -821,13 +821,12 @@ def func( thread_num_hit_fams = np.zeros(num_threads, dtype=np.uint32) thread_num_hit_hogs = np.zeros(num_threads, dtype=np.uint32) - parse_time = 0.0 - search_time = 0.0 - filter_time = 0.0 - pvalue_time = 0.0 - place_time = 0.0 - sort_time = 0.0 - total_time = 0.0 + parse_time = 0 + search_time = 0 + filter_time = 0 + pvalue_time = 0 + place_time = 0 + sort_time = 0 for zz in numba.prange(len(seqs_idx) - 1): t0 = clock() @@ -880,11 +879,35 @@ def func( qres["id"][:] = idx qres["count"][:] = fam_counts[idx] + t1 = clock() search_time += t1 - t0 t0 = clock() - # 2. Filtering + # 2. Fast family filtering + # Filter unlikely families by bounding the p-value with the Chernoff's bound: + # P(X >= k) = P(X >= (1 + d) mu) <= (exp(d) / (1 + d)^(1+d))^mu < Alpha + # Here: k is the number of k-mer hit, mu is the expected number of hits + # d = delta is the considered deviation from the expected value, + # Alpha is the p-value cutoff (in original units, not negative log). + # d = delta = (k - mu) / mu by definition of k. + # If we want P(X >= k) >= Alpha, then we can safely eliminate families + # that have P(X >= k) < Alpha. We know that the probability is bounded + # by Chernoff. For such families, we will have then: + # mu ((1+d) * ln(1+d) - d) < -ln(Alpha) + # This allows us to filter out families based on Alpha without + # actually computing the p-value which is quite expensive. + + # Compute the delta, similar to normcount + expected_count = ref_fam_prob[qres["id"]] * len(r1) + delta = (qres["count"] - expected_count) / expected_count + chernoff_bound = expected_count * ((1 + delta) * np.log(1 + delta) - delta) + + alpha = -1.0 * np.log(alpha_cutoff) + qres = qres[chernoff_bound >= alpha] + if len(qres) == 0: + continue + # - b. filter on sequence coverage overlap = np.zeros(len(qres)) for i in range(len(qres)): @@ -925,7 +948,6 @@ def func( # 2. Filtering # - a. filter to significant families (on p-value) - alpha = -1.0 * np.log(alpha_cutoff) qres = qres[qres["pvalue"] >= alpha] # filter out 0 neg log p. alpha > 0 is normal. alpha = 0 is edge case. qres = qres if alpha > 0 else qres[qres["pvalue"] > 0] @@ -934,12 +956,9 @@ def func( continue # 3. Compute normalised count - # - a. compute the expected count - top_fam_expect_counts = ref_fam_prob[qres["id"]] * len(r1) - - # - b. 
compute the normalised count - qres["normcount"][:] = (qres["count"] - top_fam_expect_counts) / ( - len(r1) - top_fam_expect_counts + expected_count = ref_fam_prob[qres["id"]] * len(r1) + qres["normcount"][:] = (qres["count"] - expected_count) / ( + len(r1) - expected_count ) t1 = clock() @@ -1024,3 +1043,4 @@ def func( print() return numba.jit(func, parallel=True, nopython=True, nogil=True) + #return func From c51cfe7d02fa4b8d88851ecfa3cfddceb0061675 Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Wed, 11 Sep 2024 10:25:11 +0200 Subject: [PATCH 13/17] Tighter Chernoff bound based on KL-div --- omamer/merge_search.py | 119 +++++++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 59 deletions(-) diff --git a/omamer/merge_search.py b/omamer/merge_search.py index 9020894..ff28c08 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -43,11 +43,9 @@ # maximum neglogp to set -MAX_LOGP = 20000 +MAX_LOGP = 20000.0 -# ---- -# stats functions @numba.njit(nogil=True) def binom_neglogccdf(x, n, p): """ @@ -786,24 +784,24 @@ def generate(): @lazy_property def _lookup(self): def func( - family_results, - subfam_results, - seqs, - seqs_idx, - trans, - table_idx, - table_buff, - k, - DIGITS_AA_LOOKUP, - fam_tab, - hog_tab, - level_arr, - top_n_fams, - ref_fam_prob, - ref_hog_prob, - alpha_cutoff, - sst, - family_only, + family_results, + subfam_results, + seqs, + seqs_idx, + trans, + table_idx, + table_buff, + k, + DIGITS_AA_LOOKUP, + fam_tab, + hog_tab, + level_arr, + top_n_fams, + ref_fam_prob, + ref_hog_prob, + alpha_cutoff, + sst, + family_only, ): """ top_n_fams: number of family for which HOG scores are computed @@ -879,44 +877,48 @@ def func( qres["id"][:] = idx qres["count"][:] = fam_counts[idx] - t1 = clock() search_time += t1 - t0 t0 = clock() # 2. Fast family filtering - # Filter unlikely families by bounding the p-value with the Chernoff's bound: - # P(X >= k) = P(X >= (1 + d) mu) <= (exp(d) / (1 + d)^(1+d))^mu < Alpha - # Here: k is the number of k-mer hit, mu is the expected number of hits - # d = delta is the considered deviation from the expected value, - # Alpha is the p-value cutoff (in original units, not negative log). - # d = delta = (k - mu) / mu by definition of k. - # If we want P(X >= k) >= Alpha, then we can safely eliminate families - # that have P(X >= k) < Alpha. We know that the probability is bounded - # by Chernoff. For such families, we will have then: - # mu ((1+d) * ln(1+d) - d) < -ln(Alpha) - # This allows us to filter out families based on Alpha without - # actually computing the p-value which is quite expensive. - - # Compute the delta, similar to normcount - expected_count = ref_fam_prob[qres["id"]] * len(r1) - delta = (qres["count"] - expected_count) / expected_count - chernoff_bound = expected_count * ((1 + delta) * np.log(1 + delta) - delta) + # - a. filter by count. We are only interested in families + # that have at least their expected number of k-mer matches, + # because that number should be already given by random + # (however is still insignificant). + mu = ref_fam_prob[qres["id"]] * len(r1) + qres = qres[qres["count"] >= mu] + + # - b. filter by sequence coverage. There is no point to + # compute p-value for families that are going to be hard + # filtered by coverage later anyway. 
+            for i in range(len(qres)):
+                family_id = qres["id"][i]
+                qres["overlap"][i] = (fam_highloc[family_id] - fam_lowloc[family_id] + k) / query_len

-            alpha = -1.0 * np.log(alpha_cutoff)
-            qres = qres[chernoff_bound >= alpha]
+            qres = qres[(qres["overlap"] >= (25 / query_len))]
             if len(qres) == 0:
                 continue

-            # - b. filter on sequence coverage
-            overlap = np.zeros(len(qres))
-            for i in range(len(qres)):
-                family_id = qres["id"][i]
-                overlap[i] = (fam_highloc[family_id] - fam_lowloc[family_id] + k) / query_len
-
-            qres["overlap"][:] = overlap
+            # - c. Filter by predicted p-value: perform the Chernoff KL-div test,
+            # that is, the Chernoff upper bound for the binomial X:
+            # P(X >= k) <= exp(-n D(k/n || p))
+            # where D is Kullback-Leibler divergence.
+            # First, compute k / n, the empirical proportion of Bernoulli successes.
+            # We need to clip it a little for the case n = k, because it's used
+            # in logarithms later. For the other edge case, we already have k > 0
+            # guaranteed, so we're good here.
+            epsilon = 1e-10
+            n = len(r1)
+            k_n = np.clip(qres["count"] / n, epsilon, 1 - epsilon)
+
+            # Now, by demanding exp(-n KL(k/n || p)) < alpha, we guarantee P < alpha too.
+            # There is a theoretical chance that P < alpha <= bound, and the test will
+            # fail with a false negative. I could not observe any instances of this.
+            p = ref_fam_prob[qres["id"]]
+            kl_div = k_n * np.log(k_n / p) + (1 - k_n) * np.log((1 - k_n) / (1 - p))
+            qres = qres[kl_div > -np.log(alpha_cutoff)/n]

-            qres = qres[(qres["overlap"] >= (25 / query_len))]
             if len(qres) == 0:
                 continue
@@ -926,28 +928,27 @@ def func(
             # 1. compute p-value for each family. note: in negative log units
             correction_factor = np.log(len(ref_fam_prob))
             for i in numba.prange(len(qres)):
                 qres["pvalue"][i] = min(
                     float(MAX_LOGP),
                     max(
                         0.0,
                         (
-                            binom_neglogccdf(
-                                qres["count"][i],
-                                len(r1),
-                                ref_fam_prob[qres["id"][i]],
-                            )
-                            - correction_factor
+                                binom_neglogccdf(
+                                    qres["count"][i],
+                                    len(r1),
+                                    ref_fam_prob[qres["id"][i]],
+                                )
+                                - correction_factor
                         ),
                     ),
                 )

-            # 2. Filtering
-            # - a. filter to significant families (on p-value)
+            # Filter on the actual p-value
+            alpha = -1.0 * np.log(alpha_cutoff)
             qres = qres[qres["pvalue"] >= alpha]
             # filter out 0 neg log p. alpha > 0 is normal. alpha = 0 is edge case.
             qres = qres if alpha > 0 else qres[qres["pvalue"] > 0]
-
             if len(qres) == 0:
                 continue

From 3ae846b99ad165a53fb470a111d88ddc2ba1c4a5 Mon Sep 17 00:00:00 2001
From: Nikolai Romashchenko
Date: Thu, 21 Nov 2024 13:38:49 +0100
Subject: [PATCH 14/17] Restored the final sorting of multiple results per
 query after quickselect

---
 omamer/merge_search.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/omamer/merge_search.py b/omamer/merge_search.py
index ff28c08..e564ec2 100644
--- a/omamer/merge_search.py
+++ b/omamer/merge_search.py
@@ -170,9 +170,19 @@ def family_result_sort(x, k):
     Sort the family results according to normcount, overlap and pvalue (to break ties).
     this uses a quicksort implementation as np.argsort does not support struct type in numba.
""" + + # Quickselect top k results idx = np.arange(len(x)) _ = _select(x, idx, k, 0, len(x) - 1) - return x[idx[:k]] + x = x[idx[:k]] + + # Now mergesort the selected results, because we need to + # report them sorted + idx = family_result_argsort(x, list(range(len(x)))) + y = np.zeros_like(x) + for i in range(len(x)): + y[i] = x[idx[i]] + return y @numba.njit @@ -235,7 +245,6 @@ def _select(x, idx, k, low, high): return idx[k] -# ---- ## generic functions @numba.njit def get_fam_hog2parent(fam_ent, hog_tab): @@ -926,7 +935,7 @@ def func( filter_time += t1 - t0 t0 = clock() - # 1. compute p-value for each family. note: in negative log units + # 3. compute p-value for each family. note: in negative log units correction_factor = np.log(len(ref_fam_prob)) for i in numba.prange(len(qres)): qres["pvalue"][i] = min( @@ -956,7 +965,7 @@ def func( if len(qres) == 0: continue - # 3. Compute normalised count + # 4. Compute normalised count expected_count = ref_fam_prob[qres["id"]] * len(r1) qres["normcount"][:] = (qres["count"] - expected_count) / ( len(r1) - expected_count @@ -966,7 +975,7 @@ def func( filter_time += t1 - t0 t0 = clock() - # 4. Store results + # 5. Store results # - a. sort by normcount, then overlap, then p-value for tie-breaking qres = family_result_sort(qres, top_n_fams) From 8a43650ee415159871d67d9b9184e1843e0fce39 Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Mon, 25 Nov 2024 09:48:29 +0100 Subject: [PATCH 15/17] Clean up --- omamer/main.py | 6 +-- omamer/merge_search.py | 117 ++++++++++++----------------------------- 2 files changed, 36 insertions(+), 87 deletions(-) diff --git a/omamer/main.py b/omamer/main.py index d7aa561..f8272e7 100644 --- a/omamer/main.py +++ b/omamer/main.py @@ -304,8 +304,4 @@ def get_thread_count(): # call the relevant runner func args.func(args) else: - parser.print_usage() - - -if __name__ == "__main__": - main() + parser.print_usage() \ No newline at end of file diff --git a/omamer/merge_search.py b/omamer/merge_search.py index e564ec2..29b33fd 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -818,28 +818,21 @@ def func( # flags to ignore k-mers containing X x_flag = table_idx.size - 1 + # Arrays for thread-local data. 
We allocate them + # in advance to avoid doing so for every query num_threads = numba.get_num_threads() - thread_hog_counts = np.zeros((num_threads, hog_tab.size), dtype=np.uint16) - thread_fam_counts = np.zeros((num_threads, fam_tab.size), dtype=np.uint16) - thread_fam_lowloc = np.full((num_threads, fam_tab.size), -1, dtype=np.int32) - thread_fam_highloc = np.full((num_threads, fam_tab.size), -1, dtype=np.int32) - thread_hit_fams = np.zeros((num_threads, fam_tab.size), dtype=np.int32) - thread_hit_hogs = np.zeros((num_threads, hog_tab.size), dtype=np.int32) - thread_num_hit_fams = np.zeros(num_threads, dtype=np.uint32) - thread_num_hit_hogs = np.zeros(num_threads, dtype=np.uint32) - - parse_time = 0 - search_time = 0 - filter_time = 0 - pvalue_time = 0 - place_time = 0 - sort_time = 0 + hog_counts = np.zeros((num_threads, hog_tab.size), dtype=np.uint16) + fam_counts = np.zeros((num_threads, fam_tab.size), dtype=np.uint16) + fam_lowloc = np.full((num_threads, fam_tab.size), -1, dtype=np.int32) + fam_highloc = np.full((num_threads, fam_tab.size), -1, dtype=np.int32) + hit_fams = np.zeros((num_threads, fam_tab.size), dtype=np.int32) + hit_hogs = np.zeros((num_threads, hog_tab.size), dtype=np.int32) + num_hit_fams = np.zeros(num_threads, dtype=np.uint32) + num_hit_hogs = np.zeros(num_threads, dtype=np.uint32) for zz in numba.prange(len(seqs_idx) - 1): - t0 = clock() - # load seq - s = seqs[seqs_idx[zz] : np.int64(seqs_idx[zz + 1] - 1)] + s = seqs[seqs_idx[zz]: np.int64(seqs_idx[zz + 1] - 1)] query_len = s.shape[0] n_kmers = query_len - (k - 1) @@ -856,39 +849,31 @@ def func( elif r1[0] == x_flag: continue - t1 = clock() - parse_time += t1 - t0 - t0 = clock() - # Get thread-local data structures for search - hit_fams = thread_hit_fams[numba.get_thread_id()] - hit_hogs = thread_hit_hogs[numba.get_thread_id()] - hog_counts = thread_hog_counts[numba.get_thread_id()] - fam_counts = thread_fam_counts[numba.get_thread_id()] - fam_lowloc = thread_fam_lowloc[numba.get_thread_id()] - fam_highloc = thread_fam_highloc[numba.get_thread_id()] - num_hit_fams = thread_num_hit_fams[numba.get_thread_id()] - num_hit_hogs = thread_num_hit_hogs[numba.get_thread_id()] + thread_hit_fams = hit_fams[numba.get_thread_id()] + thread_hit_hogs = hit_hogs[numba.get_thread_id()] + thread_hog_counts = hog_counts[numba.get_thread_id()] + thread_fam_counts = fam_counts[numba.get_thread_id()] + thread_fam_lowloc = fam_lowloc[numba.get_thread_id()] + thread_fam_highloc = fam_highloc[numba.get_thread_id()] + thread_num_hit_fams = num_hit_fams[numba.get_thread_id()] + thread_num_hit_hogs = num_hit_hogs[numba.get_thread_id()] # search using kmers - num_hit_fams, num_hit_hogs = search_seq_kmers( + thread_num_hit_fams, thread_num_hit_hogs = search_seq_kmers( r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff, - hog_counts, fam_counts, fam_lowloc, fam_highloc, - hit_fams, num_hit_fams, hit_hogs, num_hit_hogs + thread_hog_counts, thread_fam_counts, thread_fam_lowloc, thread_fam_highloc, + thread_hit_fams, thread_num_hit_fams, thread_hit_hogs, thread_num_hit_hogs ) - thread_num_hit_fams[numba.get_thread_id()] = num_hit_fams - thread_num_hit_hogs[numba.get_thread_id()] = num_hit_hogs + num_hit_fams[numba.get_thread_id()] = thread_num_hit_fams + num_hit_hogs[numba.get_thread_id()] = thread_num_hit_hogs # Identify families of interest - idx = hit_fams[:num_hit_fams] + idx = thread_hit_fams[:thread_num_hit_fams] qres = np.repeat(np.zeros_like(family_results[zz, 0]), len(idx)) qres["id"][:] = idx - qres["count"][:] = fam_counts[idx] - 
- t1 = clock() - search_time += t1 - t0 - t0 = clock() + qres["count"][:] = thread_fam_counts[idx] # 2. Fast family filtering # - a. filter by count. We are only interested in families @@ -903,7 +888,7 @@ def func( # filtered by coverage later anyway. for i in range(len(qres)): family_id = qres["id"][i] - qres["overlap"][i] = (fam_highloc[family_id] - fam_lowloc[family_id] + k) / query_len + qres["overlap"][i] = (thread_fam_highloc[family_id] - thread_fam_lowloc[family_id] + k) / query_len qres = qres[(qres["overlap"] >= (25 / query_len))] if len(qres) == 0: @@ -931,10 +916,6 @@ def func( if len(qres) == 0: continue - t1 = clock() - filter_time += t1 - t0 - t0 = clock() - # 3. compute p-value for each family. note: in negative log units correction_factor = np.log(len(ref_fam_prob)) for i in numba.prange(len(qres)): @@ -943,20 +924,16 @@ def func( max( 0.0, ( - binom_neglogccdf( - qres["count"][i], - len(r1), - ref_fam_prob[qres["id"][i]], - ) - - correction_factor + binom_neglogccdf( + qres["count"][i], + len(r1), + ref_fam_prob[qres["id"][i]], + ) + - correction_factor ), ), ) - t1 = clock() - pvalue_time += t1 - t0 - t0 = clock() - # Filter on the actual p-value alpha = -1.0 * np.log(alpha_cutoff) qres = qres[qres["pvalue"] >= alpha] @@ -971,10 +948,6 @@ def func( len(r1) - expected_count ) - t1 = clock() - filter_time += t1 - t0 - t0 = clock() - # 5. Store results # - a. sort by normcount, then overlap, then p-value for tie-breaking qres = family_result_sort(qres, top_n_fams) @@ -986,10 +959,6 @@ def func( family_results["normcount"][zz, :top_n_fams] = qres["normcount"][:top_n_fams] family_results["overlap"][zz, :top_n_fams] = qres["overlap"][:top_n_fams] - t1 = clock() - sort_time += t1 - t0 - t0 = clock() - # 5. Place within families for i in numba.prange(min(len(qres), top_n_fams)): entry = fam_tab[qres["id"][i]] @@ -1005,7 +974,7 @@ def func( fam_level_offsets = get_fam_level_offsets(entry, level_arr) # cumulation of counts - c = hog_counts[hog_s:hog_e].copy() + c = thread_hog_counts[hog_s:hog_e].copy() cumulate_counts_1fam(c, fam_level_offsets, fam_hog2parent) @@ -1015,7 +984,7 @@ def func( r1.size, fam_level_offsets, fam_hog2parent, - hog_counts[hog_s:hog_e], + thread_hog_counts[hog_s:hog_e], ref_hog_prob[hog_s:hog_e], ) @@ -1037,20 +1006,4 @@ def func( c[int(choice)] if choice_score != 0.0 else 0 ) - t1 = clock() - place_time += t1 - t0 - - total_time = parse_time + search_time + filter_time + pvalue_time + place_time + sort_time - - print() - print("Parse time\t", as_seconds(parse_time)) - print("Search time\t", as_seconds(search_time)) - print("Filter time\t", as_seconds(filter_time)) - print("Pvalue time\t", as_seconds(pvalue_time)) - print("Place time\t", as_seconds(place_time)) - print("Sort time\t", as_seconds(sort_time)) - print("Batch total\t", as_seconds(total_time)) - print() - return numba.jit(func, parallel=True, nopython=True, nogil=True) - #return func From 0bbb6e9edd06f9a938b3a8416b6088cb1614ef2e Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Mon, 25 Nov 2024 10:38:42 +0100 Subject: [PATCH 16/17] Updated license headers --- omamer/HOGParser.py | 1 + omamer/__init__.py | 3 ++- omamer/_clock.py | 24 ++++++++++++++++++++++++ omamer/_runners.py | 1 + omamer/_utils.py | 1 + omamer/database.py | 1 + omamer/hierarchy.py | 1 + omamer/index.py | 1 + omamer/main.py | 1 + omamer/merge_search.py | 1 + omamer/results_reader.py | 1 + omamer/sequence_buffer.py | 1 + omamer/sequence_reader.py | 1 + setup.py | 1 + 14 files changed, 38 insertions(+), 1 deletion(-) diff 
From 0bbb6e9edd06f9a938b3a8416b6088cb1614ef2e Mon Sep 17 00:00:00 2001
From: Nikolai Romashchenko
Date: Mon, 25 Nov 2024 10:38:42 +0100
Subject: [PATCH 16/17] Updated license headers

---
 omamer/HOGParser.py       |  1 +
 omamer/__init__.py        |  3 ++-
 omamer/_clock.py          | 24 ++++++++++++++++++++++++
 omamer/_runners.py        |  1 +
 omamer/_utils.py          |  1 +
 omamer/database.py        |  1 +
 omamer/hierarchy.py       |  1 +
 omamer/index.py           |  1 +
 omamer/main.py            |  1 +
 omamer/merge_search.py    |  1 +
 omamer/results_reader.py  |  1 +
 omamer/sequence_buffer.py |  1 +
 omamer/sequence_reader.py |  1 +
 setup.py                  |  1 +
 14 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/omamer/HOGParser.py b/omamer/HOGParser.py
index 6c394d4..e938140 100644
--- a/omamer/HOGParser.py
+++ b/omamer/HOGParser.py
@@ -2,6 +2,7 @@
     HOGPROP - Propagation of Gene Ontology (GO) annotations through
     Hierarchical Orthologous Groups (HOGs) from the OMA project.
 
+    (C) 2024 Nikolai Romashchenko
     (C) 2015-2023 Alex Warwick Vesztrocy
 
     This file is part of HOGPROP. It contains a module for parsing an
diff --git a/omamer/__init__.py b/omamer/__init__.py
index 5df936b..8ccc8eb 100644
--- a/omamer/__init__.py
+++ b/omamer/__init__.py
@@ -1,6 +1,7 @@
 """
     OMAmer - tree-driven and alignment-free protein assignment to sub-families
 
+    (C) 2024 Nikolai Romashchenko
     (C) 2022-2023 Alex Warwick Vesztrocy
     (C) 2019-2021 Victor Rossier and
                   Alex Warwick Vesztrocy
@@ -24,6 +25,6 @@
 __packagename__ = "omamer"
 __version__ = "2.0.5"
-__copyright__ = "(C) 2019-{:d} Victor Rossier and Alex Warwick Vesztrocy ".format(
+__copyright__ = "(C) 2019-{:d} Victor Rossier and Alex Warwick Vesztrocy and Nikolai Romashchenko ".format(
     date.today().year
 )
diff --git a/omamer/_clock.py b/omamer/_clock.py
index e02e1bf..68313a6 100644
--- a/omamer/_clock.py
+++ b/omamer/_clock.py
@@ -1,3 +1,27 @@
+"""
+    OMAmer - tree-driven and alignment-free protein assignment to sub-families
+
+    (C) 2024 Nikolai Romashchenko
+    (C) 2022-2023 Alex Warwick Vesztrocy
+    (C) 2019-2021 Victor Rossier and
+                  Alex Warwick Vesztrocy
+
+    This file is part of OMAmer.
+
+    OMAmer is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    OMAmer is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with OMAmer. If not, see <http://www.gnu.org/licenses/>.
+""" + import ctypes # Access the _PyTime_AsSecondsDouble and _PyTime_GetSystemClock functions from pythonapi diff --git a/omamer/_runners.py b/omamer/_runners.py index 799ed25..14ba6fd 100644 --- a/omamer/_runners.py +++ b/omamer/_runners.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/_utils.py b/omamer/_utils.py index 32ffbdb..e1f0ae3 100644 --- a/omamer/_utils.py +++ b/omamer/_utils.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/database.py b/omamer/database.py index ce61f97..b861e05 100644 --- a/omamer/database.py +++ b/omamer/database.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/hierarchy.py b/omamer/hierarchy.py index d4f9dc3..eab4bd6 100644 --- a/omamer/hierarchy.py +++ b/omamer/hierarchy.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/index.py b/omamer/index.py index 323bfe8..ba687cc 100644 --- a/omamer/index.py +++ b/omamer/index.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/main.py b/omamer/main.py index f8272e7..db4e41b 100644 --- a/omamer/main.py +++ b/omamer/main.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/merge_search.py b/omamer/merge_search.py index 29b33fd..9266b8d 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/results_reader.py b/omamer/results_reader.py index b925269..a16bf09 100644 --- a/omamer/results_reader.py +++ b/omamer/results_reader.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/sequence_buffer.py b/omamer/sequence_buffer.py index 8293ac6..c95a79f 100644 --- a/omamer/sequence_buffer.py +++ b/omamer/sequence_buffer.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/sequence_reader.py b/omamer/sequence_reader.py index 539e470..d5297ad 100644 --- a/omamer/sequence_reader.py +++ b/omamer/sequence_reader.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein 
assignment to sub-families
 
+    (C) 2024 Nikolai Romashchenko
     (C) 2022-2023 Alex Warwick Vesztrocy
     (C) 2019-2021 Victor Rossier and
                   Alex Warwick Vesztrocy
diff --git a/setup.py b/setup.py
index fa1f599..2d7ee08 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,7 @@
 """
     OMAmer - tree-driven and alignment-free protein assignment to sub-families
 
+    (C) 2024 Nikolai Romashchenko
     (C) 2022-2023 Alex Warwick Vesztrocy
     (C) 2019-2021 Victor Rossier and
                   Alex Warwick Vesztrocy

From 15070a99ae801ced1da7eba368a7bd9b9e2588a3 Mon Sep 17 00:00:00 2001
From: Nikolai Romashchenko
Date: Mon, 25 Nov 2024 10:48:32 +0100
Subject: [PATCH 17/17] Bump version: 2.0.4 → 2.1.0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 omamer/__init__.py | 2 +-
 setup.cfg          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/omamer/__init__.py b/omamer/__init__.py
index 8ccc8eb..4654e67 100644
--- a/omamer/__init__.py
+++ b/omamer/__init__.py
@@ -24,7 +24,7 @@ from datetime import date
 
 __packagename__ = "omamer"
-__version__ = "2.0.5"
+__version__ = "2.1.0"
 __copyright__ = "(C) 2019-{:d} Victor Rossier and Alex Warwick Vesztrocy and Nikolai Romashchenko ".format(
     date.today().year
 )
diff --git a/setup.cfg b/setup.cfg
index 9d2c31c..5f54ea0 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 2.0.4
+current_version = 2.1.0
 commit = True
 tag = False
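
The bump itself follows the bumpversion workflow: the tool reads the [bumpversion] section of setup.cfg shown above, rewrites the version strings, and (with commit = True) commits with its default "Bump version: old → new" message, which is exactly the subject of this patch. Note that the subject reports 2.0.4 → 2.1.0 — setup.cfg's current_version — while omamer/__init__.py moves from 2.0.5, so the two version strings had drifted apart before this patch; both land on 2.1.0. A sketch of a bump2version configuration that would keep them in sync going forward — the file section is an assumption for illustration, not necessarily the repository's actual setup.cfg:

[bumpversion]
current_version = 2.1.0
commit = True
tag = False

[bumpversion:file:omamer/__init__.py]
search = __version__ = "{current_version}"
replace = __version__ = "{new_version}"

With this in place, running "bumpversion minor" would update setup.cfg and omamer/__init__.py together in a single commit.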