From 46df017e6083f5feb32676905151c8d1018d96b5 Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Tue, 13 Aug 2024 17:39:34 +0200 Subject: [PATCH 01/17] Commented out njit optimizations for debugging --- omamer/main.py | 4 ++++ omamer/merge_search.py | 9 +++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/omamer/main.py b/omamer/main.py index cc7a6ca..ef7bd16 100644 --- a/omamer/main.py +++ b/omamer/main.py @@ -304,3 +304,7 @@ def get_thread_count(): args.func(args) else: parser.print_usage() + + +if __name__ == "__main__": + main() diff --git a/omamer/merge_search.py b/omamer/merge_search.py index f0fb765..e9f942a 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -159,7 +159,7 @@ def get_fam_level_offsets(fam_ent, level_arr): ## search functions -@numba.njit +#@numba.njit def custom_unique1d(ar): """ adapted from np._unique1d for numba @@ -176,7 +176,7 @@ def custom_unique1d(ar): return aux[mask], perm[mask], np.diff(idx) -@numba.njit +#@numba.njit def parse_seq(s, DIGITS_AA_LOOKUP, n_kmers, k, trans, x_flag): """ get the sequence unique k-mers and non ambiguous locations (when truly unique) @@ -196,7 +196,7 @@ def parse_seq(s, DIGITS_AA_LOOKUP, n_kmers, k, trans, x_flag): return custom_unique1d(r) -@numba.njit +#@numba.njit def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff): """ Perform the kmer search, using the index. @@ -766,4 +766,5 @@ def func( c[int(choice)] if choice_score != 0.0 else 0 ) - return numba.jit(func, parallel=True, nopython=True, nogil=True) + return func + #return numba.jit(func, parallel=True, nopython=True, nogil=True) From 45404330debb36ace242eb12c753eff6bf850e29 Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Wed, 21 Aug 2024 15:41:18 +0200 Subject: [PATCH 02/17] Linear-time k-mer encoding --- omamer/merge_search.py | 55 ++++++++++++++++++++++++++++++------------ 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/omamer/merge_search.py b/omamer/merge_search.py index e9f942a..39b729f 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -20,7 +20,7 @@ You should have received a copy of the GNU Lesser General Public License along with OMAmer. If not, see . """ -from Rmath4 import pbinom +from Rmath4 import pbinom, phyper from numba.tests.support import captured_stdout from property_manager import lazy_property, cached_property from time import time @@ -56,6 +56,10 @@ def binom_neglogccdf(x, n, p): # return -1.0 * np.log(sc.bdtrc(np.float64(x - 1), n, p)) return -1.0 * pbinom(x - 1, n, p, 0, 1) + # pbinom(n - 1, m, m/N, 0, 0) + # phyper(n - 1, m, N - m, m, 0, 0) + #return -1.0 * phyper(x - 1, n, n/p - n, n, 0, 1) + # ---- # family result sorting @@ -159,7 +163,7 @@ def get_fam_level_offsets(fam_ent, level_arr): ## search functions -#@numba.njit +@numba.njit def custom_unique1d(ar): """ adapted from np._unique1d for numba @@ -176,27 +180,48 @@ def custom_unique1d(ar): return aux[mask], perm[mask], np.diff(idx) -#@numba.njit +@numba.njit def parse_seq(s, DIGITS_AA_LOOKUP, n_kmers, k, trans, x_flag): """ get the sequence unique k-mers and non ambiguous locations (when truly unique) """ s_norm = DIGITS_AA_LOOKUP[s] r = np.zeros(n_kmers, dtype=np.uint32) # max kmer 7 - for i in numba.prange(n_kmers): - # numba can't do np.dot with non-float - for j in numba.prange(k): - r[i] += trans[j] * s_norm[i + j] - - # does k-mer contain any X? 
- x_seen = np.any(s_norm[i : i + k] == DIGITS_AA_LOOKUP[88]) # 88==b'X' - # if yes, replace it by the x_flag + + # compute the code of the first k-mer + for j in range(k): + r[0] += trans[j] * s_norm[j] + + # does k-mer contain any X? + x_seen = np.any(s_norm[0:k] == DIGITS_AA_LOOKUP[88]) + # if yes, replace it by the x_flag + r[0] = r[0] if not x_seen else x_flag + + # codes for other k-mers + for i in range(1, n_kmers): + if not x_seen: + # if the previous k-mer was valid, + # recompute the current code from the previous one + + # remove the first character from the code + shared = r[i-1] - (trans[0] * s_norm[i - 1]) + + # trans[-2] is the alphabet size + r[i] = shared * trans[-2] + trans[-1] * s_norm[i + k - 1] + + else: + # if the previous k-mer has Xs, + # just compute the code from scratch + for j in range(k): + r[i] += trans[j] * s_norm[i + j] + + x_seen = np.any(s_norm[i: i + k] == DIGITS_AA_LOOKUP[88]) r[i] = r[i] if not x_seen else x_flag return custom_unique1d(r) -#@numba.njit +@numba.njit def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff): """ Perform the kmer search, using the index. @@ -663,7 +688,7 @@ def func( correction_factor = np.log(len(ref_fam_prob)) for i in numba.prange(len(qres)): qres["pvalue"][i] = min( - MAX_LOGP, + float(MAX_LOGP), max( 0.0, ( @@ -766,5 +791,5 @@ def func( c[int(choice)] if choice_score != 0.0 else 0 ) - return func - #return numba.jit(func, parallel=True, nopython=True, nogil=True) + return numba.jit(func, parallel=True, nopython=True, nogil=True) + #return func From 1a087000b14398fdd15bfb2eec1b030ae7948d9f Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Wed, 21 Aug 2024 16:32:09 +0200 Subject: [PATCH 03/17] Got rid of sorting k-mers in parse_seq --- omamer/merge_search.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/omamer/merge_search.py b/omamer/merge_search.py index 39b729f..4bc548d 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -22,6 +22,7 @@ """ from Rmath4 import pbinom, phyper from numba.tests.support import captured_stdout +from numba.typed import List from property_manager import lazy_property, cached_property from time import time import numba @@ -180,6 +181,24 @@ def custom_unique1d(ar): return aux[mask], perm[mask], np.diff(idx) +@numba.njit +def unique1d_linear(array): + """ + Find a set of unique elements in linear time + """ + unique_list = List() + index_list = List() + seen = set() + + for i in range(len(array)): + if array[i] not in seen: + seen.add(array[i]) + unique_list.append(array[i]) + index_list.append(i) + + return np.asarray(unique_list), np.asarray(index_list), None + + @numba.njit def parse_seq(s, DIGITS_AA_LOOKUP, n_kmers, k, trans, x_flag): """ @@ -218,7 +237,7 @@ def parse_seq(s, DIGITS_AA_LOOKUP, n_kmers, k, trans, x_flag): x_seen = np.any(s_norm[i: i + k] == DIGITS_AA_LOOKUP[88]) r[i] = r[i] if not x_seen else x_flag - return custom_unique1d(r) + return unique1d_linear(r) @numba.njit From bf741f4e48fa412fdbad20c08403ed91c60fe23c Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Mon, 26 Aug 2024 15:19:08 +0200 Subject: [PATCH 04/17] First implementation of the quickselect algorithm for filtering results --- omamer/merge_search.py | 244 +++++++++++++++++++++++++++++++++-------- 1 file changed, 199 insertions(+), 45 deletions(-) diff --git a/omamer/merge_search.py b/omamer/merge_search.py index 4bc548d..9121717 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -22,7 +22,7 @@ """ from 
Rmath4 import pbinom, phyper from numba.tests.support import captured_stdout -from numba.typed import List +from numba.typed import List, Dict from property_manager import lazy_property, cached_property from time import time import numba @@ -85,6 +85,52 @@ def fam_res_compare(x1, x2): return 0 +@numba.njit +def fam_res_less(x1, x2): + """ + Same as fam_res_compare, but operates as '<' + """ + if x1["normcount"] != x2["normcount"]: + return x1["normcount"] < x2["normcount"] + + if x1["overlap"] != x2["overlap"]: + return x1["overlap"] < x2["overlap"] + + if x1["pvalue"] != x2["pvalue"]: + return x1["pvalue"] < x2["pvalue"] + + return False + + +@numba.njit +def fam_res_le(x1, x2): + """ + Same as fam_res_compare, but operates as '<=' + """ + if not fam_res_less(x1, x2): + return x1["normcount"] == x2["normcount"] and \ + x1["overlap"] == x2["overlap"] and \ + x1["pvalue"] == x2["pvalue"] + + return True + + +@numba.njit +def fam_res_greater(x1, x2): + """ + Same as fam_res_compare, but operates as '>' + """ + return not fam_res_le(x1, x2) + + +@numba.njit +def fam_res_ge(x1, x2): + """ + Same as fam_res_compare, but operates as '>=' + """ + return not fam_res_less(x1, x2) + + @numba.njit def family_result_argsort(x, ii): """ @@ -120,17 +166,98 @@ def family_result_argsort(x, ii): @numba.njit -def family_result_sort(x): +def family_result_sort(x, k): """ Sort the family results according to normcount, overlap and pvalue (to break ties). this uses a quicksort implementation as np.argsort does not support struct type in numba. """ - # perform the family result sorting. - idx = family_result_argsort(x, list(range(len(x)))) - y = np.zeros_like(x) - for i in numba.prange(len(x)): - y[i] = x[idx[i]] - return y + + if len(x) <= k: + #idx = family_result_argsort(x, list(range(len(x)))) + #y = np.zeros_like(x) + #for i in numba.prange(len(x)): + # y[i] = x[idx[i]] + return x + + _ = _select(x, k, 0, len(x) - 1) + return x[:k] + + +@numba.njit +def _swap(data, i, j): + # Manually swap each field. I don't know how to do it more + # generally under @numba.njit, as numba seems to have problems + # with indexing structured arrays + temp = np.zeros(1, dtype=data.dtype)[0] + temp['id'] = data[i]['id'] + temp['pvalue'] = data[i]['pvalue'] + temp['count'] = data[i]['count'] + temp['normcount'] = data[i]['normcount'] + temp['overlap'] = data[i]['overlap'] + + data[i]['id'] = data[j]['id'] + data[i]['pvalue'] = data[j]['pvalue'] + data[i]['count'] = data[j]['count'] + data[i]['normcount'] = data[j]['normcount'] + data[i]['overlap'] = data[j]['overlap'] + + data[j]['id'] = temp['id'] + data[j]['pvalue'] = temp['pvalue'] + data[j]['count'] = temp['count'] + data[j]['normcount'] = temp['normcount'] + data[j]['overlap'] = temp['overlap'] + + +@numba.njit +def _partition(A, low, high, mid): + if mid == -1: + mid = (low + high) >> 1 + # NOTE: the pattern of swaps below for the pivot choice and the + # partitioning gives good results (i.e. regular O(n log n)) + # on sorted, reverse-sorted, and uniform arrays. Subtle changes + # risk breaking this property. 
+ + # Use median of three {low, middle, high} as the pivot + if fam_res_greater(A[mid], A[low]): + _swap(A, mid, low) + if fam_res_greater(A[high], A[mid]): + _swap(A, high, mid) + if fam_res_greater(A[mid], A[low]): + _swap(A, low, mid) + + pivot = np.zeros(1, dtype=A.dtype)[0] + pivot['id'] = A[mid]['id'] + pivot['pvalue'] = A[mid]['pvalue'] + pivot['count'] = A[mid]['count'] + pivot['normcount'] = A[mid]['normcount'] + pivot['overlap'] = A[mid]['overlap'] + + _swap(A, high, mid) + + i = low + for j in range(low, high): + if fam_res_greater(A[j], pivot): + _swap(A, i, j) + i += 1 + + _swap(A, i, high) + return i + + +@numba.njit +def _select(array, k, low, high): + """ + Select the k'th largest element + """ + i = _partition(array, low, high, -1) + while i != k: + if i < k: + low = i + 1 + i = _partition(array, low, high, -1) + else: + high = i - 1 + i = _partition(array, low, high, -1) + return array[k] # ---- @@ -196,7 +323,7 @@ def unique1d_linear(array): unique_list.append(array[i]) index_list.append(i) - return np.asarray(unique_list), np.asarray(index_list), None + return np.asarray(unique_list), np.asarray(index_list, dtype=np.uint32), None @numba.njit @@ -245,12 +372,12 @@ def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff): """ Perform the kmer search, using the index. """ - hog_counts = np.zeros(hog_tab.size, dtype=np.uint16) - fam_counts = np.zeros(fam_tab.size, dtype=np.uint16) + hog_counts = Dict.empty(key_type=numba.uint32, value_type=numba.int64) + fam_counts = Dict.empty(key_type=numba.uint32, value_type=numba.int64) # store lowest and higher k-mer locations - fam_lowloc = np.full(fam_tab.size, -1, dtype=np.int32) - fam_highloc = np.full(fam_tab.size, -1, dtype=np.int32) + fam_lowloc = Dict.empty(key_type=numba.uint32, value_type=numba.uint32) + fam_highloc = Dict.empty(key_type=numba.uint32, value_type=numba.uint32) # iterate unique k-mers for m in range(r1.shape[0]): @@ -260,30 +387,39 @@ def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff): # to ignore k-mers with X if kmer == x_flag: continue - else: - pass # get mapping to HOGs - x = table_idx[kmer : kmer + 2] - hogs = table_buff[x[0] : x[1]] + x = table_idx[kmer: kmer + 2] + hogs = table_buff[x[0]: x[1]] fams = hog_tab["FamOff"][hogs] - hog_counts[hogs] += np.uint16(1) - fam_counts[fams] += np.uint16(1) - # store lowest and highest locations + for hog in hogs: + if hog in hog_counts: + hog_counts[hog] += 1 + else: + hog_counts[hog] = 1 + for fam_off in fams: - # initiate first location - if fam_lowloc[fam_off] == -1: - fam_lowloc[fam_off] = loc - fam_highloc[fam_off] = loc + if fam_off in fam_counts: + fam_counts[fam_off] += 1 + else: + fam_counts[fam_off] = 1 - # update either lower or higher boundary - elif loc < fam_lowloc[fam_off]: + if fam_off in fam_lowloc: + if loc < fam_lowloc[fam_off]: + fam_lowloc[fam_off] = loc + else: + # initiate first location fam_lowloc[fam_off] = loc - elif loc > fam_highloc[fam_off]: + + if fam_off in fam_highloc: + if loc > fam_highloc[fam_off]: + fam_highloc[fam_off] = loc + else: + # initiate first location fam_highloc[fam_off] = loc - return (hog_counts, fam_counts, fam_lowloc, fam_highloc) + return hog_counts, fam_counts, fam_lowloc, fam_highloc ## functions to cumulate HOG k-mer counts @@ -421,6 +557,24 @@ def get_closest_taxa_from_ref(q2hog_off, ref_taxoff, tax_tab, hog_tab, chog_buff return q2closest_taxon +@numba.njit +def unzip_dict(data: numba.typed.Dict): + keys = np.zeros(len(data)) + values = np.zeros_like(keys) + for 
i, key in enumerate(data): + keys[i] = key + values[i] = data[key] + return keys, values + + +@numba.njit +def dict_slice(data: numba.typed.Dict, start: np.uint32, end: np.uint32) -> np.ndarray: + result = np.zeros(end - start, dtype=np.uint32) + for j in range(start, end): + result[j - start] = data.get(j, np.uint32(0)) + return result + + class MergeSearch(object): def __init__(self, ki, include_extant_genes=False): assert ki.db.db.mode == "r", "Database must be opened in read mode." @@ -693,15 +847,15 @@ def func( continue # search using kmers - (hog_counts, fam_counts, fam_lowloc, fam_highloc) = search_seq_kmers( + hog_counts, fam_counts, fam_lowloc, fam_highloc = search_seq_kmers( r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff ) # identify families of interest. note: repeat(zeros_like..) numba hack - idx = np.argwhere(fam_counts > 0).flatten() - qres = np.repeat(np.zeros_like(family_results[zz, 0]), len(idx)) - qres["id"][:] = idx - qres["count"][:] = fam_counts[idx] + count_keys, counts_values = unzip_dict(fam_counts) + qres = np.repeat(np.zeros_like(family_results[zz, 0]), len(count_keys)) + qres["id"][:] = count_keys + qres["count"][:] = counts_values # 1. compute p-value for each family. note: in negative log units correction_factor = np.log(len(ref_fam_prob)) @@ -732,9 +886,12 @@ def func( continue # - b. filter on sequence coverage - qres["overlap"][:] = ( - fam_highloc[qres["id"]] - fam_lowloc[qres["id"]] + k - ) / query_len + overlap = np.zeros(len(qres)) + for i in range(len(qres)): + family_id = qres["id"][i] + overlap[i] = (fam_highloc[family_id] - fam_lowloc[family_id] + k) / query_len + + qres["overlap"][:] = overlap qres = qres[(qres["overlap"] >= (25 / query_len))] if len(qres) == 0: @@ -751,18 +908,14 @@ def func( # 4. Store results # - a. sort by normcount, then overlap, then p-value for tie-breaking - qres = family_result_sort(qres) + qres = family_result_sort(qres, top_n_fams) # - b. store results family_results["id"][zz, :top_n_fams] = qres["id"][:top_n_fams] + 1 family_results["pvalue"][zz, :top_n_fams] = qres["pvalue"][:top_n_fams] family_results["count"][zz, :top_n_fams] = qres["count"][:top_n_fams] - family_results["normcount"][zz, :top_n_fams] = qres["normcount"][ - :top_n_fams - ] - family_results["overlap"][zz, :top_n_fams] = qres["overlap"][ - :top_n_fams - ] + family_results["normcount"][zz, :top_n_fams] = qres["normcount"][:top_n_fams] + family_results["overlap"][zz, :top_n_fams] = qres["overlap"][:top_n_fams] # 5. 
Place within families
             for i in numba.prange(min(len(qres), top_n_fams)):
@@ -779,7 +932,8 @@ def func(
                 fam_level_offsets = get_fam_level_offsets(entry, level_arr)

                 # cumulation of counts
-                c = hog_counts[hog_s:hog_e].copy()
+                c = dict_slice(hog_counts, hog_s, hog_e)
+
                 cumulate_counts_1fam(c, fam_level_offsets, fam_hog2parent)

                 # new expected count, but using old cumulation
@@ -788,7 +942,7 @@ def func(
                     r1.size,
                     fam_level_offsets,
                     fam_hog2parent,
-                    hog_counts[hog_s:hog_e],
+                    dict_slice(hog_counts, hog_s, hog_e),
                     ref_hog_prob[hog_s:hog_e],
                 )

From ff4d96827a747ca9297964633ba51a73bbb63b99 Mon Sep 17 00:00:00 2001
From: Nikolai Romashchenko
Date: Mon, 26 Aug 2024 16:28:55 +0200
Subject: [PATCH 05/17] EPIK-like algorithm for family placement,
 parallelization off

---
 omamer/merge_search.py | 94 +++++++++++++++++++++++++-----------------
 1 file changed, 56 insertions(+), 38 deletions(-)

diff --git a/omamer/merge_search.py b/omamer/merge_search.py
index 9121717..6a3940e 100644
--- a/omamer/merge_search.py
+++ b/omamer/merge_search.py
@@ -172,11 +172,13 @@ def family_result_sort(x, k):
     this uses a quicksort implementation as np.argsort does not support struct type in numba.
     """
+
+    idx = family_result_argsort(x, list(range(len(x))))
+    y = np.zeros_like(x)
+    for i in numba.prange(len(x)):
+        y[i] = x[idx[i]]
+    return y
+
     if len(x) <= k:
         return x

     _ = _select(x, k, 0, len(x) - 1)
     return x[:k]
@@ -370,16 +370,25 @@ def parse_seq(s, DIGITS_AA_LOOKUP, n_kmers, k, trans, x_flag):


 @numba.njit
-def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff):
+def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff,
+                     hog_counts, fam_counts, fam_lowloc, fam_highloc,
+                     deinit_fam_list, deinit_hog_list):
     """
     Perform the kmer search, using the index.
""" - hog_counts = Dict.empty(key_type=numba.uint32, value_type=numba.int64) - fam_counts = Dict.empty(key_type=numba.uint32, value_type=numba.int64) - # store lowest and higher k-mer locations - fam_lowloc = Dict.empty(key_type=numba.uint32, value_type=numba.uint32) - fam_highloc = Dict.empty(key_type=numba.uint32, value_type=numba.uint32) + # reinitialize counters for the indexes modified + # by the previous call + for family in deinit_fam_list: + fam_counts[family] = 0 + fam_lowloc[family] = -1 + fam_highloc[family] = -1 + + for hog in deinit_hog_list: + hog_counts[hog] = 0 + + deinit_fam_list = List.empty_list(numba.int32) + deinit_hog_list = List.empty_list(numba.int32) # iterate unique k-mers for m in range(r1.shape[0]): @@ -394,32 +405,29 @@ def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff): fams = hog_tab["FamOff"][hogs] for hog in hogs: - if hog in hog_counts: - hog_counts[hog] += 1 - else: - hog_counts[hog] = 1 + if not hog_counts[hog]: + deinit_hog_list.append(hog) + + hog_counts[hog] += 1 for fam_off in fams: - if fam_off in fam_counts: - fam_counts[fam_off] += 1 - else: - fam_counts[fam_off] = 1 + if not fam_counts[fam_off]: + deinit_fam_list.append(fam_off) - if fam_off in fam_lowloc: - if loc < fam_lowloc[fam_off]: - fam_lowloc[fam_off] = loc - else: - # initiate first location + fam_counts[fam_off] += 1 + + # initiate first location + if fam_lowloc[fam_off] == -1: fam_lowloc[fam_off] = loc + fam_highloc[fam_off] = loc - if fam_off in fam_highloc: - if loc > fam_highloc[fam_off]: - fam_highloc[fam_off] = loc - else: - # initiate first location + # update either lower or higher boundary + elif loc < fam_lowloc[fam_off]: + fam_lowloc[fam_off] = loc + elif loc > fam_highloc[fam_off]: fam_highloc[fam_off] = loc - return hog_counts, fam_counts, fam_lowloc, fam_highloc + return deinit_fam_list, deinit_hog_list ## functions to cumulate HOG k-mer counts @@ -827,6 +835,13 @@ def func( # flags to ignore k-mers containing X x_flag = table_idx.size - 1 + hog_counts = np.zeros(hog_tab.size, dtype=np.uint16) + fam_counts = np.zeros(fam_tab.size, dtype=np.uint16) + fam_lowloc = np.full(fam_tab.size, -1, dtype=np.int32) + fam_highloc = np.full(fam_tab.size, -1, dtype=np.int32) + deinit_fam_list = List.empty_list(numba.int32) + deinit_hog_list = List.empty_list(numba.int32) + for zz in numba.prange(len(seqs_idx) - 1): # load seq s = seqs[seqs_idx[zz] : np.int64(seqs_idx[zz + 1] - 1)] @@ -847,15 +862,18 @@ def func( continue # search using kmers - hog_counts, fam_counts, fam_lowloc, fam_highloc = search_seq_kmers( - r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff + deinit_fam_list, deinit_hog_list = search_seq_kmers( + r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff, + hog_counts, fam_counts, fam_lowloc, fam_highloc, + deinit_fam_list, deinit_hog_list ) # identify families of interest. note: repeat(zeros_like..) numba hack - count_keys, counts_values = unzip_dict(fam_counts) - qres = np.repeat(np.zeros_like(family_results[zz, 0]), len(count_keys)) - qres["id"][:] = count_keys - qres["count"][:] = counts_values + idx = np.argwhere(fam_counts > 0).flatten() + #idx = deinit_fam_list + qres = np.repeat(np.zeros_like(family_results[zz, 0]), len(idx)) + qres["id"][:] = idx + qres["count"][:] = fam_counts[idx] # 1. compute p-value for each family. 
note: in negative log units correction_factor = np.log(len(ref_fam_prob)) @@ -932,7 +950,7 @@ def func( fam_level_offsets = get_fam_level_offsets(entry, level_arr) # cumulation of counts - c = dict_slice(hog_counts, hog_s, hog_e) + c = hog_counts[hog_s:hog_e].copy() cumulate_counts_1fam(c, fam_level_offsets, fam_hog2parent) @@ -942,7 +960,7 @@ def func( r1.size, fam_level_offsets, fam_hog2parent, - dict_slice(hog_counts, hog_s, hog_e), + hog_counts[hog_s:hog_e], ref_hog_prob[hog_s:hog_e], ) @@ -964,5 +982,5 @@ def func( c[int(choice)] if choice_score != 0.0 else 0 ) - return numba.jit(func, parallel=True, nopython=True, nogil=True) + return numba.jit(func, parallel=False, nopython=True, nogil=True) #return func From 1624f1d041dd3464988fdcfb4f764cffd4a8bb1d Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Wed, 28 Aug 2024 11:25:47 +0200 Subject: [PATCH 06/17] Changed the order of family filtering: start with overlap --- omamer/merge_search.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/omamer/merge_search.py b/omamer/merge_search.py index 6a3940e..fc5b7b2 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -875,6 +875,19 @@ def func( qres["id"][:] = idx qres["count"][:] = fam_counts[idx] + # 2. Filtering + # - b. filter on sequence coverage + overlap = np.zeros(len(qres)) + for i in range(len(qres)): + family_id = qres["id"][i] + overlap[i] = (fam_highloc[family_id] - fam_lowloc[family_id] + k) / query_len + + qres["overlap"][:] = overlap + + qres = qres[(qres["overlap"] >= (25 / query_len))] + if len(qres) == 0: + continue + # 1. compute p-value for each family. note: in negative log units correction_factor = np.log(len(ref_fam_prob)) for i in numba.prange(len(qres)): @@ -893,25 +906,12 @@ def func( ), ) - # 2. Filtering # - a. filter to significant families (on p-value) alpha = -1.0 * np.log(alpha_cutoff) qres = qres[qres["pvalue"] >= alpha] # filter out 0 neg log p. alpha > 0 is normal. alpha = 0 is edge case. qres = qres if alpha > 0 else qres[qres["pvalue"] > 0] - if len(qres) == 0: - continue - - # - b. filter on sequence coverage - overlap = np.zeros(len(qres)) - for i in range(len(qres)): - family_id = qres["id"][i] - overlap[i] = (fam_highloc[family_id] - fam_lowloc[family_id] + k) / query_len - - qres["overlap"][:] = overlap - - qres = qres[(qres["overlap"] >= (25 / query_len))] if len(qres) == 0: continue From cddd0d30a9034eb420a8b5c876b469a176b88a32 Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Thu, 29 Aug 2024 10:03:52 +0200 Subject: [PATCH 07/17] Modified quickselect to base it on indexes not data --- omamer/merge_search.py | 106 ++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 66 deletions(-) diff --git a/omamer/merge_search.py b/omamer/merge_search.py index fc5b7b2..6f2349d 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -171,95 +171,69 @@ def family_result_sort(x, k): Sort the family results according to normcount, overlap and pvalue (to break ties). this uses a quicksort implementation as np.argsort does not support struct type in numba. """ - - idx = family_result_argsort(x, list(range(len(x)))) - y = np.zeros_like(x) - for i in numba.prange(len(x)): - y[i] = x[idx[i]] - return y - - if len(x) <= k: - return x - - _ = _select(x, k, 0, len(x) - 1) - return x[:k] + idx = np.arange(len(x)) + _ = _select(x, idx, k, 0, len(x) - 1) + return x[idx[:k]] @numba.njit -def _swap(data, i, j): - # Manually swap each field. 
I don't know how to do it more - # generally under @numba.njit, as numba seems to have problems - # with indexing structured arrays - temp = np.zeros(1, dtype=data.dtype)[0] - temp['id'] = data[i]['id'] - temp['pvalue'] = data[i]['pvalue'] - temp['count'] = data[i]['count'] - temp['normcount'] = data[i]['normcount'] - temp['overlap'] = data[i]['overlap'] - - data[i]['id'] = data[j]['id'] - data[i]['pvalue'] = data[j]['pvalue'] - data[i]['count'] = data[j]['count'] - data[i]['normcount'] = data[j]['normcount'] - data[i]['overlap'] = data[j]['overlap'] - - data[j]['id'] = temp['id'] - data[j]['pvalue'] = temp['pvalue'] - data[j]['count'] = temp['count'] - data[j]['normcount'] = temp['normcount'] - data[j]['overlap'] = temp['overlap'] +def _swap(array, i, j): + tmp = array[i] + array[i] = array[j] + array[j] = tmp @numba.njit -def _partition(A, low, high, mid): - if mid == -1: - mid = (low + high) >> 1 - # NOTE: the pattern of swaps below for the pivot choice and the - # partitioning gives good results (i.e. regular O(n log n)) - # on sorted, reverse-sorted, and uniform arrays. Subtle changes - # risk breaking this property. +def _partition(x, idx, low, high): + """ + Index-based version of the partition algorithm of + quicksort. Juggles indexes of the idx array that + indexes the x array, considering the range [low, high]. + """ + mid = (low + high) >> 1 # Use median of three {low, middle, high} as the pivot - if fam_res_greater(A[mid], A[low]): - _swap(A, mid, low) - if fam_res_greater(A[high], A[mid]): - _swap(A, high, mid) - if fam_res_greater(A[mid], A[low]): - _swap(A, low, mid) - - pivot = np.zeros(1, dtype=A.dtype)[0] - pivot['id'] = A[mid]['id'] - pivot['pvalue'] = A[mid]['pvalue'] - pivot['count'] = A[mid]['count'] - pivot['normcount'] = A[mid]['normcount'] - pivot['overlap'] = A[mid]['overlap'] - - _swap(A, high, mid) - + if fam_res_greater(x[idx[mid]], x[idx[low]]): + _swap(idx, mid, low) + if fam_res_greater(x[idx[high]], x[idx[mid]]): + _swap(idx, high, mid) + if fam_res_greater(x[idx[mid]], x[idx[low]]): + _swap(idx, low, mid) + + pivot = x[idx[mid]] + # Put the pivot in the end of the array + _swap(idx, mid, high) + + # Collect elements that are > pivot in the beginning i = low for j in range(low, high): - if fam_res_greater(A[j], pivot): - _swap(A, i, j) + if fam_res_greater(x[idx[j]], pivot): + _swap(idx, i, j) i += 1 - _swap(A, i, high) + # All indexes of idx in [0, i) are good now. 
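+    # (each of these indexes points at an element that compares greater
+    # than the pivot under the normcount/overlap/pvalue ordering)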
+ # Let's place the pivot back where it should be + _swap(idx, i, high) return i @numba.njit -def _select(array, k, low, high): +def _select(x, idx, k, low, high): """ - Select the k'th largest element + Select the k'th largest element of the x array """ - i = _partition(array, low, high, -1) + if k >= len(x): + return len(x) + + i = _partition(x, idx, low, high) while i != k: if i < k: low = i + 1 - i = _partition(array, low, high, -1) + i = _partition(x, idx, low, high) else: high = i - 1 - i = _partition(array, low, high, -1) - return array[k] + i = _partition(x, idx, low, high) + return idx[k] # ---- From d41a63da027b0723bb3778f6929adb2be78da9e8 Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Thu, 29 Aug 2024 10:33:30 +0200 Subject: [PATCH 08/17] Added in-jit time measurements of placement steps elapsed time --- omamer/_clock.py | 8 ++++++++ omamer/main.py | 5 +++++ omamer/merge_search.py | 44 +++++++++++++++++++++++++++++++++++++++++- 3 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 omamer/_clock.py diff --git a/omamer/_clock.py b/omamer/_clock.py new file mode 100644 index 0000000..8d57dc6 --- /dev/null +++ b/omamer/_clock.py @@ -0,0 +1,8 @@ +""" CPU-time returning clock() function which works from within njit-ted code """ +import ctypes +from ctypes.util import find_library + +__LIB = find_library("c") + +clock = ctypes.CDLL(__LIB).clock +clock.argtypes = [] diff --git a/omamer/main.py b/omamer/main.py index cc7a6ca..d7aa561 100644 --- a/omamer/main.py +++ b/omamer/main.py @@ -265,6 +265,7 @@ def get_thread_count(): help="Show metadata about an omamer database.", description="Show metadata about an existing omamer database", ) + info_parser.set_defaults(func=info_db) info_parser.add_argument( "-d", @@ -304,3 +305,7 @@ def get_thread_count(): args.func(args) else: parser.print_usage() + + +if __name__ == "__main__": + main() diff --git a/omamer/merge_search.py b/omamer/merge_search.py index f0fb765..2b4b502 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -38,6 +38,7 @@ is_taxon_implied, get_children, ) +from ._clock import clock # maximum neglogp to set @@ -629,7 +630,18 @@ def func( # flags to ignore k-mers containing X x_flag = table_idx.size - 1 + parse_time = 0.0 + search_time = 0.0 + filter_time = 0.0 + pvalue_time = 0.0 + place_time = 0.0 + sort_time = 0.0 + total_time = 0.0 + for zz in numba.prange(len(seqs_idx) - 1): + + t0 = clock() + # load seq s = seqs[seqs_idx[zz] : np.int64(seqs_idx[zz + 1] - 1)] query_len = s.shape[0] @@ -648,11 +660,19 @@ def func( elif r1[0] == x_flag: continue + t1 = clock() + parse_time += t1 - t0 + t0 = clock() + # search using kmers (hog_counts, fam_counts, fam_lowloc, fam_highloc) = search_seq_kmers( r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff ) + t1 = clock() + search_time += t1 - t0 + t0 = clock() + # identify families of interest. note: repeat(zeros_like..) numba hack idx = np.argwhere(fam_counts > 0).flatten() qres = np.repeat(np.zeros_like(family_results[zz, 0]), len(idx)) @@ -677,6 +697,10 @@ def func( ), ) + t1 = clock() + pvalue_time += t1 - t0 + t0 = clock() + # 2. Filtering # - a. filter to significant families (on p-value) alpha = -1.0 * np.log(alpha_cutoff) @@ -705,6 +729,10 @@ def func( len(r1) - top_fam_expect_counts ) + t1 = clock() + filter_time += t1 - t0 + t0 = clock() + # 4. Store results # - a. 
sort by normcount, then overlap, then p-value for tie-breaking qres = family_result_sort(qres) @@ -720,6 +748,10 @@ def func( :top_n_fams ] + t1 = clock() + sort_time += t1 - t0 + t0 = clock() + # 5. Place within families for i in numba.prange(min(len(qres), top_n_fams)): entry = fam_tab[qres["id"][i]] @@ -766,4 +798,14 @@ def func( c[int(choice)] if choice_score != 0.0 else 0 ) - return numba.jit(func, parallel=True, nopython=True, nogil=True) + t1 = clock() + place_time += t1 - t0 + + print("Parse time\t", parse_time) + print("Search time\t", search_time) + print("Filter time\t", filter_time) + print("Pvalue time\t", pvalue_time) + print("Place time\t", place_time) + print("Sort time\t", sort_time) + + return numba.jit(func, parallel=False, nopython=True, nogil=True) From 8cf9763cfb9c22ca57428a5574c91ec072d5c874 Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Thu, 29 Aug 2024 10:51:14 +0200 Subject: [PATCH 09/17] Improved time measurements --- omamer/_clock.py | 14 ++++++++++---- omamer/merge_search.py | 19 ++++++++++++------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/omamer/_clock.py b/omamer/_clock.py index 8d57dc6..e02e1bf 100644 --- a/omamer/_clock.py +++ b/omamer/_clock.py @@ -1,8 +1,14 @@ -""" CPU-time returning clock() function which works from within njit-ted code """ import ctypes -from ctypes.util import find_library -__LIB = find_library("c") +# Access the _PyTime_AsSecondsDouble and _PyTime_GetSystemClock functions from pythonapi +clock = ctypes.pythonapi._PyTime_GetSystemClock +as_seconds = ctypes.pythonapi._PyTime_AsSecondsDouble -clock = ctypes.CDLL(__LIB).clock +# Set the argument types and return types of the functions clock.argtypes = [] +clock.restype = ctypes.c_int64 + +as_seconds.argtypes = [ctypes.c_int64] +as_seconds.restype = ctypes.c_double + + diff --git a/omamer/merge_search.py b/omamer/merge_search.py index 2b4b502..006f476 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -38,7 +38,7 @@ is_taxon_implied, get_children, ) -from ._clock import clock +from ._clock import clock, as_seconds # maximum neglogp to set @@ -801,11 +801,16 @@ def func( t1 = clock() place_time += t1 - t0 - print("Parse time\t", parse_time) - print("Search time\t", search_time) - print("Filter time\t", filter_time) - print("Pvalue time\t", pvalue_time) - print("Place time\t", place_time) - print("Sort time\t", sort_time) + total_time = parse_time + search_time + filter_time + pvalue_time + place_time + sort_time + + print() + print("Parse time\t", as_seconds(parse_time)) + print("Search time\t", as_seconds(search_time)) + print("Filter time\t", as_seconds(filter_time)) + print("Pvalue time\t", as_seconds(pvalue_time)) + print("Place time\t", as_seconds(place_time)) + print("Sort time\t", as_seconds(sort_time)) + print("Batch total\t", as_seconds(total_time)) + print() return numba.jit(func, parallel=False, nopython=True, nogil=True) From e15c65e7444f65076571e622690d212f79e50885 Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Thu, 29 Aug 2024 11:43:22 +0200 Subject: [PATCH 10/17] Reimplemented list-based query search with arrays --- omamer/merge_search.py | 48 ++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 23 deletions(-) diff --git a/omamer/merge_search.py b/omamer/merge_search.py index 457383c..d6cd66d 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -347,23 +347,23 @@ def parse_seq(s, DIGITS_AA_LOOKUP, n_kmers, k, trans, x_flag): @numba.njit def search_seq_kmers(r1, p1, hog_tab, 
fam_tab, x_flag, table_idx, table_buff, hog_counts, fam_counts, fam_lowloc, fam_highloc, - deinit_fam_list, deinit_hog_list): + hit_fams, num_hit_fams, hit_hogs, num_hit_hogs): """ Perform the kmer search, using the index. """ - - # reinitialize counters for the indexes modified - # by the previous call - for family in deinit_fam_list: + # Reinitialize counters modified by the previous call + for i in range(num_hit_fams): + family = hit_fams[i] fam_counts[family] = 0 fam_lowloc[family] = -1 fam_highloc[family] = -1 - for hog in deinit_hog_list: + for i in range(num_hit_hogs): + hog = hit_hogs[i] hog_counts[hog] = 0 - deinit_fam_list = List.empty_list(numba.int32) - deinit_hog_list = List.empty_list(numba.int32) + num_hit_fams = 0 + num_hit_hogs = 0 # iterate unique k-mers for m in range(r1.shape[0]): @@ -381,13 +381,15 @@ def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff, for hog in hogs: if not hog_counts[hog]: - deinit_hog_list.append(hog) + hit_hogs[num_hit_hogs] = hog + num_hit_hogs += 1 hog_counts[hog] += 1 for fam_off in fams: if not fam_counts[fam_off]: - deinit_fam_list.append(fam_off) + hit_fams[num_hit_fams] = fam_off + num_hit_fams += 1 fam_counts[fam_off] += 1 @@ -402,7 +404,7 @@ def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff, elif loc > fam_highloc[fam_off]: fam_highloc[fam_off] = loc - return deinit_fam_list, deinit_hog_list + return hit_fams, num_hit_fams, hit_hogs, num_hit_hogs ## functions to cumulate HOG k-mer counts @@ -557,7 +559,6 @@ def dict_slice(data: numba.typed.Dict, start: np.uint32, end: np.uint32) -> np.n result[j - start] = data.get(j, np.uint32(0)) return result - class MergeSearch(object): def __init__(self, ki, include_extant_genes=False): assert ki.db.db.mode == "r", "Database must be opened in read mode." @@ -814,8 +815,10 @@ def func( fam_counts = np.zeros(fam_tab.size, dtype=np.uint16) fam_lowloc = np.full(fam_tab.size, -1, dtype=np.int32) fam_highloc = np.full(fam_tab.size, -1, dtype=np.int32) - deinit_fam_list = List.empty_list(numba.int32) - deinit_hog_list = List.empty_list(numba.int32) + hit_fams = np.zeros(fam_tab.size, dtype=np.int32) + hit_hogs = np.zeros(hog_tab.size, dtype=np.int32) + num_hit_fams = 0 + num_hit_hogs = 0 parse_time = 0.0 search_time = 0.0 @@ -852,23 +855,22 @@ def func( t0 = clock() # search using kmers - deinit_fam_list, deinit_hog_list = search_seq_kmers( + hit_fams, num_hit_fams, hit_hogs, num_hit_hogs = search_seq_kmers( r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff, hog_counts, fam_counts, fam_lowloc, fam_highloc, - deinit_fam_list, deinit_hog_list + hit_fams, num_hit_fams, hit_hogs, num_hit_hogs ) - t1 = clock() - search_time += t1 - t0 - t0 = clock() - - # identify families of interest. note: repeat(zeros_like..) numba hack - idx = np.argwhere(fam_counts > 0).flatten() - #idx = deinit_fam_list + # Identify families of interest + idx = hit_fams[:num_hit_fams] qres = np.repeat(np.zeros_like(family_results[zz, 0]), len(idx)) qres["id"][:] = idx qres["count"][:] = fam_counts[idx] + t1 = clock() + search_time += t1 - t0 + t0 = clock() + # 2. Filtering # - b. 
filter on sequence coverage overlap = np.zeros(len(qres)) From 53c897e20ecdd590997844c2199d1b391c974372 Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Thu, 29 Aug 2024 12:39:43 +0200 Subject: [PATCH 11/17] Parallel version of search_seq_kmers based on hit families --- omamer/merge_search.py | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/omamer/merge_search.py b/omamer/merge_search.py index d6cd66d..1f2b292 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -404,7 +404,7 @@ def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff, elif loc > fam_highloc[fam_off]: fam_highloc[fam_off] = loc - return hit_fams, num_hit_fams, hit_hogs, num_hit_hogs + return num_hit_fams, num_hit_hogs ## functions to cumulate HOG k-mer counts @@ -811,14 +811,15 @@ def func( # flags to ignore k-mers containing X x_flag = table_idx.size - 1 - hog_counts = np.zeros(hog_tab.size, dtype=np.uint16) - fam_counts = np.zeros(fam_tab.size, dtype=np.uint16) - fam_lowloc = np.full(fam_tab.size, -1, dtype=np.int32) - fam_highloc = np.full(fam_tab.size, -1, dtype=np.int32) - hit_fams = np.zeros(fam_tab.size, dtype=np.int32) - hit_hogs = np.zeros(hog_tab.size, dtype=np.int32) - num_hit_fams = 0 - num_hit_hogs = 0 + num_threads = numba.get_num_threads() + thread_hog_counts = np.zeros((num_threads, hog_tab.size), dtype=np.uint16) + thread_fam_counts = np.zeros((num_threads, fam_tab.size), dtype=np.uint16) + thread_fam_lowloc = np.full((num_threads, fam_tab.size), -1, dtype=np.int32) + thread_fam_highloc = np.full((num_threads, fam_tab.size), -1, dtype=np.int32) + thread_hit_fams = np.zeros((num_threads, fam_tab.size), dtype=np.int32) + thread_hit_hogs = np.zeros((num_threads, hog_tab.size), dtype=np.int32) + thread_num_hit_fams = np.zeros(num_threads, dtype=np.uint32) + thread_num_hit_hogs = np.zeros(num_threads, dtype=np.uint32) parse_time = 0.0 search_time = 0.0 @@ -829,7 +830,6 @@ def func( total_time = 0.0 for zz in numba.prange(len(seqs_idx) - 1): - t0 = clock() # load seq @@ -854,13 +854,26 @@ def func( parse_time += t1 - t0 t0 = clock() + # Get thread-local data structures for search + hit_fams = thread_hit_fams[numba.get_thread_id()] + hit_hogs = thread_hit_hogs[numba.get_thread_id()] + hog_counts = thread_hog_counts[numba.get_thread_id()] + fam_counts = thread_fam_counts[numba.get_thread_id()] + fam_lowloc = thread_fam_lowloc[numba.get_thread_id()] + fam_highloc = thread_fam_highloc[numba.get_thread_id()] + num_hit_fams = thread_num_hit_fams[numba.get_thread_id()] + num_hit_hogs = thread_num_hit_hogs[numba.get_thread_id()] + # search using kmers - hit_fams, num_hit_fams, hit_hogs, num_hit_hogs = search_seq_kmers( + num_hit_fams, num_hit_hogs = search_seq_kmers( r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff, hog_counts, fam_counts, fam_lowloc, fam_highloc, hit_fams, num_hit_fams, hit_hogs, num_hit_hogs ) + thread_num_hit_fams[numba.get_thread_id()] = num_hit_fams + thread_num_hit_hogs[numba.get_thread_id()] = num_hit_hogs + # Identify families of interest idx = hit_fams[:num_hit_fams] qres = np.repeat(np.zeros_like(family_results[zz, 0]), len(idx)) @@ -1010,4 +1023,4 @@ def func( print("Batch total\t", as_seconds(total_time)) print() - return numba.jit(func, parallel=False, nopython=True, nogil=True) + return numba.jit(func, parallel=True, nopython=True, nogil=True) From 6c05d11ac4a01e521540de7e4395bcc1d57c8561 Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Fri, 30 Aug 2024 14:58:11 +0200 
Subject: [PATCH 12/17] Chernoff filtering --- omamer/merge_search.py | 50 +++++++++++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 15 deletions(-) diff --git a/omamer/merge_search.py b/omamer/merge_search.py index 1f2b292..9020894 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -821,13 +821,12 @@ def func( thread_num_hit_fams = np.zeros(num_threads, dtype=np.uint32) thread_num_hit_hogs = np.zeros(num_threads, dtype=np.uint32) - parse_time = 0.0 - search_time = 0.0 - filter_time = 0.0 - pvalue_time = 0.0 - place_time = 0.0 - sort_time = 0.0 - total_time = 0.0 + parse_time = 0 + search_time = 0 + filter_time = 0 + pvalue_time = 0 + place_time = 0 + sort_time = 0 for zz in numba.prange(len(seqs_idx) - 1): t0 = clock() @@ -880,11 +879,35 @@ def func( qres["id"][:] = idx qres["count"][:] = fam_counts[idx] + t1 = clock() search_time += t1 - t0 t0 = clock() - # 2. Filtering + # 2. Fast family filtering + # Filter unlikely families by bounding the p-value with the Chernoff's bound: + # P(X >= k) = P(X >= (1 + d) mu) <= (exp(d) / (1 + d)^(1+d))^mu < Alpha + # Here: k is the number of k-mer hit, mu is the expected number of hits + # d = delta is the considered deviation from the expected value, + # Alpha is the p-value cutoff (in original units, not negative log). + # d = delta = (k - mu) / mu by definition of k. + # If we want P(X >= k) >= Alpha, then we can safely eliminate families + # that have P(X >= k) < Alpha. We know that the probability is bounded + # by Chernoff. For such families, we will have then: + # mu ((1+d) * ln(1+d) - d) < -ln(Alpha) + # This allows us to filter out families based on Alpha without + # actually computing the p-value which is quite expensive. + + # Compute the delta, similar to normcount + expected_count = ref_fam_prob[qres["id"]] * len(r1) + delta = (qres["count"] - expected_count) / expected_count + chernoff_bound = expected_count * ((1 + delta) * np.log(1 + delta) - delta) + + alpha = -1.0 * np.log(alpha_cutoff) + qres = qres[chernoff_bound >= alpha] + if len(qres) == 0: + continue + # - b. filter on sequence coverage overlap = np.zeros(len(qres)) for i in range(len(qres)): @@ -925,7 +948,6 @@ def func( # 2. Filtering # - a. filter to significant families (on p-value) - alpha = -1.0 * np.log(alpha_cutoff) qres = qres[qres["pvalue"] >= alpha] # filter out 0 neg log p. alpha > 0 is normal. alpha = 0 is edge case. qres = qres if alpha > 0 else qres[qres["pvalue"] > 0] @@ -934,12 +956,9 @@ def func( continue # 3. Compute normalised count - # - a. compute the expected count - top_fam_expect_counts = ref_fam_prob[qres["id"]] * len(r1) - - # - b. 
compute the normalised count - qres["normcount"][:] = (qres["count"] - top_fam_expect_counts) / ( - len(r1) - top_fam_expect_counts + expected_count = ref_fam_prob[qres["id"]] * len(r1) + qres["normcount"][:] = (qres["count"] - expected_count) / ( + len(r1) - expected_count ) t1 = clock() @@ -1024,3 +1043,4 @@ def func( print() return numba.jit(func, parallel=True, nopython=True, nogil=True) + #return func From c51cfe7d02fa4b8d88851ecfa3cfddceb0061675 Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Wed, 11 Sep 2024 10:25:11 +0200 Subject: [PATCH 13/17] Tighter Chernoff bound based on KL-div --- omamer/merge_search.py | 119 +++++++++++++++++++++-------------------- 1 file changed, 60 insertions(+), 59 deletions(-) diff --git a/omamer/merge_search.py b/omamer/merge_search.py index 9020894..ff28c08 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -43,11 +43,9 @@ # maximum neglogp to set -MAX_LOGP = 20000 +MAX_LOGP = 20000.0 -# ---- -# stats functions @numba.njit(nogil=True) def binom_neglogccdf(x, n, p): """ @@ -786,24 +784,24 @@ def generate(): @lazy_property def _lookup(self): def func( - family_results, - subfam_results, - seqs, - seqs_idx, - trans, - table_idx, - table_buff, - k, - DIGITS_AA_LOOKUP, - fam_tab, - hog_tab, - level_arr, - top_n_fams, - ref_fam_prob, - ref_hog_prob, - alpha_cutoff, - sst, - family_only, + family_results, + subfam_results, + seqs, + seqs_idx, + trans, + table_idx, + table_buff, + k, + DIGITS_AA_LOOKUP, + fam_tab, + hog_tab, + level_arr, + top_n_fams, + ref_fam_prob, + ref_hog_prob, + alpha_cutoff, + sst, + family_only, ): """ top_n_fams: number of family for which HOG scores are computed @@ -879,44 +877,48 @@ def func( qres["id"][:] = idx qres["count"][:] = fam_counts[idx] - t1 = clock() search_time += t1 - t0 t0 = clock() # 2. Fast family filtering - # Filter unlikely families by bounding the p-value with the Chernoff's bound: - # P(X >= k) = P(X >= (1 + d) mu) <= (exp(d) / (1 + d)^(1+d))^mu < Alpha - # Here: k is the number of k-mer hit, mu is the expected number of hits - # d = delta is the considered deviation from the expected value, - # Alpha is the p-value cutoff (in original units, not negative log). - # d = delta = (k - mu) / mu by definition of k. - # If we want P(X >= k) >= Alpha, then we can safely eliminate families - # that have P(X >= k) < Alpha. We know that the probability is bounded - # by Chernoff. For such families, we will have then: - # mu ((1+d) * ln(1+d) - d) < -ln(Alpha) - # This allows us to filter out families based on Alpha without - # actually computing the p-value which is quite expensive. - - # Compute the delta, similar to normcount - expected_count = ref_fam_prob[qres["id"]] * len(r1) - delta = (qres["count"] - expected_count) / expected_count - chernoff_bound = expected_count * ((1 + delta) * np.log(1 + delta) - delta) + # - a. filter by count. We are only interested in families + # that have at least their expected number of k-mer matches, + # because that number should be already given by random + # (however is still insignificant). + mu = ref_fam_prob[qres["id"]] * len(r1) + qres = qres[qres["count"] >= mu] + + # - b. filter by sequence coverage. There is no point to + # compute p-value for families that are going to be hard + # filtered by coverage later anyway. 
+            for i in range(len(qres)):
+                family_id = qres["id"][i]
+                qres["overlap"][i] = (fam_highloc[family_id] - fam_lowloc[family_id] + k) / query_len

-            alpha = -1.0 * np.log(alpha_cutoff)
-            qres = qres[chernoff_bound >= alpha]
+            qres = qres[(qres["overlap"] >= (25 / query_len))]
             if len(qres) == 0:
                 continue

-            # - b. filter on sequence coverage
-            overlap = np.zeros(len(qres))
-            for i in range(len(qres)):
-                family_id = qres["id"][i]
-                overlap[i] = (fam_highloc[family_id] - fam_lowloc[family_id] + k) / query_len
-
-            qres["overlap"][:] = overlap
+            # - c. Filter by predicted p-value: perform the Chernoff KL-div test,
+            # that is, the Chernoff upper bound for the binomial X:
+            # P(X >= k) <= exp(-n D(k/n || p))
+            # where D is Kullback-Leibler divergence.
+            # First, compute k / n, the empirical proportion of Bernoulli successes.
+            # We need to clip it a little for the case n = k, because it's used
+            # in logarithms later. For the other edge case, we already have k > 0
+            # guaranteed, so we're good here.
+            epsilon = 1e-10
+            n = len(r1)
+            k_n = np.clip(qres["count"] / n, epsilon, 1 - epsilon)
+
+            # Now, by demanding exp(-n KL(k/n || p)) < alpha, we guarantee P < alpha too.
+            # There is a theoretical chance that P < alpha <= bound, and the test will
+            # fail with a false negative. I could not observe any instances of this.
+            p = ref_fam_prob[qres["id"]]
+            kl_div = k_n * np.log(k_n / p) + (1 - k_n) * np.log((1 - k_n) / (1 - p))
+            qres = qres[kl_div > -np.log(alpha_cutoff)/n]

-            qres = qres[(qres["overlap"] >= (25 / query_len))]
             if len(qres) == 0:
                 continue
@@ -926,28 +928,27 @@ def func(
             # 1. compute p-value for each family. note: in negative log units
             correction_factor = np.log(len(ref_fam_prob))
             for i in numba.prange(len(qres)):
                 qres["pvalue"][i] = min(
                     float(MAX_LOGP),
                     max(
                         0.0,
                         (
-                            binom_neglogccdf(
-                                qres["count"][i],
-                                len(r1),
-                                ref_fam_prob[qres["id"][i]],
-                            )
-                            - correction_factor
+                                binom_neglogccdf(
+                                    qres["count"][i],
+                                    len(r1),
+                                    ref_fam_prob[qres["id"][i]],
+                                )
+                                - correction_factor
                         ),
                     ),
                 )

-            # 2. Filtering
-            # - a. filter to significant families (on p-value)
+            # Filter on the actual p-value
+            alpha = -1.0 * np.log(alpha_cutoff)
             qres = qres[qres["pvalue"] >= alpha]
             # filter out 0 neg log p. alpha > 0 is normal. alpha = 0 is edge case.
             qres = qres if alpha > 0 else qres[qres["pvalue"] > 0]
-
             if len(qres) == 0:
                 continue

From 3ae846b99ad165a53fb470a111d88ddc2ba1c4a5 Mon Sep 17 00:00:00 2001
From: Nikolai Romashchenko
Date: Thu, 21 Nov 2024 13:38:49 +0100
Subject: [PATCH 14/17] Restored the final sorting of multiple results per
 query after quickselect

---
 omamer/merge_search.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/omamer/merge_search.py b/omamer/merge_search.py
index ff28c08..e564ec2 100644
--- a/omamer/merge_search.py
+++ b/omamer/merge_search.py
@@ -170,9 +170,19 @@ def family_result_sort(x, k):
     Sort the family results according to normcount, overlap and pvalue (to break ties).
     this uses a quicksort implementation as np.argsort does not support struct type in numba.
""" + + # Quickselect top k results idx = np.arange(len(x)) _ = _select(x, idx, k, 0, len(x) - 1) - return x[idx[:k]] + x = x[idx[:k]] + + # Now mergesort the selected results, because we need to + # report them sorted + idx = family_result_argsort(x, list(range(len(x)))) + y = np.zeros_like(x) + for i in range(len(x)): + y[i] = x[idx[i]] + return y @numba.njit @@ -235,7 +245,6 @@ def _select(x, idx, k, low, high): return idx[k] -# ---- ## generic functions @numba.njit def get_fam_hog2parent(fam_ent, hog_tab): @@ -926,7 +935,7 @@ def func( filter_time += t1 - t0 t0 = clock() - # 1. compute p-value for each family. note: in negative log units + # 3. compute p-value for each family. note: in negative log units correction_factor = np.log(len(ref_fam_prob)) for i in numba.prange(len(qres)): qres["pvalue"][i] = min( @@ -956,7 +965,7 @@ def func( if len(qres) == 0: continue - # 3. Compute normalised count + # 4. Compute normalised count expected_count = ref_fam_prob[qres["id"]] * len(r1) qres["normcount"][:] = (qres["count"] - expected_count) / ( len(r1) - expected_count @@ -966,7 +975,7 @@ def func( filter_time += t1 - t0 t0 = clock() - # 4. Store results + # 5. Store results # - a. sort by normcount, then overlap, then p-value for tie-breaking qres = family_result_sort(qres, top_n_fams) From 8a43650ee415159871d67d9b9184e1843e0fce39 Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Mon, 25 Nov 2024 09:48:29 +0100 Subject: [PATCH 15/17] Clean up --- omamer/main.py | 6 +-- omamer/merge_search.py | 117 ++++++++++++----------------------------- 2 files changed, 36 insertions(+), 87 deletions(-) diff --git a/omamer/main.py b/omamer/main.py index d7aa561..f8272e7 100644 --- a/omamer/main.py +++ b/omamer/main.py @@ -304,8 +304,4 @@ def get_thread_count(): # call the relevant runner func args.func(args) else: - parser.print_usage() - - -if __name__ == "__main__": - main() + parser.print_usage() \ No newline at end of file diff --git a/omamer/merge_search.py b/omamer/merge_search.py index e564ec2..29b33fd 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -818,28 +818,21 @@ def func( # flags to ignore k-mers containing X x_flag = table_idx.size - 1 + # Arrays for thread-local data. 
We allocate them + # in advance to avoid doing so for every query num_threads = numba.get_num_threads() - thread_hog_counts = np.zeros((num_threads, hog_tab.size), dtype=np.uint16) - thread_fam_counts = np.zeros((num_threads, fam_tab.size), dtype=np.uint16) - thread_fam_lowloc = np.full((num_threads, fam_tab.size), -1, dtype=np.int32) - thread_fam_highloc = np.full((num_threads, fam_tab.size), -1, dtype=np.int32) - thread_hit_fams = np.zeros((num_threads, fam_tab.size), dtype=np.int32) - thread_hit_hogs = np.zeros((num_threads, hog_tab.size), dtype=np.int32) - thread_num_hit_fams = np.zeros(num_threads, dtype=np.uint32) - thread_num_hit_hogs = np.zeros(num_threads, dtype=np.uint32) - - parse_time = 0 - search_time = 0 - filter_time = 0 - pvalue_time = 0 - place_time = 0 - sort_time = 0 + hog_counts = np.zeros((num_threads, hog_tab.size), dtype=np.uint16) + fam_counts = np.zeros((num_threads, fam_tab.size), dtype=np.uint16) + fam_lowloc = np.full((num_threads, fam_tab.size), -1, dtype=np.int32) + fam_highloc = np.full((num_threads, fam_tab.size), -1, dtype=np.int32) + hit_fams = np.zeros((num_threads, fam_tab.size), dtype=np.int32) + hit_hogs = np.zeros((num_threads, hog_tab.size), dtype=np.int32) + num_hit_fams = np.zeros(num_threads, dtype=np.uint32) + num_hit_hogs = np.zeros(num_threads, dtype=np.uint32) for zz in numba.prange(len(seqs_idx) - 1): - t0 = clock() - # load seq - s = seqs[seqs_idx[zz] : np.int64(seqs_idx[zz + 1] - 1)] + s = seqs[seqs_idx[zz]: np.int64(seqs_idx[zz + 1] - 1)] query_len = s.shape[0] n_kmers = query_len - (k - 1) @@ -856,39 +849,31 @@ def func( elif r1[0] == x_flag: continue - t1 = clock() - parse_time += t1 - t0 - t0 = clock() - # Get thread-local data structures for search - hit_fams = thread_hit_fams[numba.get_thread_id()] - hit_hogs = thread_hit_hogs[numba.get_thread_id()] - hog_counts = thread_hog_counts[numba.get_thread_id()] - fam_counts = thread_fam_counts[numba.get_thread_id()] - fam_lowloc = thread_fam_lowloc[numba.get_thread_id()] - fam_highloc = thread_fam_highloc[numba.get_thread_id()] - num_hit_fams = thread_num_hit_fams[numba.get_thread_id()] - num_hit_hogs = thread_num_hit_hogs[numba.get_thread_id()] + thread_hit_fams = hit_fams[numba.get_thread_id()] + thread_hit_hogs = hit_hogs[numba.get_thread_id()] + thread_hog_counts = hog_counts[numba.get_thread_id()] + thread_fam_counts = fam_counts[numba.get_thread_id()] + thread_fam_lowloc = fam_lowloc[numba.get_thread_id()] + thread_fam_highloc = fam_highloc[numba.get_thread_id()] + thread_num_hit_fams = num_hit_fams[numba.get_thread_id()] + thread_num_hit_hogs = num_hit_hogs[numba.get_thread_id()] # search using kmers - num_hit_fams, num_hit_hogs = search_seq_kmers( + thread_num_hit_fams, thread_num_hit_hogs = search_seq_kmers( r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff, - hog_counts, fam_counts, fam_lowloc, fam_highloc, - hit_fams, num_hit_fams, hit_hogs, num_hit_hogs + thread_hog_counts, thread_fam_counts, thread_fam_lowloc, thread_fam_highloc, + thread_hit_fams, thread_num_hit_fams, thread_hit_hogs, thread_num_hit_hogs ) - thread_num_hit_fams[numba.get_thread_id()] = num_hit_fams - thread_num_hit_hogs[numba.get_thread_id()] = num_hit_hogs + num_hit_fams[numba.get_thread_id()] = thread_num_hit_fams + num_hit_hogs[numba.get_thread_id()] = thread_num_hit_hogs # Identify families of interest - idx = hit_fams[:num_hit_fams] + idx = thread_hit_fams[:thread_num_hit_fams] qres = np.repeat(np.zeros_like(family_results[zz, 0]), len(idx)) qres["id"][:] = idx - qres["count"][:] = fam_counts[idx] - 
- t1 = clock() - search_time += t1 - t0 - t0 = clock() + qres["count"][:] = thread_fam_counts[idx] # 2. Fast family filtering # - a. filter by count. We are only interested in families @@ -903,7 +888,7 @@ def func( # filtered by coverage later anyway. for i in range(len(qres)): family_id = qres["id"][i] - qres["overlap"][i] = (fam_highloc[family_id] - fam_lowloc[family_id] + k) / query_len + qres["overlap"][i] = (thread_fam_highloc[family_id] - thread_fam_lowloc[family_id] + k) / query_len qres = qres[(qres["overlap"] >= (25 / query_len))] if len(qres) == 0: @@ -931,10 +916,6 @@ def func( if len(qres) == 0: continue - t1 = clock() - filter_time += t1 - t0 - t0 = clock() - # 3. compute p-value for each family. note: in negative log units correction_factor = np.log(len(ref_fam_prob)) for i in numba.prange(len(qres)): @@ -943,20 +924,16 @@ def func( max( 0.0, ( - binom_neglogccdf( - qres["count"][i], - len(r1), - ref_fam_prob[qres["id"][i]], - ) - - correction_factor + binom_neglogccdf( + qres["count"][i], + len(r1), + ref_fam_prob[qres["id"][i]], + ) + - correction_factor ), ), ) - t1 = clock() - pvalue_time += t1 - t0 - t0 = clock() - # Filter on the actual p-value alpha = -1.0 * np.log(alpha_cutoff) qres = qres[qres["pvalue"] >= alpha] @@ -971,10 +948,6 @@ def func( len(r1) - expected_count ) - t1 = clock() - filter_time += t1 - t0 - t0 = clock() - # 5. Store results # - a. sort by normcount, then overlap, then p-value for tie-breaking qres = family_result_sort(qres, top_n_fams) @@ -986,10 +959,6 @@ def func( family_results["normcount"][zz, :top_n_fams] = qres["normcount"][:top_n_fams] family_results["overlap"][zz, :top_n_fams] = qres["overlap"][:top_n_fams] - t1 = clock() - sort_time += t1 - t0 - t0 = clock() - # 5. Place within families for i in numba.prange(min(len(qres), top_n_fams)): entry = fam_tab[qres["id"][i]] @@ -1005,7 +974,7 @@ def func( fam_level_offsets = get_fam_level_offsets(entry, level_arr) # cumulation of counts - c = hog_counts[hog_s:hog_e].copy() + c = thread_hog_counts[hog_s:hog_e].copy() cumulate_counts_1fam(c, fam_level_offsets, fam_hog2parent) @@ -1015,7 +984,7 @@ def func( r1.size, fam_level_offsets, fam_hog2parent, - hog_counts[hog_s:hog_e], + thread_hog_counts[hog_s:hog_e], ref_hog_prob[hog_s:hog_e], ) @@ -1037,20 +1006,4 @@ def func( c[int(choice)] if choice_score != 0.0 else 0 ) - t1 = clock() - place_time += t1 - t0 - - total_time = parse_time + search_time + filter_time + pvalue_time + place_time + sort_time - - print() - print("Parse time\t", as_seconds(parse_time)) - print("Search time\t", as_seconds(search_time)) - print("Filter time\t", as_seconds(filter_time)) - print("Pvalue time\t", as_seconds(pvalue_time)) - print("Place time\t", as_seconds(place_time)) - print("Sort time\t", as_seconds(sort_time)) - print("Batch total\t", as_seconds(total_time)) - print() - return numba.jit(func, parallel=True, nopython=True, nogil=True) - #return func From 0bbb6e9edd06f9a938b3a8416b6088cb1614ef2e Mon Sep 17 00:00:00 2001 From: Nikolai Romashchenko Date: Mon, 25 Nov 2024 10:38:42 +0100 Subject: [PATCH 16/17] Updated license headers --- omamer/HOGParser.py | 1 + omamer/__init__.py | 3 ++- omamer/_clock.py | 24 ++++++++++++++++++++++++ omamer/_runners.py | 1 + omamer/_utils.py | 1 + omamer/database.py | 1 + omamer/hierarchy.py | 1 + omamer/index.py | 1 + omamer/main.py | 1 + omamer/merge_search.py | 1 + omamer/results_reader.py | 1 + omamer/sequence_buffer.py | 1 + omamer/sequence_reader.py | 1 + setup.py | 1 + 14 files changed, 38 insertions(+), 1 deletion(-) diff 
From 0bbb6e9edd06f9a938b3a8416b6088cb1614ef2e Mon Sep 17 00:00:00 2001
From: Nikolai Romashchenko
Date: Mon, 25 Nov 2024 10:38:42 +0100
Subject: [PATCH 16/17] Updated license headers

---
 omamer/HOGParser.py       |  1 +
 omamer/__init__.py        |  3 ++-
 omamer/_clock.py          | 24 ++++++++++++++++++++++++
 omamer/_runners.py        |  1 +
 omamer/_utils.py          |  1 +
 omamer/database.py        |  1 +
 omamer/hierarchy.py       |  1 +
 omamer/index.py           |  1 +
 omamer/main.py            |  1 +
 omamer/merge_search.py    |  1 +
 omamer/results_reader.py  |  1 +
 omamer/sequence_buffer.py |  1 +
 omamer/sequence_reader.py |  1 +
 setup.py                  |  1 +
 14 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/omamer/HOGParser.py b/omamer/HOGParser.py
index 6c394d4..e938140 100644
--- a/omamer/HOGParser.py
+++ b/omamer/HOGParser.py
@@ -2,6 +2,7 @@
     HOGPROP - Propagation of Gene Ontology (GO) annotations through
     Hierarchical Orthologous Groups (HOGs) from the OMA project.
 
+    (C) 2024 Nikolai Romashchenko
     (C) 2015-2023 Alex Warwick Vesztrocy
 
     This file is part of HOGPROP. It contains a module for parsing an
diff --git a/omamer/__init__.py b/omamer/__init__.py
index 5df936b..8ccc8eb 100644
--- a/omamer/__init__.py
+++ b/omamer/__init__.py
@@ -1,6 +1,7 @@
 """
     OMAmer - tree-driven and alignment-free protein assignment to sub-families
 
+    (C) 2024 Nikolai Romashchenko
     (C) 2022-2023 Alex Warwick Vesztrocy
     (C) 2019-2021 Victor Rossier and
                   Alex Warwick Vesztrocy
@@ -24,6 +25,6 @@
 __packagename__ = "omamer"
 __version__ = "2.0.5"
-__copyright__ = "(C) 2019-{:d} Victor Rossier and Alex Warwick Vesztrocy ".format(
+__copyright__ = "(C) 2019-{:d} Victor Rossier and Alex Warwick Vesztrocy and Nikolai Romashchenko ".format(
     date.today().year
 )
diff --git a/omamer/_clock.py b/omamer/_clock.py
index e02e1bf..68313a6 100644
--- a/omamer/_clock.py
+++ b/omamer/_clock.py
@@ -1,3 +1,27 @@
+"""
+    OMAmer - tree-driven and alignment-free protein assignment to sub-families
+
+    (C) 2024 Nikolai Romashchenko
+    (C) 2022-2023 Alex Warwick Vesztrocy
+    (C) 2019-2021 Victor Rossier and
+                  Alex Warwick Vesztrocy
+
+    This file is part of OMAmer.
+
+    OMAmer is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Lesser General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    OMAmer is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+    GNU Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public License
+    along with OMAmer. If not, see <http://www.gnu.org/licenses/>.
+""" + import ctypes # Access the _PyTime_AsSecondsDouble and _PyTime_GetSystemClock functions from pythonapi diff --git a/omamer/_runners.py b/omamer/_runners.py index 799ed25..14ba6fd 100644 --- a/omamer/_runners.py +++ b/omamer/_runners.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/_utils.py b/omamer/_utils.py index 32ffbdb..e1f0ae3 100644 --- a/omamer/_utils.py +++ b/omamer/_utils.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/database.py b/omamer/database.py index ce61f97..b861e05 100644 --- a/omamer/database.py +++ b/omamer/database.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/hierarchy.py b/omamer/hierarchy.py index d4f9dc3..eab4bd6 100644 --- a/omamer/hierarchy.py +++ b/omamer/hierarchy.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/index.py b/omamer/index.py index 323bfe8..ba687cc 100644 --- a/omamer/index.py +++ b/omamer/index.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/main.py b/omamer/main.py index f8272e7..db4e41b 100644 --- a/omamer/main.py +++ b/omamer/main.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/merge_search.py b/omamer/merge_search.py index 29b33fd..9266b8d 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/results_reader.py b/omamer/results_reader.py index b925269..a16bf09 100644 --- a/omamer/results_reader.py +++ b/omamer/results_reader.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/sequence_buffer.py b/omamer/sequence_buffer.py index 8293ac6..c95a79f 100644 --- a/omamer/sequence_buffer.py +++ b/omamer/sequence_buffer.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/sequence_reader.py b/omamer/sequence_reader.py index 539e470..d5297ad 100644 --- a/omamer/sequence_reader.py +++ b/omamer/sequence_reader.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein 
assignment to sub-families
 
+    (C) 2024 Nikolai Romashchenko
     (C) 2022-2023 Alex Warwick Vesztrocy
     (C) 2019-2021 Victor Rossier and
                   Alex Warwick Vesztrocy
diff --git a/setup.py b/setup.py
index fa1f599..2d7ee08 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,7 @@
 """
     OMAmer - tree-driven and alignment-free protein assignment to sub-families
 
+    (C) 2024 Nikolai Romashchenko
     (C) 2022-2023 Alex Warwick Vesztrocy
     (C) 2019-2021 Victor Rossier and
                   Alex Warwick Vesztrocy

From 15070a99ae801ced1da7eba368a7bd9b9e2588a3 Mon Sep 17 00:00:00 2001
From: Nikolai Romashchenko
Date: Mon, 25 Nov 2024 10:48:32 +0100
Subject: [PATCH 17/17] Bump version: 2.0.4 → 2.1.0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 omamer/__init__.py | 2 +-
 setup.cfg          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/omamer/__init__.py b/omamer/__init__.py
index 8ccc8eb..4654e67 100644
--- a/omamer/__init__.py
+++ b/omamer/__init__.py
@@ -24,7 +24,7 @@ from datetime import date
 
 __packagename__ = "omamer"
-__version__ = "2.0.5"
+__version__ = "2.1.0"
 __copyright__ = "(C) 2019-{:d} Victor Rossier and Alex Warwick Vesztrocy and Nikolai Romashchenko ".format(
     date.today().year
 )
diff --git a/setup.cfg b/setup.cfg
index 9d2c31c..5f54ea0 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 2.0.4
+current_version = 2.1.0
 commit = True
 tag = False
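
The bump itself follows the bumpversion workflow: the tool reads the [bumpversion] section of setup.cfg shown above, rewrites the version strings, and (with commit = True) commits with its default "Bump version: old → new" message, which is exactly the subject of this patch. Note that the subject reports 2.0.4 → 2.1.0 — setup.cfg's current_version — while omamer/__init__.py moves from 2.0.5, so the two version strings had drifted apart before this patch; both land on 2.1.0. A sketch of a bump2version configuration that would keep them in sync going forward — the file section is an assumption for illustration, not necessarily the repository's actual setup.cfg:

[bumpversion]
current_version = 2.1.0
commit = True
tag = False

[bumpversion:file:omamer/__init__.py]
search = __version__ = "{current_version}"
replace = __version__ = "{new_version}"

With this in place, running "bumpversion minor" would update setup.cfg and omamer/__init__.py together in a single commit.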