diff --git a/omamer/HOGParser.py b/omamer/HOGParser.py index 6c394d4..e938140 100644 --- a/omamer/HOGParser.py +++ b/omamer/HOGParser.py @@ -2,6 +2,7 @@ HOGPROP - Propagation of Gene Ontology (GO) annotations through Hierarchical Orthologous Groups (HOGs) from the OMA project. + (C) 2024 Nikolai Romashchenko (C) 2015-2023 Alex Warwick Vesztrocy This file is part of HOGPROP. It contains a module for parsing an diff --git a/omamer/__init__.py b/omamer/__init__.py index 5df936b..4654e67 100644 --- a/omamer/__init__.py +++ b/omamer/__init__.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy @@ -23,7 +24,7 @@ from datetime import date __packagename__ = "omamer" -__version__ = "2.0.5" -__copyright__ = "(C) 2019-{:d} Victor Rossier and Alex Warwick Vesztrocy ".format( +__version__ = "2.1.0" +__copyright__ = "(C) 2019-{:d} Victor Rossier and Alex Warwick Vesztrocy and Nikolai Romashchenko ".format( date.today().year ) diff --git a/omamer/_clock.py b/omamer/_clock.py new file mode 100644 index 0000000..68313a6 --- /dev/null +++ b/omamer/_clock.py @@ -0,0 +1,38 @@ +""" + OMAmer - tree-driven and alignment-free protein assignment to sub-families + + (C) 2024 Nikolai Romashchenko + (C) 2022-2023 Alex Warwick Vesztrocy + (C) 2019-2021 Victor Rossier and + Alex Warwick Vesztrocy + + This file is part of OMAmer. + + OMAmer is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + OMAmer is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with OMAmer. If not, see . +""" + +import ctypes + +# Access the _PyTime_AsSecondsDouble and _PyTime_GetSystemClock functions from pythonapi +clock = ctypes.pythonapi._PyTime_GetSystemClock +as_seconds = ctypes.pythonapi._PyTime_AsSecondsDouble + +# Set the argument types and return types of the functions +clock.argtypes = [] +clock.restype = ctypes.c_int64 + +as_seconds.argtypes = [ctypes.c_int64] +as_seconds.restype = ctypes.c_double + + diff --git a/omamer/_runners.py b/omamer/_runners.py index 799ed25..14ba6fd 100644 --- a/omamer/_runners.py +++ b/omamer/_runners.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/_utils.py b/omamer/_utils.py index 32ffbdb..e1f0ae3 100644 --- a/omamer/_utils.py +++ b/omamer/_utils.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/database.py b/omamer/database.py index ce61f97..b861e05 100644 --- a/omamer/database.py +++ b/omamer/database.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/hierarchy.py b/omamer/hierarchy.py index d4f9dc3..eab4bd6 100644 --- a/omamer/hierarchy.py +++ b/omamer/hierarchy.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/index.py b/omamer/index.py index 323bfe8..ba687cc 100644 --- a/omamer/index.py +++ b/omamer/index.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/main.py b/omamer/main.py index cc7a6ca..db4e41b 100644 --- a/omamer/main.py +++ b/omamer/main.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy @@ -265,6 +266,7 @@ def get_thread_count(): help="Show metadata about an omamer database.", description="Show metadata about an existing omamer database", ) + info_parser.set_defaults(func=info_db) info_parser.add_argument( "-d", @@ -303,4 +305,4 @@ def get_thread_count(): # call the relevant runner func args.func(args) else: - parser.print_usage() + parser.print_usage() \ No newline at end of file diff --git a/omamer/merge_search.py b/omamer/merge_search.py index f0fb765..9266b8d 100644 --- a/omamer/merge_search.py +++ b/omamer/merge_search.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy @@ -20,8 +21,9 @@ You should have received a copy of the GNU Lesser General Public License along with OMAmer. If not, see . """ -from Rmath4 import pbinom +from Rmath4 import pbinom, phyper from numba.tests.support import captured_stdout +from numba.typed import List, Dict from property_manager import lazy_property, cached_property from time import time import numba @@ -38,14 +40,13 @@ is_taxon_implied, get_children, ) +from ._clock import clock, as_seconds # maximum neglogp to set -MAX_LOGP = 20000 +MAX_LOGP = 20000.0 -# ---- -# stats functions @numba.njit(nogil=True) def binom_neglogccdf(x, n, p): """ @@ -56,6 +57,10 @@ def binom_neglogccdf(x, n, p): # return -1.0 * np.log(sc.bdtrc(np.float64(x - 1), n, p)) return -1.0 * pbinom(x - 1, n, p, 0, 1) + # pbinom(n - 1, m, m/N, 0, 0) + # phyper(n - 1, m, N - m, m, 0, 0) + #return -1.0 * phyper(x - 1, n, n/p - n, n, 0, 1) + # ---- # family result sorting @@ -80,6 +85,52 @@ def fam_res_compare(x1, x2): return 0 +@numba.njit +def fam_res_less(x1, x2): + """ + Same as fam_res_compare, but operates as '<' + """ + if x1["normcount"] != x2["normcount"]: + return x1["normcount"] < x2["normcount"] + + if x1["overlap"] != x2["overlap"]: + return x1["overlap"] < x2["overlap"] + + if x1["pvalue"] != x2["pvalue"]: + return x1["pvalue"] < x2["pvalue"] + + return False + + +@numba.njit +def fam_res_le(x1, x2): + """ + Same as fam_res_compare, but operates as '<=' + """ + if not fam_res_less(x1, x2): + return x1["normcount"] == x2["normcount"] and \ + x1["overlap"] == x2["overlap"] and \ + x1["pvalue"] == x2["pvalue"] + + return True + + +@numba.njit +def fam_res_greater(x1, x2): + """ + Same as fam_res_compare, but operates as '>' + """ + return not fam_res_le(x1, x2) + + +@numba.njit +def fam_res_ge(x1, x2): + """ + Same as fam_res_compare, but operates as '>=' + """ + return not fam_res_less(x1, x2) + + @numba.njit def family_result_argsort(x, ii): """ @@ -115,20 +166,86 @@ def family_result_argsort(x, ii): @numba.njit -def family_result_sort(x): +def family_result_sort(x, k): """ Sort the family results according to normcount, overlap and pvalue (to break ties). this uses a quicksort implementation as np.argsort does not support struct type in numba. """ - # perform the family result sorting. + + # Quickselect top k results + idx = np.arange(len(x)) + _ = _select(x, idx, k, 0, len(x) - 1) + x = x[idx[:k]] + + # Now mergesort the selected results, because we need to + # report them sorted idx = family_result_argsort(x, list(range(len(x)))) y = np.zeros_like(x) - for i in numba.prange(len(x)): + for i in range(len(x)): y[i] = x[idx[i]] return y -# ---- +@numba.njit +def _swap(array, i, j): + tmp = array[i] + array[i] = array[j] + array[j] = tmp + + +@numba.njit +def _partition(x, idx, low, high): + """ + Index-based version of the partition algorithm of + quicksort. Juggles indexes of the idx array that + indexes the x array, considering the range [low, high]. + """ + mid = (low + high) >> 1 + + # Use median of three {low, middle, high} as the pivot + if fam_res_greater(x[idx[mid]], x[idx[low]]): + _swap(idx, mid, low) + if fam_res_greater(x[idx[high]], x[idx[mid]]): + _swap(idx, high, mid) + if fam_res_greater(x[idx[mid]], x[idx[low]]): + _swap(idx, low, mid) + + pivot = x[idx[mid]] + # Put the pivot in the end of the array + _swap(idx, mid, high) + + # Collect elements that are > pivot in the beginning + i = low + for j in range(low, high): + if fam_res_greater(x[idx[j]], pivot): + _swap(idx, i, j) + i += 1 + + # All indexes of idx in [0, i) are good now. + # Let's place the pivot back where it should be + _swap(idx, i, high) + return i + + +@numba.njit +def _select(x, idx, k, low, high): + """ + Select the k'th largest element of the x array + """ + if k >= len(x): + return len(x) + + i = _partition(x, idx, low, high) + while i != k: + if i < k: + low = i + 1 + i = _partition(x, idx, low, high) + else: + high = i - 1 + i = _partition(x, idx, low, high) + return idx[k] + + ## generic functions @numba.njit def get_fam_hog2parent(fam_ent, hog_tab): @@ -176,6 +293,24 @@ def custom_unique1d(ar): return aux[mask], perm[mask], np.diff(idx) +@numba.njit +def unique1d_linear(array): + """ + Find a set of unique elements in linear time + """ + unique_list = List() + index_list = List() + seen = set() + + for i in range(len(array)): + if array[i] not in seen: + seen.add(array[i]) + unique_list.append(array[i]) + index_list.append(i) + + return np.asarray(unique_list), np.asarray(index_list, dtype=np.uint32), None + + @numba.njit def parse_seq(s, DIGITS_AA_LOOKUP, n_kmers, k, trans, x_flag): """ @@ -183,30 +318,60 @@ def parse_seq(s, DIGITS_AA_LOOKUP, n_kmers, k, trans, x_flag): """ s_norm = DIGITS_AA_LOOKUP[s] r = np.zeros(n_kmers, dtype=np.uint32) # max kmer 7 - for i in numba.prange(n_kmers): - # numba can't do np.dot with non-float - for j in numba.prange(k): - r[i] += trans[j] * s_norm[i + j] - - # does k-mer contain any X? - x_seen = np.any(s_norm[i : i + k] == DIGITS_AA_LOOKUP[88]) # 88==b'X' - # if yes, replace it by the x_flag + + # compute the code of the first k-mer + for j in range(k): + r[0] += trans[j] * s_norm[j] + + # does k-mer contain any X? + x_seen = np.any(s_norm[0:k] == DIGITS_AA_LOOKUP[88]) + # if yes, replace it by the x_flag + r[0] = r[0] if not x_seen else x_flag + + # codes for other k-mers + for i in range(1, n_kmers): + if not x_seen: + # if the previous k-mer was valid, + # recompute the current code from the previous one + + # remove the first character from the code + shared = r[i-1] - (trans[0] * s_norm[i - 1]) + + # trans[-2] is the alphabet size + r[i] = shared * trans[-2] + trans[-1] * s_norm[i + k - 1] + + else: + # if the previous k-mer has Xs, + # just compute the code from scratch + for j in range(k): + r[i] += trans[j] * s_norm[i + j] + + x_seen = np.any(s_norm[i: i + k] == DIGITS_AA_LOOKUP[88]) r[i] = r[i] if not x_seen else x_flag - return custom_unique1d(r) + return unique1d_linear(r) @numba.njit -def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff): +def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff, + hog_counts, fam_counts, fam_lowloc, fam_highloc, + hit_fams, num_hit_fams, hit_hogs, num_hit_hogs): """ Perform the kmer search, using the index. """ - hog_counts = np.zeros(hog_tab.size, dtype=np.uint16) - fam_counts = np.zeros(fam_tab.size, dtype=np.uint16) + # Reinitialize counters modified by the previous call + for i in range(num_hit_fams): + family = hit_fams[i] + fam_counts[family] = 0 + fam_lowloc[family] = -1 + fam_highloc[family] = -1 + + for i in range(num_hit_hogs): + hog = hit_hogs[i] + hog_counts[hog] = 0 - # store lowest and higher k-mer locations - fam_lowloc = np.full(fam_tab.size, -1, dtype=np.int32) - fam_highloc = np.full(fam_tab.size, -1, dtype=np.int32) + num_hit_fams = 0 + num_hit_hogs = 0 # iterate unique k-mers for m in range(r1.shape[0]): @@ -216,18 +381,26 @@ def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff): # to ignore k-mers with X if kmer == x_flag: continue - else: - pass # get mapping to HOGs - x = table_idx[kmer : kmer + 2] - hogs = table_buff[x[0] : x[1]] + x = table_idx[kmer: kmer + 2] + hogs = table_buff[x[0]: x[1]] fams = hog_tab["FamOff"][hogs] - hog_counts[hogs] += np.uint16(1) - fam_counts[fams] += np.uint16(1) - # store lowest and highest locations + for hog in hogs: + if not hog_counts[hog]: + hit_hogs[num_hit_hogs] = hog + num_hit_hogs += 1 + + hog_counts[hog] += 1 + for fam_off in fams: + if not fam_counts[fam_off]: + hit_fams[num_hit_fams] = fam_off + num_hit_fams += 1 + + fam_counts[fam_off] += 1 + # initiate first location if fam_lowloc[fam_off] == -1: fam_lowloc[fam_off] = loc @@ -239,7 +412,7 @@ def search_seq_kmers(r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff): elif loc > fam_highloc[fam_off]: fam_highloc[fam_off] = loc - return (hog_counts, fam_counts, fam_lowloc, fam_highloc) + return num_hit_fams, num_hit_hogs ## functions to cumulate HOG k-mer counts @@ -377,6 +550,23 @@ def get_closest_taxa_from_ref(q2hog_off, ref_taxoff, tax_tab, hog_tab, chog_buff return q2closest_taxon +@numba.njit +def unzip_dict(data: numba.typed.Dict): + keys = np.zeros(len(data)) + values = np.zeros_like(keys) + for i, key in enumerate(data): + keys[i] = key + values[i] = data[key] + return keys, values + + +@numba.njit +def dict_slice(data: numba.typed.Dict, start: np.uint32, end: np.uint32) -> np.ndarray: + result = np.zeros(end - start, dtype=np.uint32) + for j in range(start, end): + result[j - start] = data.get(j, np.uint32(0)) + return result + class MergeSearch(object): def __init__(self, ki, include_extant_genes=False): assert ki.db.db.mode == "r", "Database must be opened in read mode." @@ -604,24 +794,24 @@ def generate(): @lazy_property def _lookup(self): def func( - family_results, - subfam_results, - seqs, - seqs_idx, - trans, - table_idx, - table_buff, - k, - DIGITS_AA_LOOKUP, - fam_tab, - hog_tab, - level_arr, - top_n_fams, - ref_fam_prob, - ref_hog_prob, - alpha_cutoff, - sst, - family_only, + family_results, + subfam_results, + seqs, + seqs_idx, + trans, + table_idx, + table_buff, + k, + DIGITS_AA_LOOKUP, + fam_tab, + hog_tab, + level_arr, + top_n_fams, + ref_fam_prob, + ref_hog_prob, + alpha_cutoff, + sst, + family_only, ): """ top_n_fams: number of family for which HOG scores are computed @@ -629,9 +819,21 @@ def func( # flags to ignore k-mers containing X x_flag = table_idx.size - 1 + # Arrays for thread-local data. We allocate them + # in advance to avoid doing so for every query + num_threads = numba.get_num_threads() + hog_counts = np.zeros((num_threads, hog_tab.size), dtype=np.uint16) + fam_counts = np.zeros((num_threads, fam_tab.size), dtype=np.uint16) + fam_lowloc = np.full((num_threads, fam_tab.size), -1, dtype=np.int32) + fam_highloc = np.full((num_threads, fam_tab.size), -1, dtype=np.int32) + hit_fams = np.zeros((num_threads, fam_tab.size), dtype=np.int32) + hit_hogs = np.zeros((num_threads, hog_tab.size), dtype=np.int32) + num_hit_fams = np.zeros(num_threads, dtype=np.uint32) + num_hit_hogs = np.zeros(num_threads, dtype=np.uint32) + for zz in numba.prange(len(seqs_idx) - 1): # load seq - s = seqs[seqs_idx[zz] : np.int64(seqs_idx[zz + 1] - 1)] + s = seqs[seqs_idx[zz]: np.int64(seqs_idx[zz + 1] - 1)] query_len = s.shape[0] n_kmers = query_len - (k - 1) @@ -648,22 +850,78 @@ def func( elif r1[0] == x_flag: continue + # Get thread-local data structures for search + thread_hit_fams = hit_fams[numba.get_thread_id()] + thread_hit_hogs = hit_hogs[numba.get_thread_id()] + thread_hog_counts = hog_counts[numba.get_thread_id()] + thread_fam_counts = fam_counts[numba.get_thread_id()] + thread_fam_lowloc = fam_lowloc[numba.get_thread_id()] + thread_fam_highloc = fam_highloc[numba.get_thread_id()] + thread_num_hit_fams = num_hit_fams[numba.get_thread_id()] + thread_num_hit_hogs = num_hit_hogs[numba.get_thread_id()] + # search using kmers - (hog_counts, fam_counts, fam_lowloc, fam_highloc) = search_seq_kmers( - r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff + thread_num_hit_fams, thread_num_hit_hogs = search_seq_kmers( + r1, p1, hog_tab, fam_tab, x_flag, table_idx, table_buff, + thread_hog_counts, thread_fam_counts, thread_fam_lowloc, thread_fam_highloc, + thread_hit_fams, thread_num_hit_fams, thread_hit_hogs, thread_num_hit_hogs ) - # identify families of interest. note: repeat(zeros_like..) numba hack - idx = np.argwhere(fam_counts > 0).flatten() + num_hit_fams[numba.get_thread_id()] = thread_num_hit_fams + num_hit_hogs[numba.get_thread_id()] = thread_num_hit_hogs + + # Identify families of interest + idx = thread_hit_fams[:thread_num_hit_fams] qres = np.repeat(np.zeros_like(family_results[zz, 0]), len(idx)) qres["id"][:] = idx - qres["count"][:] = fam_counts[idx] + qres["count"][:] = thread_fam_counts[idx] + + # 2. Fast family filtering + # - a. filter by count. We are only interested in families + # that have at least their expected number of k-mer matches, + # because that number should be already given by random + # (however is still insignificant). + mu = ref_fam_prob[qres["id"]] * len(r1) + qres = qres[qres["count"] >= mu] + + # - b. filter by sequence coverage. There is no point to + # compute p-value for families that are going to be hard + # filtered by coverage later anyway. + for i in range(len(qres)): + family_id = qres["id"][i] + qres["overlap"][i] = (thread_fam_highloc[family_id] - thread_fam_lowloc[family_id] + k) / query_len + + qres = qres[(qres["overlap"] >= (25 / query_len))] + if len(qres) == 0: + continue + + # - c. Filter by predicted p-value: perform the Chernoff KL-div test, + # that is, the Chernoff upper bound for the binomial X: + # P(X >= k) <= exp(-n D(k/n || p)) + # where D is Kullback-Leibler divergence. + # First, compute k / n, the empirical proportion of Bernoulli successes. + # We need to clip it a little for the case n = k, because it's used + # in logarithms later. For the other edge case, we already have k > 0 + # guaranteed, so we're good here + epsilon = 1e-10 + n = len(r1) + k_n = np.clip(qres["count"] / n, epsilon, 1 - epsilon) + + # Now, by demanding exp(-n KL(k/n || p)) < alpha, we guarantee P < alpha too. + # There is a theoretical chance of that P < alpha <= bound, and the test will + # fail with a false negative. I could not observe any instances of this. + p = ref_fam_prob[qres["id"]] + kl_div = k_n * np.log(k_n / p) + (1 - k_n) * np.log((1 - k_n) / (1 - p)) + qres = qres[kl_div > -np.log(alpha_cutoff)/n] + + if len(qres) == 0: + continue - # 1. compute p-value for each family. note: in negative log units + # 3. compute p-value for each family. note: in negative log units correction_factor = np.log(len(ref_fam_prob)) for i in numba.prange(len(qres)): qres["pvalue"][i] = min( - MAX_LOGP, + float(MAX_LOGP), max( 0.0, ( @@ -677,48 +935,30 @@ def func( ), ) - # 2. Filtering - # - a. filter to significant families (on p-value) + # Filter on the actual p-value alpha = -1.0 * np.log(alpha_cutoff) qres = qres[qres["pvalue"] >= alpha] # filter out 0 neg log p. alpha > 0 is normal. alpha = 0 is edge case. qres = qres if alpha > 0 else qres[qres["pvalue"] > 0] - - if len(qres) == 0: - continue - - # - b. filter on sequence coverage - qres["overlap"][:] = ( - fam_highloc[qres["id"]] - fam_lowloc[qres["id"]] + k - ) / query_len - - qres = qres[(qres["overlap"] >= (25 / query_len))] if len(qres) == 0: continue - # 3. Compute normalised count - # - a. compute the expected count - top_fam_expect_counts = ref_fam_prob[qres["id"]] * len(r1) - - # - b. compute the normalised count - qres["normcount"][:] = (qres["count"] - top_fam_expect_counts) / ( - len(r1) - top_fam_expect_counts + # 4. Compute normalised count + expected_count = ref_fam_prob[qres["id"]] * len(r1) + qres["normcount"][:] = (qres["count"] - expected_count) / ( + len(r1) - expected_count ) - # 4. Store results + # 5. Store results # - a. sort by normcount, then overlap, then p-value for tie-breaking - qres = family_result_sort(qres) + qres = family_result_sort(qres, top_n_fams) # - b. store results family_results["id"][zz, :top_n_fams] = qres["id"][:top_n_fams] + 1 family_results["pvalue"][zz, :top_n_fams] = qres["pvalue"][:top_n_fams] family_results["count"][zz, :top_n_fams] = qres["count"][:top_n_fams] - family_results["normcount"][zz, :top_n_fams] = qres["normcount"][ - :top_n_fams - ] - family_results["overlap"][zz, :top_n_fams] = qres["overlap"][ - :top_n_fams - ] + family_results["normcount"][zz, :top_n_fams] = qres["normcount"][:top_n_fams] + family_results["overlap"][zz, :top_n_fams] = qres["overlap"][:top_n_fams] # 5. Place within families for i in numba.prange(min(len(qres), top_n_fams)): @@ -735,7 +975,8 @@ def func( fam_level_offsets = get_fam_level_offsets(entry, level_arr) # cumulation of counts - c = hog_counts[hog_s:hog_e].copy() + c = thread_hog_counts[hog_s:hog_e].copy() + cumulate_counts_1fam(c, fam_level_offsets, fam_hog2parent) # new expected count, but using old cumulation @@ -744,7 +985,7 @@ def func( r1.size, fam_level_offsets, fam_hog2parent, - hog_counts[hog_s:hog_e], + thread_hog_counts[hog_s:hog_e], ref_hog_prob[hog_s:hog_e], ) diff --git a/omamer/results_reader.py b/omamer/results_reader.py index b925269..a16bf09 100644 --- a/omamer/results_reader.py +++ b/omamer/results_reader.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/sequence_buffer.py b/omamer/sequence_buffer.py index 8293ac6..c95a79f 100644 --- a/omamer/sequence_buffer.py +++ b/omamer/sequence_buffer.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/omamer/sequence_reader.py b/omamer/sequence_reader.py index 539e470..d5297ad 100644 --- a/omamer/sequence_reader.py +++ b/omamer/sequence_reader.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy diff --git a/setup.cfg b/setup.cfg index 9d2c31c..5f54ea0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 2.0.4 +current_version = 2.1.0 commit = True tag = False diff --git a/setup.py b/setup.py index fa1f599..2d7ee08 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,7 @@ """ OMAmer - tree-driven and alignment-free protein assignment to sub-families + (C) 2024 Nikolai Romashchenko (C) 2022-2023 Alex Warwick Vesztrocy (C) 2019-2021 Victor Rossier and Alex Warwick Vesztrocy