From 01a3623e4834a91a81bf417c55d4e23a68c55cd6 Mon Sep 17 00:00:00 2001 From: Adrian Altenhoff Date: Thu, 12 Sep 2024 09:27:40 +0200 Subject: [PATCH] [FIX] computation of Completeness score score needs to be computed on the original species tree, not the pruned one used to efficiently process a single rhog. --- FastOMA/_hog_class.py | 4 ++-- FastOMA/_utils_subhog.py | 35 ++++++----------------------------- 2 files changed, 8 insertions(+), 31 deletions(-) diff --git a/FastOMA/_hog_class.py b/FastOMA/_hog_class.py index cf40a03..beb1c12 100644 --- a/FastOMA/_hog_class.py +++ b/FastOMA/_hog_class.py @@ -103,7 +103,7 @@ def __init__(self, input_instantiate, taxnomic_range:TreeNode, rhogid:str, msa:O hog_members |= sub_hog.get_members() self._members = hog_members # set.union(*tup) # this also include dubious_members self._subhogs = list(input_instantiate) # full members of subhog, children - self._num_species_tax_speciestree = len(taxnomic_range.get_leaves()) + self._num_species_tax_speciestree = taxnomic_range.size self._representatives = [r for r in representatives] representatives_id = set(r.get_id() for r in representatives) assert len(representatives_id) == len(self._representatives), "Non-unique ids among representatives" @@ -338,7 +338,7 @@ def _sorter_key(sh): #hog_elemnt = ET.Element('orthologGroup', attrib={"id": str(self._hogid)}) hog_elemnt = ET.Element('orthologGroup', attrib={"id": str(self._hogid)}, ) num_species_tax_hog = len(set([i.split("||")[1] for i in self._members])) # 'tr|H2MU14|H2MU14_ORYLA||ORYLA||1056022282' - completeness_score = round(num_species_tax_hog/self._num_species_tax_speciestree,4) + completeness_score = round(num_species_tax_hog/self._num_species_tax_speciestree, 4) property_element = ET.SubElement(hog_elemnt, "score", attrib={"id": "CompletenessScore", "value": str(completeness_score)}) property_element = ET.SubElement(hog_elemnt, "property", attrib={"name": "TaxRange", "value": str(self._tax_now.name)}) diff --git a/FastOMA/_utils_subhog.py b/FastOMA/_utils_subhog.py index f084160..f40af96 100644 --- a/FastOMA/_utils_subhog.py +++ b/FastOMA/_utils_subhog.py @@ -249,8 +249,7 @@ def genetree_sd(node_species_tree, gene_tree, genetree_msa_file_addr, conf_infer return gene_tree, all_species_dubious_sd_dic, genetree_msa_file_addr - -def prepare_species_tree(rhog_i, species_tree, rhogid): +def prepare_species_tree(rhog_i: List[SeqRecord], species_tree: Tree, rhogid: str): """ orthoxml_to_newick.py function for extracting orthoxml_to_newick.py subtree from the input species tree orthoxml_to_newick.py.k.orthoxml_to_newick.py pruning, based on the names of species in the rootHOG. @@ -266,46 +265,24 @@ def prepare_species_tree(rhog_i, species_tree, rhogid): prot_id = rec.id.split("||") prot_name = prot_id[2] # for debugging prot_id[0] readable prot name, for xml prot_id[2] species_name = prot_id[1] - bird_dataset = True - if species_name.endswith("_") and not bird_dataset: - species_name = prot_id[1][:-1] - # if species_name == 'RAT_': species_name = "RATNO_" - # gene_id = prot_id[2] species_names_rhog.append(species_name) prot_names_rhog.append(prot_name) assert len(species_names_rhog) > 0, "species names list is empty in rhog, probably issue in formating with || in previous step find rhog" species_names_uniqe = set(species_names_rhog) + # annotate the species tree with the original number of children at each + # node (`size`). this is used to compute the correct completeness score + for n in species_tree.traverse(): + n.add_feature("size", len(n)) + first_common_ancestor_name = species_tree.get_common_ancestor(species_names_uniqe).name species_tree.prune(species_names_uniqe, preserve_branch_length=True) # todo check internal node with one child, we need to report for i or not ? species_tree.name = first_common_ancestor_name - # add internal node name to the tree - # this has an issue with root name, cannot add the root name - # print(species_tree.write(format=1, format_root_node=True)) - # counter_internal = 0 - # for node in species_tree.traverse(strategy="postorder"): - # node_name = node.name - # num_leaves_no_name = 0 - # if len(node_name) < 1: - # if node.is_leaf(): - # node.name = "leaf_" + str(num_leaves_no_name) - # else: - # node_children = node.children - # # list_children_names = [str(node_child.name) for node_child in node_children] - # # node.name = '_'.join(list_children_names) - # # ?? to imrpove, if the species tree has internal node name, keep it, - # # then checn condition in _infer_subhog.py, where logger.info("Finding hogs for rhogid: "+str(rh - # node.name = "internal_" + str(counter_internal) # +"_rhg"+rhogid # for debuging - # counter_internal += 1 - # print("Working on the following species tree.") - # print(species_tree.write(format=1, format_root_node=True)) - return species_tree, species_names_rhog, prot_names_rhog - def label_sd_internal_nodes(tree_out, threshold_dubious_sd): """ for the input gene tree, run the species overlap method