Skip to content

Commit

Permalink
[FIX] computation of Completeness score
Browse files Browse the repository at this point in the history
score needs to be computed on the original species tree, not the pruned
one used to efficiently process a single rhog.
  • Loading branch information
alpae committed Sep 12, 2024
1 parent a218a9a commit 01a3623
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 31 deletions.
4 changes: 2 additions & 2 deletions FastOMA/_hog_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def __init__(self, input_instantiate, taxnomic_range:TreeNode, rhogid:str, msa:O
hog_members |= sub_hog.get_members()
self._members = hog_members # set.union(*tup) # this also include dubious_members
self._subhogs = list(input_instantiate) # full members of subhog, children
self._num_species_tax_speciestree = len(taxnomic_range.get_leaves())
self._num_species_tax_speciestree = taxnomic_range.size
self._representatives = [r for r in representatives]
representatives_id = set(r.get_id() for r in representatives)
assert len(representatives_id) == len(self._representatives), "Non-unique ids among representatives"
Expand Down Expand Up @@ -338,7 +338,7 @@ def _sorter_key(sh):
#hog_elemnt = ET.Element('orthologGroup', attrib={"id": str(self._hogid)})
hog_elemnt = ET.Element('orthologGroup', attrib={"id": str(self._hogid)}, )
num_species_tax_hog = len(set([i.split("||")[1] for i in self._members])) # 'tr|H2MU14|H2MU14_ORYLA||ORYLA||1056022282'
completeness_score = round(num_species_tax_hog/self._num_species_tax_speciestree,4)
completeness_score = round(num_species_tax_hog/self._num_species_tax_speciestree, 4)
property_element = ET.SubElement(hog_elemnt, "score", attrib={"id": "CompletenessScore", "value": str(completeness_score)})
property_element = ET.SubElement(hog_elemnt, "property", attrib={"name": "TaxRange", "value": str(self._tax_now.name)})

Expand Down
35 changes: 6 additions & 29 deletions FastOMA/_utils_subhog.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,8 +249,7 @@ def genetree_sd(node_species_tree, gene_tree, genetree_msa_file_addr, conf_infer
return gene_tree, all_species_dubious_sd_dic, genetree_msa_file_addr



def prepare_species_tree(rhog_i, species_tree, rhogid):
def prepare_species_tree(rhog_i: List[SeqRecord], species_tree: Tree, rhogid: str):
"""
orthoxml_to_newick.py function for extracting orthoxml_to_newick.py subtree from the input species tree orthoxml_to_newick.py.k.orthoxml_to_newick.py pruning,
based on the names of species in the rootHOG.
Expand All @@ -266,46 +265,24 @@ def prepare_species_tree(rhog_i, species_tree, rhogid):
prot_id = rec.id.split("||")
prot_name = prot_id[2] # for debugging prot_id[0] readable prot name, for xml prot_id[2]
species_name = prot_id[1]
bird_dataset = True
if species_name.endswith("_") and not bird_dataset:
species_name = prot_id[1][:-1]
# if species_name == 'RAT_': species_name = "RATNO_"
# gene_id = prot_id[2]
species_names_rhog.append(species_name)
prot_names_rhog.append(prot_name)
assert len(species_names_rhog) > 0, "species names list is empty in rhog, probably issue in formating with || in previous step find rhog"

species_names_uniqe = set(species_names_rhog)

# annotate the species tree with the original number of children at each
# node (`size`). this is used to compute the correct completeness score
for n in species_tree.traverse():
n.add_feature("size", len(n))

first_common_ancestor_name = species_tree.get_common_ancestor(species_names_uniqe).name
species_tree.prune(species_names_uniqe, preserve_branch_length=True)
# todo check internal node with one child, we need to report for i or not ?
species_tree.name = first_common_ancestor_name
# add internal node name to the tree
# this has an issue with root name, cannot add the root name
# print(species_tree.write(format=1, format_root_node=True))
# counter_internal = 0
# for node in species_tree.traverse(strategy="postorder"):
# node_name = node.name
# num_leaves_no_name = 0
# if len(node_name) < 1:
# if node.is_leaf():
# node.name = "leaf_" + str(num_leaves_no_name)
# else:
# node_children = node.children
# # list_children_names = [str(node_child.name) for node_child in node_children]
# # node.name = '_'.join(list_children_names)
# # ?? to imrpove, if the species tree has internal node name, keep it,
# # then checn condition in _infer_subhog.py, where logger.info("Finding hogs for rhogid: "+str(rh
# node.name = "internal_" + str(counter_internal) # +"_rhg"+rhogid # for debuging
# counter_internal += 1
# print("Working on the following species tree.")
# print(species_tree.write(format=1, format_root_node=True))

return species_tree, species_names_rhog, prot_names_rhog



def label_sd_internal_nodes(tree_out, threshold_dubious_sd):
"""
for the input gene tree, run the species overlap method
Expand Down

0 comments on commit 01a3623

Please sign in to comment.