Skip to content

Commit

Permalink
Merge branch 'fix-completeness-score' into dev
Browse files Browse the repository at this point in the history
  • Loading branch information
alpae committed Sep 12, 2024
2 parents a218a9a + 01a3623 commit 58c987d
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 31 deletions.
4 changes: 2 additions & 2 deletions FastOMA/_hog_class.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def __init__(self, input_instantiate, taxnomic_range:TreeNode, rhogid:str, msa:O
hog_members |= sub_hog.get_members()
self._members = hog_members # set.union(*tup) # this also include dubious_members
self._subhogs = list(input_instantiate) # full members of subhog, children
self._num_species_tax_speciestree = len(taxnomic_range.get_leaves())
self._num_species_tax_speciestree = taxnomic_range.size
self._representatives = [r for r in representatives]
representatives_id = set(r.get_id() for r in representatives)
assert len(representatives_id) == len(self._representatives), "Non-unique ids among representatives"
Expand Down Expand Up @@ -338,7 +338,7 @@ def _sorter_key(sh):
#hog_elemnt = ET.Element('orthologGroup', attrib={"id": str(self._hogid)})
hog_elemnt = ET.Element('orthologGroup', attrib={"id": str(self._hogid)}, )
num_species_tax_hog = len(set([i.split("||")[1] for i in self._members])) # 'tr|H2MU14|H2MU14_ORYLA||ORYLA||1056022282'
completeness_score = round(num_species_tax_hog/self._num_species_tax_speciestree,4)
completeness_score = round(num_species_tax_hog/self._num_species_tax_speciestree, 4)
property_element = ET.SubElement(hog_elemnt, "score", attrib={"id": "CompletenessScore", "value": str(completeness_score)})
property_element = ET.SubElement(hog_elemnt, "property", attrib={"name": "TaxRange", "value": str(self._tax_now.name)})

Expand Down
35 changes: 6 additions & 29 deletions FastOMA/_utils_subhog.py
Original file line number Diff line number Diff line change
Expand Up @@ -249,8 +249,7 @@ def genetree_sd(node_species_tree, gene_tree, genetree_msa_file_addr, conf_infer
return gene_tree, all_species_dubious_sd_dic, genetree_msa_file_addr



def prepare_species_tree(rhog_i, species_tree, rhogid):
def prepare_species_tree(rhog_i: List[SeqRecord], species_tree: Tree, rhogid: str):
"""
orthoxml_to_newick.py function for extracting orthoxml_to_newick.py subtree from the input species tree orthoxml_to_newick.py.k.orthoxml_to_newick.py pruning,
based on the names of species in the rootHOG.
Expand All @@ -266,46 +265,24 @@ def prepare_species_tree(rhog_i, species_tree, rhogid):
prot_id = rec.id.split("||")
prot_name = prot_id[2] # for debugging prot_id[0] readable prot name, for xml prot_id[2]
species_name = prot_id[1]
bird_dataset = True
if species_name.endswith("_") and not bird_dataset:
species_name = prot_id[1][:-1]
# if species_name == 'RAT_': species_name = "RATNO_"
# gene_id = prot_id[2]
species_names_rhog.append(species_name)
prot_names_rhog.append(prot_name)
assert len(species_names_rhog) > 0, "species names list is empty in rhog, probably issue in formating with || in previous step find rhog"

species_names_uniqe = set(species_names_rhog)

# annotate the species tree with the original number of children at each
# node (`size`). this is used to compute the correct completeness score
for n in species_tree.traverse():
n.add_feature("size", len(n))

first_common_ancestor_name = species_tree.get_common_ancestor(species_names_uniqe).name
species_tree.prune(species_names_uniqe, preserve_branch_length=True)
# todo check internal node with one child, we need to report for i or not ?
species_tree.name = first_common_ancestor_name
# add internal node name to the tree
# this has an issue with root name, cannot add the root name
# print(species_tree.write(format=1, format_root_node=True))
# counter_internal = 0
# for node in species_tree.traverse(strategy="postorder"):
# node_name = node.name
# num_leaves_no_name = 0
# if len(node_name) < 1:
# if node.is_leaf():
# node.name = "leaf_" + str(num_leaves_no_name)
# else:
# node_children = node.children
# # list_children_names = [str(node_child.name) for node_child in node_children]
# # node.name = '_'.join(list_children_names)
# # ?? to imrpove, if the species tree has internal node name, keep it,
# # then checn condition in _infer_subhog.py, where logger.info("Finding hogs for rhogid: "+str(rh
# node.name = "internal_" + str(counter_internal) # +"_rhg"+rhogid # for debuging
# counter_internal += 1
# print("Working on the following species tree.")
# print(species_tree.write(format=1, format_root_node=True))

return species_tree, species_names_rhog, prot_names_rhog



def label_sd_internal_nodes(tree_out, threshold_dubious_sd):
"""
for the input gene tree, run the species overlap method
Expand Down

0 comments on commit 58c987d

Please sign in to comment.