Skip to content

Commit

Permalink
[ADD] extract pw relations from orthoxml
Browse files Browse the repository at this point in the history
  • Loading branch information
alpae committed Dec 12, 2023
1 parent b5f4d4c commit b6910b0
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 6 deletions.
16 changes: 15 additions & 1 deletion FastOMA.nf
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,20 @@ process collect_subhogs{
"""
}

process extract_pairwise_ortholog_relations {
publishDir params.output_folder, mode: 'copy'
input:
path orthoxml
output:
path "orthologs.tsv.gz"
script:
"""
fastoma-helper -vv pw-rel --orthoxml $orthoxml \
--out orthologs.tsv \
--type ortholog
"""
}

workflow {
proteome_folder = Channel.fromPath(params.proteome_folder, type: "dir", checkIfExists:true).first()
proteomes = Channel.fromPath(params.proteome_folder + "/*", type:'any', checkIfExists:true)
Expand All @@ -318,7 +332,7 @@ workflow {
channel.empty().concat(pickle_big_rhog, pickle_rest_rhog).set{ all_rhog_pickle }

(orthoxml_file, OrthologousGroupsFasta, OrthologousGroups_tsv, rootHOGs_tsv) = collect_subhogs(all_rhog_pickle.collect(), gene_id_dic_xml, omamer_rhogs, species_tree_checked, params.fasta_header_id_transformer)

extract_pairwise_ortholog_relations(orthoxml_file)
}

workflow.onComplete {
Expand Down
34 changes: 34 additions & 0 deletions FastOMA/helper_scripts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import argparse
from ._config import logger_hog as logger
from .zoo.utils import auto_open


def extract_pw_rels(args):
from lxml import etree
from .zoo.hog import transform
xml = etree.parse(args.orthoxml)
with auto_open(args.out, 'wt') as fout:
for p1, p2 in transform.iter_pairwise_relations(xml, rel_type=args.type, id_attribute="protId"):
fout.write(f"{p1}\t{p2}\n")


def main():
parser = argparse.ArgumentParser(description="FastOMA helper scripts")
parser.add_argument('-v', default=0, action="count", help="increase verbosity")
subparsers = parser.add_subparsers(required=True)

parser_pw = subparsers.add_parser('pw-rel')
parser_pw.add_argument("--type", choices=("ortholog", "paralog"), default="ortholog",
help="Type of relations to extract. either 'ortholog' or 'paralog'")
parser_pw.add_argument("--out", required=True, help="Path to output file")
parser_pw.add_argument("--orthoxml", required=True, help="Path to input orthoxml file")
parser_pw.set_defaults(func=extract_pw_rels)

conf = parser.parse_args()
logger.setLevel(level=30 - 10 * min(conf.v, 2))
logger.debug(conf)
conf.func(conf)


if __name__ == "__main__":
main()
26 changes: 22 additions & 4 deletions FastOMA/zoo/hog/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def leaf_label(self, leaf):

def _extract_pw(self, node):
if self.is_leaf(node):
return set([self.leaf_label(node)])
return {self.leaf_label(node)}
elif self.is_internal_node(node):
nodes_of_children = [self._extract_pw(child) for child in self.get_children(node)]
if self.shall_extract_from_node(node):
Expand Down Expand Up @@ -73,8 +73,18 @@ def check_rel_type(self, rel_type):


class OrthoXmlRelationExtractor(BaseRelationExtractor):
def __init__(self, fh):
def __init__(self, fh, id_attribute=None, **kwargs):
super(OrthoXmlRelationExtractor, self).__init__(fh)
if id_attribute is not None:
self.geneRef_to_id = {}
ns = "{http://orthoXML.org/2011/}"
for g in self.obj.findall(f'./{ns}species/{ns}database/{ns}genes/{ns}gene'):
try:
self.geneRef_to_id[g.get('id')] = g.get(id_attribute)
except AttributeError:
raise KeyError(f"gene {g} does not have an attribute '{id_attribute}'")
else:
self.geneRef_to_id = None

def default_node_list(self):
return self.obj.find(".//{http://orthoXML.org/2011/}groups")
Expand All @@ -86,7 +96,10 @@ def is_internal_node(self, node):
return lxml.etree.QName(node).localname in ('orthologGroup', 'paralogGroup')

def leaf_label(self, leaf):
return leaf.get('id')
res = leaf.get('id')
if self.geneRef_to_id:
res = self.geneRef_to_id[res]
return res

def is_leaf(self, node):
return lxml.etree.QName(node).localname == "geneRef"
Expand Down Expand Up @@ -149,6 +162,11 @@ def iter_pairwise_relations(obj, rel_type=None, node=None, **kwargs):
Defaults to the root of the tree or the <groups> tag
respectively.
:param id_attribute: applies only for orthoxml files. If parameter
is provided, instead of the internal geneRef-id, the value that
is stored in the gene under the given attribute is returned. So
meaningful values can be `protId` or `geneId`.
:param kwargs: additional arguments that are passed. So far
none will be used. Foreseen examples would be a callback
function on leaf nodes.
Expand All @@ -164,7 +182,7 @@ def iter_pairwise_relations(obj, rel_type=None, node=None, **kwargs):
[('2', '3'), ('2', '4')]
"""
if hasattr(obj, 'docinfo') and obj.docinfo.root_name == "orthoXML":
parser = OrthoXmlRelationExtractor(obj)
parser = OrthoXmlRelationExtractor(obj, **kwargs)
elif isinstance(obj, dpy.Tree):
parser = TreeRelationExtractor(obj)

Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ dependencies = [
"networkx",
]

[project.optinal-dependencies]
[project.optional-dependencies]
nextflow = [
"nextflow"
]
Expand All @@ -36,6 +36,7 @@ fastoma-check-input = "FastOMA.check_input:fastoma_check_input"
fastoma-collect-subhogs = "FastOMA.collect_subhogs:fastoma_collect_subhogs"
fastoma-infer-roothogs = "FastOMA.infer_roothogs:fastoma_infer_roothogs"
fastoma-infer-subhogs = "FastOMA.infer_subhogs:fastoma_infer_subhogs"
fastoma-helper = "FastOMA.helper_scripts:main"

[project.urls]
Homepage = "https://github.com/DessimozLab/FastOMA"
Expand Down

0 comments on commit b6910b0

Please sign in to comment.