From 3d1a0102d879b7d2164e64d702030e698036e897 Mon Sep 17 00:00:00 2001 From: Valerio Arnaboldi Date: Thu, 5 Jul 2018 08:33:21 -0700 Subject: [PATCH] added orthology sentences - added orthology sentences and configuration options for WB - select best orthologs based on number of methods - if more than 3 orthologs with the same (highest) number of methods, use gene class to group them - special case for human orthologs - use gene families instead of classes --- config_wb.yml | 142 +++++++++--------------- genedescriptions/config_parser.py | 2 + genedescriptions/data_fetcher.py | 87 ++++++++++++++- genedescriptions/descriptions_rules.py | 87 ++++++++++++++- genedescriptions/descriptions_writer.py | 22 ++++ generate_desc_from_raw_files.py | 53 +++++++-- requirements.txt | 1 + tests/test_data_fetcher.py | 26 +++++ tests/test_descriptions_rules.py | 2 +- 9 files changed, 319 insertions(+), 103 deletions(-) diff --git a/config_wb.yml b/config_wb.yml index c71b439..ff76aac 100644 --- a/config_wb.yml +++ b/config_wb.yml @@ -5,44 +5,89 @@ generic_data_fetcher: # options for WormBase gene description generation - used when data_fetcher is set to 'wb_data_fetcher' wb_data_fetcher: - release: "WS265" + release: "WS266" raw_files_source: "ftp://ftp.ebi.ac.uk/pub/databases/wormbase/releases" organisms_to_process: # add the species to be processed to the following list. Entries must have the same text as in 'organisms' # option - c_elegans - #- b_malayi - #- c_brenneri + - b_malayi + - c_brenneri - c_briggsae - #- c_japonica - #- c_remanei - #- o_volvulus - #- p_pacificus - #- s_ratti - #- t_muris + - c_japonica + - c_remanei + - o_volvulus + - p_pacificus + - s_ratti + - t_muris organisms: + human: + full_name: "Homo sapiens" b_malayi: + full_name: "Brugia Malayi" project_id: "PRJNA10729" + ortholog: + - c_elegans + - o_volvulus + main_sister_species: c_elegans c_brenneri: + full_name: "Caenorhabditis brenneri" project_id: "PRJNA20035" + ortholog: + - c_elegans + main_sister_species: c_elegans c_briggsae: + full_name: "Caenorhabditis briggsae" project_id: "PRJNA10731" main_sister_species: c_elegans + ortholog: + - c_elegans c_elegans: + full_name: "Caenorhabditis elegans" project_id: "PRJNA13758" name: "C. elegans" + ortholog: + - human c_japonica: + full_name: "Caenorhabditis japonica" project_id: "PRJNA12591" + main_sister_species: c_elegans + ortholog: + - c_elegans c_remanei: + full_name: "Caenorhabditis remanei" project_id: "PRJNA53967" + main_sister_species: c_elegans + ortholog: + - c_elegans o_volvulus: + full_name: "Onchocerca volvulus" project_id: "PRJEB513" + ortholog: + - c_elegans + - b_malayi + main_sister_species: c_elegans p_pacificus: + full_name: "Pristionchus pacificus" project_id: "PRJNA12644" + ortholog: + - c_elegans + main_sister_species: c_elegans s_ratti: + full_name: "Strongyloides ratti" project_id: "PRJEB125" + ortholog: + - c_elegans + - b_malayi + - o_volvulus + main_sister_species: c_elegans t_muris: + full_name: "Trichuris Muris" project_id: "PRJEB126" + ortholog: + - c_elegans + - b_malayi + main_sister_species: c_elegans agr_data_fetcher: raw_files_source: "https://s3.amazonaws.com/mod-datadumps" @@ -107,6 +152,7 @@ go_sentences_options: "(.*)embryo development ending in birth or egg hatching(.*)": "\\1embryo development\\2" "(.*)synaptic transmission, (\\w+)(.*)": "\\1\\2 synaptic transmission\\3" "(.*)postsynaptic synaptic(.*)": "\\1postsynaptic\\2" + "(.*)binding(.*)": "\\1binding activity\\2" evidence_codes: EXP: group: EXPERIMENTAL @@ -290,24 +336,6 @@ go_sentences_options: prefix: "is involved in" postfix: "" special_cases: - - aspect: P - group: PHYLOGENETIC_ANALYSIS_AND_SEQUENCE_BASED_ANALYSIS - qualifier: "" - prefix: "is predicted to be involved in" - postfix: "" - special_cases: - - aspect: P - group: INFERRED_BY_CURATORS_AND_AUTHORS - qualifier: "" - prefix: "is predicted to be involved in" - postfix: "" - special_cases: - - aspect: P - group: ELECTRONIC_AND_COMPUTATIONAL_ANALYSIS - qualifier: "" - prefix: "is predicted to be involved in" - postfix: "" - special_cases: - aspect: C group: EXPERIMENTAL qualifier: "" @@ -348,66 +376,6 @@ go_sentences_options: match_regex: "intracellular$" prefix: "is" postfix: "" - - aspect: C - group: PHYLOGENETIC_ANALYSIS_AND_SEQUENCE_BASED_ANALYSIS - qualifier: "" - prefix: "is predicted to localize to" - postfix: "" - special_cases: - - id: 1 - match_regex: "intracellular$" - prefix: "is predicted to be" - postfix: "" - - aspect: C - group: PHYLOGENETIC_ANALYSIS_AND_SEQUENCE_BASED_ANALYSIS - qualifier: "colocalizes_with" - prefix: "is predicted to colocalize with" - postfix: "" - special_cases: - - id: 1 - match_regex: "intracellular$" - prefix: "is predicted to be" - postfix: "" - - aspect: C - group: INFERRED_BY_CURATORS_AND_AUTHORS - qualifier: "" - prefix: "is predicted to localize to" - postfix: "" - special_cases: - - id: 1 - match_regex: "intracellular$" - prefix: "is predicted to be" - postfix: "" - - aspect: C - group: INFERRED_BY_CURATORS_AND_AUTHORS - qualifier: "colocalizes_with" - prefix: "is predicted to colocalize with" - postfix: "" - special_cases: - - id: 1 - match_regex: "intracellular$" - prefix: "is predicted to be" - postfix: "" - - aspect: C - group: ELECTRONIC_AND_COMPUTATIONAL_ANALYSIS - qualifier: "" - prefix: "is predicted to localize to" - postfix: "" - special_cases: - - id: 1 - match_regex: "intracellular$" - prefix: "is predicted to be" - postfix: "" - - aspect: C - group: ELECTRONIC_AND_COMPUTATIONAL_ANALYSIS - qualifier: "colocalize_with" - prefix: "is predicted to colocalize with" - postfix: "" - special_cases: - - id: 1 - match_regex: "intracellular$" - prefix: "is predicted to be" - postfix: "" go_truncate_others_aggregation_word: several go_truncate_others_terms: diff --git a/genedescriptions/config_parser.py b/genedescriptions/config_parser.py index 54e91e3..08ec66c 100644 --- a/genedescriptions/config_parser.py +++ b/genedescriptions/config_parser.py @@ -338,4 +338,6 @@ def get_genedesc_writer(self): def get_genedesc_output_dir(self, genedesc_writer: str): return self.config[genedesc_writer + "_options"]["output_dir"] + def get_ortholog_species(self): + return diff --git a/genedescriptions/data_fetcher.py b/genedescriptions/data_fetcher.py index ffb5c15..02c6944 100644 --- a/genedescriptions/data_fetcher.py +++ b/genedescriptions/data_fetcher.py @@ -3,8 +3,9 @@ import shutil import os import re + from enum import Enum -from collections import namedtuple +from collections import namedtuple, defaultdict from typing import List, Iterable, Dict from ontobio import AssociationSetFactory from genedescriptions.descriptions_rules import SingleDescStats, set_all_depths_in_subgraph @@ -260,12 +261,37 @@ def set_gene_data(self, gene_data: List[Gene]): def load_gene_data_from_file(self): pass + @staticmethod + def get_human_gene_props(): + """ retrieve data for human genes, including Ensembl ID, symbol, name, and family name + Returns: + Dict[List[str]]: a dictionary of all human genes properties, indexed by Ensembl ID + + """ + human_genes_props = defaultdict(list) + human_content = urllib.request.urlopen("https://www.genenames.org/cgi-bin/download?col=gd_hgnc_id&col=g" + "d_app_sym&col=gd_app_name&col=gd_pub_ensembl_id&col=family.id&c" + "ol=family.name&status=Approved&status=Entry+Withdrawn&status_op" + "t=2&where=&order_by=gd_app_sym_sort&format=text&limit=&hgnc_dbt" + "ag=on&submit=submit") + header = True + for line in human_content: + if not header: + linearr = line.decode("utf-8").split("\t") + linearr[-1] = linearr[-1].strip() + if linearr[3] != "": + human_genes_props[linearr[3]] = [linearr[1], linearr[2], linearr[5]] + else: + header = False + return human_genes_props + class WBDataFetcher(DataFetcher): """data fetcher for WormBase raw files for a single species""" def __init__(self, raw_files_source: str, cache_location: str, release_version: str, species: str, project_id: str, - go_relations: List[str] = None, do_relations: List[str] = None, use_cache: bool = False): + go_relations: List[str] = None, do_relations: List[str] = None, use_cache: bool = False, + sister_sp_fullname: str = ""): """create a new data fetcher for WormBase. Files will be downloaded from WB ftp site. For convenience, file locations are automatically generated and stored in class variables ending in _url for remote filed and _cache_path for caching @@ -308,6 +334,13 @@ def __init__(self, raw_files_source: str, cache_location: str, release_version: ".do_annotations.daf.txt") self.do_associations_new_url = raw_files_source + '/' + release_version + '/ONTOLOGY/disease_association.' + \ release_version + '.daf.txt' + self.orthology_url = raw_files_source + '/' + release_version + '/species/' + species + '/' + project_id + \ + '/annotation/' + species + '.' + project_id + '.' + release_version + '.orthologs.txt.gz' + self.orthology_cache_path = os.path.join(cache_location, "wormbase", release_version, "species", species, + project_id, "annotation", species + '.' + project_id + '.' + + release_version + ".orthologs.txt.gz") + self.orthologs = defaultdict(lambda: defaultdict(list)) + self.sister_sp_fullname = sister_sp_fullname def load_gene_data_from_file(self) -> None: """load gene list from pre-set file location""" @@ -382,6 +415,55 @@ def load_associations_from_file(self, associations_type: DataType, associations_ ontology=self.do_ontology, terms_blacklist=exclusion_list) + def load_orthology_from_file(self): + orthology_file = self._get_cached_file(cache_path=self.orthology_cache_path, + file_source_url=self.orthology_url) + orthologs = defaultdict(list) + gene_id = "" + header = True + for line in open(orthology_file): + if not line.startswith("#"): + if line.strip() == "=": + header = True + self.orthologs["WB:" + gene_id] = orthologs + orthologs = defaultdict(list) + elif header: + gene_id = line.strip().split()[0] + header = False + else: + ortholog_arr = line.strip().split("\t") + orthologs[ortholog_arr[0]].append(ortholog_arr[1:4]) + + def get_best_orthologs_for_gene(self, gene_id: str, orth_species_full_name: List[str], + sister_species_data_fetcher: DataFetcher = None, + ecode_priority_list: List[str] = None): + best_orthologs = None + curr_orth_fullname = None + if len(orth_species_full_name) > 0: + for curr_orth_fullname in orth_species_full_name: + if curr_orth_fullname in self.orthologs[gene_id]: + orthologs = self.orthologs[gene_id][curr_orth_fullname] + orthologs_keys = [] + if len(orthologs) > 1: + for ortholog in orthologs: + if sister_species_data_fetcher: + orthologs_keys.append([ortholog[0], ortholog[1], len(ortholog[2].split(";")), + len(sister_species_data_fetcher.get_annotations_for_gene( + gene_id=ortholog[0], annot_type=DataType.GO, + priority_list=ecode_priority_list))]) + else: + orthologs_keys.append([ortholog[0], ortholog[1], len(ortholog[2].split(";"))]) + if sister_species_data_fetcher: + best_orthologs = [sorted(orthologs_keys, key=lambda x: (x[2], x[3]), reverse=True)[0][0:2]] + else: + best_orthologs = [[orth_key[0], orth_key[1]] for orth_key in + sorted(orthologs_keys, key=lambda x: x[2], reverse=True) if + orth_key[2] == max([orth[2] for orth in orthologs_keys])] + else: + best_orthologs = [[orthologs[0][0], orthologs[0][1]]] + break + return best_orthologs, curr_orth_fullname + def load_all_data_from_file(self, go_terms_replacement_regex: Dict[str, str] = None, go_terms_exclusion_list: List[str] = None, do_terms_replacement_regex: Dict[str, str] = None, @@ -413,4 +495,5 @@ def load_all_data_from_file(self, go_terms_replacement_regex: Dict[str, str] = N self.load_associations_from_file(associations_type=DataType.DO, associations_url=self.do_associations_url, associations_cache_path=self.do_associations_cache_path, exclusion_list=do_terms_exclusion_list) + self.load_orthology_from_file() diff --git a/genedescriptions/descriptions_rules.py b/genedescriptions/descriptions_rules.py index 193caf9..eb24412 100644 --- a/genedescriptions/descriptions_rules.py +++ b/genedescriptions/descriptions_rules.py @@ -1,5 +1,9 @@ +import json + import inflect import re +import urllib.request + from namedlist import namedlist from genedescriptions.ontology_tools import * from ontobio.ontol import Ontology @@ -24,12 +28,17 @@ def __init__(self): class GeneDesc(object): """gene description""" def __init__(self, gene_id: str, gene_name: str = "", description: str = "", go_description: str = "", - disease_description: str = "", stats: SingleDescStats = None): + disease_description: str = "", stats: SingleDescStats = None, publications: str = "", refs: str = "", + species: str = "", release_version: str = ""): self.gene_id = gene_id self.gene_name = gene_name self.description = description self.go_description = go_description self.disease_description = disease_description + self.publications = publications + self.refs = refs + self.species = species + self.release_version = release_version if stats: self.stats = stats else: @@ -372,3 +381,79 @@ def _get_single_sentence(node_ids: List[str], ontology: Ontology, aspect: str, e ancestors_covering_multiple_terms=ancestors_with_multiple_children) else: return None + + +def generate_ortholog_sentence(orthologs: List[List[str]], orthologs_sp_fullname: str, human_genes_props): + orth_sentence = None + if orthologs_sp_fullname == "Homo sapiens": + if len(orthologs) > 3: + gene_families = defaultdict(list) + for ortholog in orthologs: + if human_genes_props[ortholog[0]]: + gene_families[human_genes_props[ortholog[0]][2]].append(human_genes_props[ortholog[0]]) + if len(gene_families.values()) > 0: + gene_family_names = list(gene_families.keys()) + if len(gene_family_names) > 3: + gene_family_names = gene_family_names[0:3] + gene_names = [ortholog[0] + " (" + ortholog[1] + ")" for orthologs in gene_families.values() for + ortholog in orthologs] + if len(gene_names) > 3: + gene_names = gene_names[0:3] + family_word = "family" + if len(gene_family_names) > 1: + family_word = "families" + if len(gene_family_names) > 2: + ortholog_families_str = ", ".join(gene_family_names[0:-1]) + ", and " + gene_family_names[-1] + else: + ortholog_families_str = " and ".join(gene_family_names) + if len(gene_names) > 2: + ortholog_genes_str = ", ".join(gene_names[0:-1]) + ", and " + gene_names[-1] + else: + ortholog_genes_str = " and ".join(gene_names) + orth_sentence = "is an ortholog of members of the human " + ortholog_families_str + " gene " + \ + family_word + " including " + ortholog_genes_str + else: + symbol_name_arr = sorted([human_genes_props[best_orth[0]][0] + " (" + human_genes_props[best_orth[0]][1] + + ")" for best_orth in orthologs if human_genes_props[best_orth[0]]]) + if len(symbol_name_arr) > 0: + if len(symbol_name_arr) > 2: + orth_sentence = "is an ortholog of human " + ", ".join(symbol_name_arr[0:-1]) + ", and " + \ + symbol_name_arr[-1] + else: + orth_sentence = "is an ortholog of human " + " and ".join(symbol_name_arr) + else: + fullname_arr = orthologs_sp_fullname.split(" ") + if len(fullname_arr[0]) > 2: + fullname_arr[0] = fullname_arr[0][0] + "." + orthologs_sp_fullname = " ".join(fullname_arr) + if len(orthologs) > 3: + gene_classes = defaultdict(list) + for ortholog in orthologs: + gene_class_data = json.loads(urllib.request.urlopen("http://rest.wormbase.org/rest/field/gene/" + + ortholog[0] + "/gene_class").read()) + if "gene_class" in gene_class_data and gene_class_data["gene_class"]["data"] and "tag" in \ + gene_class_data["gene_class"]["data"] and "label" in \ + gene_class_data["gene_class"]["data"]["tag"]: + gene_classes[gene_class_data["gene_class"]["data"]["tag"]["label"]].append(ortholog) + classes_gene_symbols = list(gene_classes.keys()) + if len(classes_gene_symbols) > 0: + classes_word = "class" + if len(classes_gene_symbols) > 1: + classes_word = "classes" + if len(classes_gene_symbols) > 2: + orth_sentence = "is an ortholog of " + orthologs_sp_fullname + " " + \ + ", ".join(classes_gene_symbols[0:-1]) + ", and " + classes_gene_symbols[-1] + \ + " gene " + classes_word + else: + orth_sentence = "is an ortholog of " + orthologs_sp_fullname + " " + \ + " and ".join(classes_gene_symbols) + " gene " + classes_word + return orth_sentence + orthologs_symbols = [orth[1] for orth in orthologs] + if len(orthologs_symbols) > 2: + if len(orthologs_symbols) > 3: + orthologs_symbols = orthologs_symbols[0:3] + orth_sentence = "is an ortholog of " + orthologs_sp_fullname + " " + ", ".join(orthologs_symbols[0:-1]) + \ + ", and " + orthologs_symbols[-1] + else: + orth_sentence = "is an ortholog of " + orthologs_sp_fullname + " " + " and ".join(orthologs_symbols) + return orth_sentence diff --git a/genedescriptions/descriptions_writer.py b/genedescriptions/descriptions_writer.py index 8ea14b2..0b9a9d9 100644 --- a/genedescriptions/descriptions_writer.py +++ b/genedescriptions/descriptions_writer.py @@ -1,3 +1,4 @@ +import datetime import json from abc import ABCMeta, abstractmethod import numpy as np @@ -71,3 +72,24 @@ def write(self, file_path: str, pretty: bool = False, include_single_gene_stats: with open(file_path, "w") as outfile: json.dump(vars(json_serializable_self), outfile, indent=indent) + +class WBWriter(DescriptionsWriter): + def __init__(self): + super().__init__() + + def write(self, file_path: str): + """write the descriptions to a WB file + + Args: + file_path (str): the path to the file to write + """ + with open(file_path, "w") as outfile: + for genedesc in self.data: + now = datetime.datetime.now() + outfile.write(genedesc.gene_id + "\t" + str(now.year) + "-" + str(now.month) + "-" + str(now.day) + + "\t" + genedesc.publications + "\t" + genedesc.refs + "\t" + genedesc.description + "\t" + + genedesc.species + "\t" + "This description was generated automatically by a script " + "based on homology/orthology data, Gene Ontology (GO) " + "annotations, Disease ontology (DO) annotations, and tissue " + "expression data from the " + genedesc.release_version + + " version of WormBase)") diff --git a/generate_desc_from_raw_files.py b/generate_desc_from_raw_files.py index 295a341..208d602 100755 --- a/generate_desc_from_raw_files.py +++ b/generate_desc_from_raw_files.py @@ -4,7 +4,7 @@ import os from genedescriptions.config_parser import GenedescConfigParser -from genedescriptions.data_fetcher import WBDataFetcher, DataType +from genedescriptions.data_fetcher import WBDataFetcher, DataType, DataFetcher from genedescriptions.descriptions_rules import * from genedescriptions.descriptions_writer import JsonGDWriter, GeneDesc @@ -53,17 +53,24 @@ def main(): organisms_list = conf_parser.get_agr_organisms_to_process() else: organisms_list = conf_parser.get_wb_organisms_to_process() + human_genes_props = DataFetcher.get_human_gene_props() for organism in organisms_list: logging.info("processing organism " + organism) - sister_gene_name_id_map = {} sister_df = None - species = None species = conf_parser.get_wb_species() + sister_sp_fullname = "" + if "main_sister_species" in species[organism] and "full_name" in \ + species[species[organism]["main_sister_species"]]: + sister_sp_fullname = species[species[organism]["main_sister_species"]]["full_name"] + orthologs_sp_fullname = "" + if "ortholog" in species[organism] and all(["full_name" in species[ortholog_sp] for ortholog_sp in + species[organism]["ortholog"]]): + orthologs_sp_fullname = [species[ortholog_sp]["full_name"] for ortholog_sp in species[organism]["ortholog"]] df = WBDataFetcher(raw_files_source=conf_parser.get_raw_file_sources("wb_data_fetcher"), release_version=conf_parser.get_release("wb_data_fetcher"), species=organism, project_id=species[organism]["project_id"], cache_location=conf_parser.get_cache_location(), do_relations=None, - go_relations=["subClassOf", "BFO:0000050"]) + go_relations=["subClassOf", "BFO:0000050"], sister_sp_fullname=sister_sp_fullname) if "main_sister_species" in species[organism] and species[organism]["main_sister_species"]: sister_df = WBDataFetcher(raw_files_source=conf_parser.get_raw_file_sources("wb_data_fetcher"), release_version=conf_parser.get_release("wb_data_fetcher"), @@ -71,9 +78,10 @@ def main(): project_id=species[species[organism]["main_sister_species"]]["project_id"], cache_location=conf_parser.get_cache_location(), do_relations=None, go_relations=["subClassOf", "BFO:0000050"]) - sister_df.load_all_data_from_file() - for gene in sister_df.get_gene_data(): - sister_gene_name_id_map[gene.name] = gene.id + sister_df.load_all_data_from_file(go_terms_replacement_regex=conf_parser.get_go_rename_terms(), + go_terms_exclusion_list=conf_parser.get_go_terms_exclusion_list(), + do_terms_replacement_regex=None, + do_terms_exclusion_list=conf_parser.get_do_terms_exclusion_list()) df.load_all_data_from_file(go_terms_replacement_regex=conf_parser.get_go_rename_terms(), go_terms_exclusion_list=conf_parser.get_go_terms_exclusion_list(), do_terms_replacement_regex=None, @@ -81,8 +89,23 @@ def main(): desc_writer = JsonGDWriter() for gene in df.get_gene_data(): logging.debug("processing gene " + gene.name) - gene_desc = GeneDesc(gene_id=gene.id, gene_name=gene.name) + gene_desc = GeneDesc(gene_id=gene.id, gene_name=gene.name, + publications=", ".join([annot["publication"] for annot in df.get_annotations_for_gene( + gene.id, annot_type=DataType.GO, + priority_list=conf_parser.get_go_evidence_groups_priority_list())]), + refs=", ".join([annot["refs"] for annot in df.get_annotations_for_gene( + gene.id, annot_type=DataType.GO, + priority_list=conf_parser.get_go_evidence_groups_priority_list())]), + species=species[organism]["full_name"], + release_version=conf_parser.get_release("wb_data_fetcher")) joined_sent = [] + + best_orthologs, selected_orth_name = df.get_best_orthologs_for_gene( + gene.id, orth_species_full_name=orthologs_sp_fullname) + if best_orthologs: + orth_sent = generate_ortholog_sentence(best_orthologs, selected_orth_name, human_genes_props) + if orth_sent: + joined_sent.append(orth_sent) go_sent_generator = SentenceGenerator( annotations=df.get_annotations_for_gene(gene_id=gene.id, annot_type=DataType.GO, priority_list=conf_parser.get_go_annotations_priority(), @@ -125,17 +148,23 @@ def main(): joined_sent.append(disease_sent) if conf_parser.get_data_fetcher() == "wb_data_fetcher" and "main_sister_species" in species[organism] and \ - species[organism]["main_sister_species"] and gene.name.startswith("Cbr-") and gene.name[4:] in \ - sister_gene_name_id_map: + species[organism]["main_sister_species"] and df.get_best_orthologs_for_gene( + gene.id, orth_species_full_name=[sister_sp_fullname], sister_species_data_fetcher=sister_df, + ecode_priority_list=["EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP", "HGI", + "HEP"])[0]: + best_ortholog = df.get_best_orthologs_for_gene( + gene.id, orth_species_full_name=[sister_sp_fullname], sister_species_data_fetcher=sister_df, + ecode_priority_list=["EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP", "HGI", + "HEP"])[0][0] sister_sentences_generator = SentenceGenerator(sister_df.get_annotations_for_gene( - annot_type=DataType.GO, gene_id=sister_gene_name_id_map[gene.name[4:]], + annot_type=DataType.GO, gene_id="WB:" + best_ortholog[0], priority_list=("EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP", "HGI", "HEP"), desc_stats=gene_desc.stats), ontology=df.go_ontology, **go_sent_gen_common_props) sister_proc_sent = " and ".join([sentence.text for sentence in sister_sentences_generator.get_sentences( aspect='P', merge_groups_with_same_prefix=True, keep_only_best_group=True, **go_sent_common_props)]) if sister_proc_sent: joined_sent.append("in " + species[species[organism]["main_sister_species"]]["name"] + ", " + - gene.name[4:] + " " + sister_proc_sent) + best_ortholog[1] + " " + sister_proc_sent) if len(joined_sent) > 0: desc = "; ".join(joined_sent) + "." if len(desc) > 0: diff --git a/requirements.txt b/requirements.txt index 3269d56..d48f3e7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +urllib3 namedlist==1.7 inflect==0.3.1 PyYAML==3.12 diff --git a/tests/test_data_fetcher.py b/tests/test_data_fetcher.py index 0e039ef..9daa7cc 100644 --- a/tests/test_data_fetcher.py +++ b/tests/test_data_fetcher.py @@ -54,4 +54,30 @@ def test_load_do_associations_from_file(self): exclusion_list=self.conf_parser.get_do_terms_exclusion_list()) self.assertTrue(self.df.do_associations is not None) + def test_load_orthology_from_file(self): + species = self.conf_parser.get_wb_species() + df = WBDataFetcher(raw_files_source=self.conf_parser.get_raw_file_sources("wb_data_fetcher"), + release_version="WS265", species="c_briggsae", + project_id=species["c_briggsae"]["project_id"], + cache_location=self.conf_parser.get_cache_location(), do_relations=None, + go_relations=["subClassOf", "BFO:0000050"], sister_sp_fullname="Caenorhabditis elegans") + sister_df = WBDataFetcher(raw_files_source=self.conf_parser.get_raw_file_sources("wb_data_fetcher"), + release_version="WS265", species="c_elegans", + project_id=species["c_elegans"]["project_id"], + cache_location=self.conf_parser.get_cache_location(), do_relations=None, + go_relations=["subClassOf", "BFO:0000050"]) + sister_df.load_gene_data_from_file() + sister_df.load_ontology_from_file(ontology_type=DataType.GO, ontology_url=sister_df.go_ontology_url, + ontology_cache_path=sister_df.go_ontology_cache_path, + terms_replacement_regex=self.conf_parser.get_go_rename_terms()) + sister_df.load_associations_from_file(associations_type=DataType.GO, + associations_url=sister_df.go_associations_url, + associations_cache_path=sister_df.go_associations_cache_path, + exclusion_list=self.conf_parser.get_do_terms_exclusion_list()) + df.load_orthology_from_file(sister_species_data_fetcher=sister_df, + ecode_priority_list=["EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP", + "HGI", "HEP"]) + best_ortholog = df.get_best_sister_ortholog_for_gene("WB:WBGene00000307") + pass + diff --git a/tests/test_descriptions_rules.py b/tests/test_descriptions_rules.py index 17d0660..6fff700 100644 --- a/tests/test_descriptions_rules.py +++ b/tests/test_descriptions_rules.py @@ -10,7 +10,7 @@ class TestDescriptionsRules(unittest.TestCase): def setUp(self): this_dir = os.path.split(__file__)[0] - self.conf_parser = GenedescConfigParser(os.path.join(this_dir, os.path.pardir, "config_wb.yml")) + self.conf_parser = GenedescConfigParser(os.path.join(this_dir, os.path.pardir, "config_wb_non_c_elegans.yml")) def test_generate_sentences(self): df = WBDataFetcher(raw_files_source=self.conf_parser.get_raw_file_sources("wb_data_fetcher"),