From 9fd3be942198087e7dce770a3d625be9324850a6 Mon Sep 17 00:00:00 2001 From: Valerio Arnaboldi Date: Thu, 5 Jul 2018 14:25:51 -0700 Subject: [PATCH] updated reports - updated classes to calculate reports to include more stats and remove old ones - code to compose WN descriptions moved from main to description_rules --- genedescriptions/data_fetcher.py | 20 +-- genedescriptions/descriptions_rules.py | 183 +++++++++++++++++++++--- genedescriptions/descriptions_writer.py | 59 ++++++-- generate_desc_from_raw_files.py | 91 ++---------- 4 files changed, 218 insertions(+), 135 deletions(-) diff --git a/genedescriptions/data_fetcher.py b/genedescriptions/data_fetcher.py index 02c6944..bd4f19b 100644 --- a/genedescriptions/data_fetcher.py +++ b/genedescriptions/data_fetcher.py @@ -4,23 +4,15 @@ import os import re -from enum import Enum -from collections import namedtuple, defaultdict +from collections import defaultdict from typing import List, Iterable, Dict from ontobio import AssociationSetFactory -from genedescriptions.descriptions_rules import SingleDescStats, set_all_depths_in_subgraph +from genedescriptions.descriptions_rules import set_all_depths_in_subgraph, Gene, DataType from ontobio.ontol_factory import OntologyFactory from ontobio.ontol import Ontology from ontobio.assocmodel import AssociationSet from ontobio.io.gafparser import GafParser -Gene = namedtuple('Gene', ['id', 'name', 'dead', 'pseudo']) - - -class DataType(Enum): - GO = 1 - DO = 2 - class DataFetcher(object): """retrieve data for gene descriptions from different sources""" @@ -204,8 +196,7 @@ def get_annotations_for_gene(self, gene_id: str, annot_type: DataType = DataType include_obsolete: bool = False, include_negative_results: bool = False, priority_list: Iterable = ("EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "IC", "ISS", "ISO", "ISA", "ISM", "IGC", "IBA", "IBD", "IKR", "IRD", - "RCA", "IEA"), - desc_stats: SingleDescStats = None) -> List[Dict]: + "RCA", "IEA")) -> List[Dict]: """ retrieve go annotations for a given gene id and a given type. The annotations are unique for each pair . This means that when multiple annotations for the same pair are found in the go data, the @@ -221,7 +212,6 @@ def get_annotations_for_gene(self, gene_id: str, annot_type: DataType = DataType term are found, only the one with highest priority is returned. The first element in the list has the highest priority, whereas the last has the lowest. Only annotations with evidence codes in the priority list are returned. All other annotations are ignored - desc_stats (SingleDescStats): an object containing the description statistics where to save the total number of annotations for the gene Returns: List[Dict]: the list of annotations for the given gene @@ -239,8 +229,6 @@ def get_annotations_for_gene(self, gene_id: str, annot_type: DataType = DataType not ontology.is_obsolete( annotation["object"]["id"])) and (include_negative_results or "NOT" not in annotation["qualifiers"])] - if desc_stats: - desc_stats.total_num_go_annotations = len(annotations) id_selected_annotation = {} for annotation in annotations: if annotation["evidence"]["type"] in priority_map.keys(): @@ -250,8 +238,6 @@ def get_annotations_for_gene(self, gene_id: str, annot_type: DataType = DataType id_selected_annotation[annotation["object"]["id"]] = annotation else: id_selected_annotation[annotation["object"]["id"]] = annotation - if desc_stats: - desc_stats.num_prioritized_go_annotations = len(id_selected_annotation.keys()) return [annotation for annotation in id_selected_annotation.values()] def set_gene_data(self, gene_data: List[Gene]): diff --git a/genedescriptions/descriptions_rules.py b/genedescriptions/descriptions_rules.py index eb24412..63a9fa1 100644 --- a/genedescriptions/descriptions_rules.py +++ b/genedescriptions/descriptions_rules.py @@ -1,13 +1,26 @@ import json +from collections import namedtuple +from enum import Enum import inflect import re import urllib.request from namedlist import namedlist + +from genedescriptions.config_parser import GenedescConfigParser from genedescriptions.ontology_tools import * from ontobio.ontol import Ontology + +class DataType(Enum): + GO = 1 + DO = 2 + + +Gene = namedtuple('Gene', ['id', 'name', 'dead', 'pseudo']) + + Sentence = namedlist('Sentence', ['prefix', 'terms_ids', 'postfix', 'text', 'aspect', 'evidence_group', 'terms_merged', 'additional_prefix', 'qualifier', 'ancestors_covering_multiple_terms']) @@ -15,26 +28,33 @@ class SingleDescStats(object): """statistics for a single gene description""" def __init__(self): - self.num_terms_notrim_nogroup_priority_nomerge = defaultdict(int) - self.num_terms_trim_nogroup_priority_nomerge = defaultdict(int) - self.num_terms_trim_group_priority_merge = defaultdict(int) - self.total_num_go_annotations = 0 - self.num_prioritized_go_annotations = 0 - self.terms_notrim_nogroup_priority_nomerge = defaultdict(list) - self.terms_trim_nogroup_priority_nomerge = defaultdict(list) - self.terms_trim_group_priority_merge = defaultdict(list) + self.total_number_go_annotations = 0 + self.number_initial_go_terms_f = 0 + self.number_initial_go_terms_p = 0 + self.number_initial_go_terms_c = 0 + self.number_final_go_terms_f = 0 + self.number_final_go_terms_p = 0 + self.number_final_go_terms_c = 0 + self.total_number_do_annotations = 0 + self.number_initial_do_terms = 0 + self.number_final_do_terms = 0 + self.number_final_do_term_covering_multiple_initial_do_terms_present = 0 class GeneDesc(object): """gene description""" - def __init__(self, gene_id: str, gene_name: str = "", description: str = "", go_description: str = "", - disease_description: str = "", stats: SingleDescStats = None, publications: str = "", refs: str = "", - species: str = "", release_version: str = ""): + def __init__(self, gene_id: str, gene_name: str = "", description: str = None, go_description: str = None, + go_function_description: str = None, go_process_description: str = None, + go_component_description: str = None, do_description: str = None, stats: SingleDescStats = None, + publications: str = "", refs: str = "", species: str = "", release_version: str = ""): self.gene_id = gene_id self.gene_name = gene_name self.description = description self.go_description = go_description - self.disease_description = disease_description + self.go_function_description = go_function_description + self.go_process_description = go_process_description + self.go_component_description = go_component_description + self.do_description = do_description self.publications = publications self.refs = refs self.species = species @@ -48,10 +68,28 @@ def __init__(self, gene_id: str, gene_name: str = "", description: str = "", go_ class DescriptionsStats(object): """overall statistics for a set of gene descriptions""" def __init__(self): - self.num_genes_with_go_sentence = 0 - self.average_num_go_terms_if_desc_trim_group_priority_merge = 0 - self.average_num_go_terms_if_desc_trim_nogroup_priority_nomerge = 0 - self.average_num_go_terms_if_desc_notrim_nogroup_priority_nomerge = 0 + self.total_number__of_genes = 0 + self.number_genes_with_non_null_description = 0 + self.number_genes_with_non_null_go_description = 0 + self.number_genes_with_non_null_go_function_description = 0 + self.number_genes_with_non_null_go_process_description = 0 + self.number_genes_with_non_null_go_component_description = 0 + self.number_genes_with_null_go_description = 0 + self.number_genes_with_more_than_3_initial_go_terms = 0 + self.number_genes_with_non_null_do_description = 0 + self.number_genes_with_null_do_description = 0 + self.number_genes_with_more_than_3_initial_do_terms = 0 + self.number_genes_with_final_do_terms_covering_multiple_initial_terms = 0 + self.average_number_initial_go_terms_f = 0 + self.average_number_initial_go_terms_p = 0 + self.average_number_initial_go_terms_c = 0 + self.average_number_final_go_terms_f = 0 + self.average_number_final_go_terms_p = 0 + self.average_number_final_go_terms_c = 0 + self.average_number_initial_do_terms = 0 + self.average_number_final_do_terms = 0 + self.average_number_go_annotations = 0 + self.average_number_do_annotations = 0 class SentenceMerger(object): @@ -116,7 +154,6 @@ def get_sentences(self, aspect: str, qualifier: str = '', keep_only_best_group: merge_num_terms_threshold: int = 3, merge_min_distance_from_root: dict = None, truncate_others_generic_word: str = "several", truncate_others_aspect_words: Dict[str, str] = None, - desc_stats: SingleDescStats = None, remove_successive_overlapped_terms: bool = True, exclude_terms_ids: List[str] = None, add_multiple_if_covers_more_children: bool = False) -> List[Sentence]: @@ -136,8 +173,6 @@ def get_sentences(self, aspect: str, qualifier: str = '', keep_only_best_group: merge_min_distance_from_root (dict): minimum distance from root terms for the selection of common ancestors during merging operations. Three values must be provided in the form of a dictionary with keys 'F', 'P', and 'C' for go aspect names and values integers indicating the threshold for each aspect - desc_stats (SingleDescStat): an object containing the description statistics where to save the total number - of annotations for the gene truncate_others_generic_word (str): a generic word to indicate that the set of terms reported in the sentence is only a subset of the original terms, e.g., 'several' truncate_others_aspect_words (Dict[str, str]): one word for each aspect describing the kind of terms that @@ -153,7 +188,7 @@ def get_sentences(self, aspect: str, qualifier: str = '', keep_only_best_group: if not merge_min_distance_from_root: merge_min_distance_from_root = {'F': 1, 'P': 1, 'C': 2, 'D': 3} if not truncate_others_aspect_words: - truncate_others_aspect_words = {'F': 'functions', 'P': 'processes', 'C': 'components'} + truncate_others_aspect_words = {'F': 'functions', 'P': 'processes', 'C': 'components', 'D': 'diseases'} sentences = [] terms_already_covered = set() evidence_group_priority = {eg: p for p, eg in enumerate(self.evidence_groups_priority_list)} @@ -165,9 +200,6 @@ def get_sentences(self, aspect: str, qualifier: str = '', keep_only_best_group: terms -= terms_already_covered if exclude_terms_ids: terms -= set(exclude_terms_ids) - if desc_stats: - desc_stats.num_terms_notrim_nogroup_priority_nomerge[aspect] += len(terms) - desc_stats.terms_notrim_nogroup_priority_nomerge[aspect].extend(terms) add_others = False if remove_parent_terms: terms_no_ancestors = terms - set([ancestor for node_id in terms for ancestor in @@ -457,3 +489,108 @@ def generate_ortholog_sentence(orthologs: List[List[str]], orthologs_sp_fullname else: orth_sentence = "is an ortholog of " + orthologs_sp_fullname + " " + " and ".join(orthologs_symbols) return orth_sentence + + +def compose_wormbase_description(gene: Gene, conf_parser: GenedescConfigParser, species, organism, df, + orthologs_sp_fullname, go_sent_gen_common_props, go_sent_common_props, + human_genes_props, do_sent_gen_common_prop, do_sent_common_props, sister_sp_fullname, + sister_df, desc_writer): + gene_desc = GeneDesc(gene_id=gene.id, gene_name=gene.name, + publications=", ".join([annot["publication"] for annot in df.get_annotations_for_gene( + gene.id, annot_type=DataType.GO, + priority_list=conf_parser.get_go_evidence_groups_priority_list())]), + refs=", ".join([annot["refs"] for annot in df.get_annotations_for_gene( + gene.id, annot_type=DataType.GO, + priority_list=conf_parser.get_go_evidence_groups_priority_list())]), + species=species[organism]["full_name"], + release_version=conf_parser.get_release("wb_data_fetcher")) + joined_sent = [] + + best_orthologs, selected_orth_name = df.get_best_orthologs_for_gene( + gene.id, orth_species_full_name=orthologs_sp_fullname) + if best_orthologs: + orth_sent = generate_ortholog_sentence(best_orthologs, selected_orth_name, human_genes_props) + if orth_sent: + joined_sent.append(orth_sent) + go_annotations = df.get_annotations_for_gene(gene_id=gene.id, annot_type=DataType.GO, + priority_list=conf_parser.get_go_annotations_priority()) + go_sent_generator = SentenceGenerator(annotations=go_annotations, ontology=df.go_ontology, + **go_sent_gen_common_props) + gene_desc.stats.total_number_go_annotations = len(go_annotations) + gene_desc.stats.number_initial_go_terms = {aspect: len(terms) for aspect, terms in + go_sent_generator.terms_groups.items()} + raw_func_sent = go_sent_generator.get_sentences(aspect='F', merge_groups_with_same_prefix=True, + keep_only_best_group=True, **go_sent_common_props) + gene_desc.stats.number_final_go_terms_f += sum([len(sentence.terms_ids) for sentence in raw_func_sent]) + func_sent = " and ".join([sentence.text for sentence in raw_func_sent]) + if func_sent: + joined_sent.append(func_sent) + contributes_to_raw_func_sent = go_sent_generator.get_sentences( + aspect='F', qualifier='contributes_to', merge_groups_with_same_prefix=True, keep_only_best_group=True, + **go_sent_common_props) + gene_desc.stats.number_final_go_terms_f += sum([len(sentence.terms_ids) for sentence in + contributes_to_raw_func_sent]) + contributes_to_func_sent = " and ".join([sentence.text for sentence in contributes_to_raw_func_sent]) + if contributes_to_func_sent: + joined_sent.append(contributes_to_func_sent) + raw_proc_sent = go_sent_generator.get_sentences(aspect='P', merge_groups_with_same_prefix=True, + keep_only_best_group=True, **go_sent_common_props) + gene_desc.stats.number_final_go_terms_p += sum([len(sentence.terms_ids) for sentence in raw_proc_sent]) + proc_sent = " and ".join([sentence.text for sentence in raw_proc_sent]) + if proc_sent: + joined_sent.append(proc_sent) + raw_comp_sent = go_sent_generator.get_sentences( + aspect='C', merge_groups_with_same_prefix=True, keep_only_best_group=True, **go_sent_common_props) + gene_desc.stats.number_final_go_terms_c += sum([len(sentence.terms_ids) for sentence in raw_comp_sent]) + comp_sent = " and ".join([sentence.text for sentence in raw_comp_sent]) + if comp_sent: + joined_sent.append(comp_sent) + colocalizes_with_raw_comp_sent = go_sent_generator.get_sentences( + aspect='C', qualifier='colocalizes_with', merge_groups_with_same_prefix=True, + keep_only_best_group=True, **go_sent_common_props) + gene_desc.stats.number_final_go_terms_c += sum([len(sentence.terms_ids) for sentence in + colocalizes_with_raw_comp_sent]) + colocalizes_with_comp_sent = " and ".join([sentence.text for sentence in colocalizes_with_raw_comp_sent]) + if colocalizes_with_comp_sent: + joined_sent.append(colocalizes_with_comp_sent) + do_annotations = df.get_annotations_for_gene(gene_id=gene.id, annot_type=DataType.DO, + priority_list=conf_parser.get_do_annotations_priority()) + do_sentence_generator = SentenceGenerator(annotations=do_annotations, ontology=df.do_ontology, + **do_sent_gen_common_prop) + gene_desc.stats.total_number_do_annotations = len(do_annotations) + gene_desc.stats.number_initial_do_terms = sum([len(terms) for terms in + do_sentence_generator.terms_groups.values()]) + raw_disease_sent = do_sentence_generator.get_sentences( + aspect='D', merge_groups_with_same_prefix=True, keep_only_best_group=False, **do_sent_common_props) + disease_sent = "; ".join([sentence.text for sentence in raw_disease_sent]) + if disease_sent: + joined_sent.append(disease_sent) + gene_desc.stats.number_final_do_terms += sum([len(sentence.terms_ids) for sentence in raw_disease_sent]) + if "(multiple)" in disease_sent: + gene_desc.stats.number_final_do_term_covering_multiple_initial_do_terms_present = \ + disease_sent.count("(multiple)") + if conf_parser.get_data_fetcher() == "wb_data_fetcher" and "main_sister_species" in species[organism] and \ + species[organism]["main_sister_species"] and df.get_best_orthologs_for_gene( + gene.id, orth_species_full_name=[sister_sp_fullname], sister_species_data_fetcher=sister_df, + ecode_priority_list=["EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP", "HGI", + "HEP"])[0]: + best_ortholog = df.get_best_orthologs_for_gene( + gene.id, orth_species_full_name=[sister_sp_fullname], sister_species_data_fetcher=sister_df, + ecode_priority_list=["EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP", "HGI", + "HEP"])[0][0] + sister_sentences_generator = SentenceGenerator(sister_df.get_annotations_for_gene( + annot_type=DataType.GO, gene_id="WB:" + best_ortholog[0], + priority_list=("EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP", "HGI", "HEP")), + ontology=df.go_ontology, **go_sent_gen_common_props) + sister_proc_sent = " and ".join([sentence.text for sentence in sister_sentences_generator.get_sentences( + aspect='P', merge_groups_with_same_prefix=True, keep_only_best_group=True, **go_sent_common_props)]) + if sister_proc_sent: + joined_sent.append("in " + species[species[organism]["main_sister_species"]]["name"] + ", " + + best_ortholog[1] + " " + sister_proc_sent) + if len(joined_sent) > 0: + desc = "; ".join(joined_sent) + "." + if len(desc) > 0: + gene_desc.description = desc[0].upper() + desc[1:] + else: + gene_desc.description = None + desc_writer.add_gene_desc(gene_desc) \ No newline at end of file diff --git a/genedescriptions/descriptions_writer.py b/genedescriptions/descriptions_writer.py index 0b9a9d9..368ec50 100644 --- a/genedescriptions/descriptions_writer.py +++ b/genedescriptions/descriptions_writer.py @@ -10,8 +10,8 @@ class DescriptionsWriter(metaclass=ABCMeta): @abstractmethod def __init__(self): - self.data = [] self.general_stats = DescriptionsStats() + self.data = [] @abstractmethod def write(self): @@ -27,17 +27,52 @@ def add_gene_desc(self, gene_description: GeneDesc): def _calculate_stats(self): """calculate overall stats and populate fields""" - self.general_stats.average_num_go_terms_if_desc_trim_group_priority_merge = np.average( - [sum(gene_desc.stats.num_terms_trim_group_priority_merge.values()) for - gene_desc in self.data if gene_desc.description != "No description available"]) - self.general_stats.average_num_go_terms_if_desc_trim_nogroup_priority_nomerge = np.average( - [sum(gene_desc.stats.num_terms_trim_nogroup_priority_nomerge.values()) for gene_desc in self.data if - gene_desc.description != "No description available"]) - self.general_stats.average_num_go_terms_if_desc_notrim_nogroup_priority_nomerge = np.average( - [sum(gene_desc.stats.num_terms_notrim_nogroup_priority_nomerge.values()) for gene_desc in self.data if - gene_desc.description != "No description available"]) - self.general_stats.num_genes_with_go_sentence = len([gene_desc for gene_desc in self.data if - gene_desc.description != "No description available"]) + self.general_stats.average_number_initial_go_terms_f = np.average( + [gene_desc.stats.number_initial_go_terms_f for gene_desc in self.data if gene_desc.description is not None]) + self.general_stats.average_number_initial_go_terms_p = np.average( + [gene_desc.stats.number_initial_go_terms_p for gene_desc in self.data if gene_desc.description is not None]) + self.general_stats.average_number_initial_go_terms_c = np.average( + [gene_desc.stats.number_initial_go_terms_c for gene_desc in self.data if gene_desc.description is not None]) + self.general_stats.average_number_final_go_terms_f = np.average( + [gene_desc.stats.number_final_go_terms_f for gene_desc in self.data if gene_desc.description is not None]) + self.general_stats.average_number_final_go_terms_p = np.average( + [gene_desc.stats.number_final_go_terms_p for gene_desc in self.data if gene_desc.description is not None]) + self.general_stats.average_number_final_go_terms_c = np.average( + [gene_desc.stats.number_final_go_terms_c for gene_desc in self.data if gene_desc.description is not None]) + self.general_stats.average_number_initial_do_terms = np.average( + [gene_desc.stats.number_initial_do_terms for gene_desc in self.data if gene_desc.description is not None]) + self.general_stats.average_number_final_do_terms = np.average( + [gene_desc.stats.number_final_do_terms for gene_desc in self.data if gene_desc.description is not None]) + self.general_stats.total_number_of_genes = len(self.data) + self.general_stats.number_genes_with_non_null_description = len([gene_desc for gene_desc in self.data if + gene_desc.description is not None]) + self.general_stats.number_genes_with_non_null_go_description = len([gene_desc for gene_desc in self.data if + gene_desc.go_description is not None]) + self.general_stats.number_genes_with_null_go_description = len([gene_desc for gene_desc in self.data if + gene_desc.go_description is None]) + self.general_stats.number_genes_with_non_null_go_function_description = \ + len([gene_desc for gene_desc in self.data if gene_desc.go_function_description is not None]) + self.general_stats.number_genes_with_non_null_go_process_description = \ + len([gene_desc for gene_desc in self.data if gene_desc.go_process_description is not None]) + self.general_stats.number_genes_with_non_null_go_component_description = \ + len([gene_desc for gene_desc in self.data if gene_desc.go_component_description is not None]) + self.general_stats.number_genes_with_more_than_3_initial_go_terms = \ + len([gene_desc for gene_desc in self.data if any(gene_desc.stats.number_initial_go_terms.values()) > 3]) + self.general_stats.number_genes_with_non_null_do_description = len([gene_desc for gene_desc in self.data if + gene_desc.do_description is not None]) + self.general_stats.number_genes_with_null_do_description = len([gene_desc for gene_desc in self.data if + gene_desc.do_description is None]) + self.general_stats.number_genes_with_more_than_3_initial_do_terms = \ + len([gene_desc for gene_desc in self.data if gene_desc.stats.number_initial_do_terms > 3]) + self.general_stats.number_genes_with_final_do_terms_covering_multiple_initial_terms = \ + sum([gene_desc.final_do_term_covering_multiple_initial_do_terms_present for gene_desc in self.data if + gene_desc.do_description is not None]) + self.general_stats.average_number_go_annotations = np.average( + [gene_desc.stats.total_number_go_annotations for gene_desc in self.data if gene_desc.description is not + None]) + self.general_stats.average_number_do_annotations = np.average( + [gene_desc.stats.total_number_do_annotations for gene_desc in self.data if gene_desc.description is not + None]) class JsonGDWriter(DescriptionsWriter): diff --git a/generate_desc_from_raw_files.py b/generate_desc_from_raw_files.py index 208d602..884d7c5 100755 --- a/generate_desc_from_raw_files.py +++ b/generate_desc_from_raw_files.py @@ -89,89 +89,14 @@ def main(): desc_writer = JsonGDWriter() for gene in df.get_gene_data(): logging.debug("processing gene " + gene.name) - gene_desc = GeneDesc(gene_id=gene.id, gene_name=gene.name, - publications=", ".join([annot["publication"] for annot in df.get_annotations_for_gene( - gene.id, annot_type=DataType.GO, - priority_list=conf_parser.get_go_evidence_groups_priority_list())]), - refs=", ".join([annot["refs"] for annot in df.get_annotations_for_gene( - gene.id, annot_type=DataType.GO, - priority_list=conf_parser.get_go_evidence_groups_priority_list())]), - species=species[organism]["full_name"], - release_version=conf_parser.get_release("wb_data_fetcher")) - joined_sent = [] - - best_orthologs, selected_orth_name = df.get_best_orthologs_for_gene( - gene.id, orth_species_full_name=orthologs_sp_fullname) - if best_orthologs: - orth_sent = generate_ortholog_sentence(best_orthologs, selected_orth_name, human_genes_props) - if orth_sent: - joined_sent.append(orth_sent) - go_sent_generator = SentenceGenerator( - annotations=df.get_annotations_for_gene(gene_id=gene.id, annot_type=DataType.GO, - priority_list=conf_parser.get_go_annotations_priority(), - desc_stats=gene_desc.stats), - ontology=df.go_ontology, **go_sent_gen_common_props) - func_sent = " and ".join([sentence.text for sentence in go_sent_generator.get_sentences( - aspect='F', merge_groups_with_same_prefix=True, keep_only_best_group=True, )]) - if func_sent: - joined_sent.append(func_sent) - contributes_to_func_sent = " and ".join([sentence.text for sentence in go_sent_generator.get_sentences( - aspect='F', qualifier='contributes_to', merge_groups_with_same_prefix=True, - keep_only_best_group=True, desc_stats=gene_desc.stats, **go_sent_common_props)]) - if contributes_to_func_sent: - joined_sent.append(contributes_to_func_sent) - proc_sent = " and ".join([sentence.text for sentence in go_sent_generator.get_sentences( - aspect='P', merge_groups_with_same_prefix=True, keep_only_best_group=True, - desc_stats=gene_desc.stats, **go_sent_common_props)]) - if proc_sent: - joined_sent.append(proc_sent) - comp_sent = " and ".join([sentence.text for sentence in go_sent_generator.get_sentences( - aspect='C', merge_groups_with_same_prefix=True, keep_only_best_group=True, - desc_stats=gene_desc.stats, **go_sent_common_props)]) - if comp_sent: - joined_sent.append(comp_sent) - colocalizes_with_comp_sent = " and ".join([sentence.text for sentence in go_sent_generator.get_sentences( - aspect='C', qualifier='colocalizes_with', merge_groups_with_same_prefix=True, - desc_stats=gene_desc.stats, keep_only_best_group=True, **go_sent_common_props)]) - if colocalizes_with_comp_sent: - joined_sent.append(colocalizes_with_comp_sent) - - do_sentence_generator = SentenceGenerator( - df.get_annotations_for_gene(gene_id=gene.id, annot_type=DataType.DO, - priority_list=conf_parser.get_do_annotations_priority(), - desc_stats=gene_desc.stats), - ontology=df.do_ontology, **do_sent_gen_common_prop) - disease_sent = "; ".join([sentence.text for sentence in do_sentence_generator.get_sentences( - aspect='D', merge_groups_with_same_prefix=True, keep_only_best_group=False, desc_stats=gene_desc.stats, - **do_sent_common_props)]) - if disease_sent: - joined_sent.append(disease_sent) - - if conf_parser.get_data_fetcher() == "wb_data_fetcher" and "main_sister_species" in species[organism] and \ - species[organism]["main_sister_species"] and df.get_best_orthologs_for_gene( - gene.id, orth_species_full_name=[sister_sp_fullname], sister_species_data_fetcher=sister_df, - ecode_priority_list=["EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP", "HGI", - "HEP"])[0]: - best_ortholog = df.get_best_orthologs_for_gene( - gene.id, orth_species_full_name=[sister_sp_fullname], sister_species_data_fetcher=sister_df, - ecode_priority_list=["EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP", "HGI", - "HEP"])[0][0] - sister_sentences_generator = SentenceGenerator(sister_df.get_annotations_for_gene( - annot_type=DataType.GO, gene_id="WB:" + best_ortholog[0], - priority_list=("EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP", "HGI", "HEP"), - desc_stats=gene_desc.stats), ontology=df.go_ontology, **go_sent_gen_common_props) - sister_proc_sent = " and ".join([sentence.text for sentence in sister_sentences_generator.get_sentences( - aspect='P', merge_groups_with_same_prefix=True, keep_only_best_group=True, **go_sent_common_props)]) - if sister_proc_sent: - joined_sent.append("in " + species[species[organism]["main_sister_species"]]["name"] + ", " + - best_ortholog[1] + " " + sister_proc_sent) - if len(joined_sent) > 0: - desc = "; ".join(joined_sent) + "." - if len(desc) > 0: - gene_desc.description = desc[0].upper() + desc[1:] - else: - gene_desc.description = None - desc_writer.add_gene_desc(gene_desc) + compose_wormbase_description(gene=gene, conf_parser=conf_parser, species=species, organism=organism, df=df, + orthologs_sp_fullname=orthologs_sp_fullname, + go_sent_gen_common_props=go_sent_gen_common_props, + go_sent_common_props=go_sent_common_props, human_genes_props=human_genes_props, + do_sent_gen_common_prop=go_sent_gen_common_props, + do_sent_common_props=do_sent_common_props, + sister_sp_fullname=sister_sp_fullname, sister_df=sister_df, + desc_writer=desc_writer) desc_writer.write(os.path.join(conf_parser.get_genedesc_output_dir(conf_parser.get_genedesc_writer()), organism + "_with_stats.json"), pretty=True, include_single_gene_stats=True) desc_writer.write(os.path.join(conf_parser.get_genedesc_output_dir(conf_parser.get_genedesc_writer()),