From 9fd3be942198087e7dce770a3d625be9324850a6 Mon Sep 17 00:00:00 2001
From: Valerio Arnaboldi <valearna@caltech.edu>
Date: Thu, 5 Jul 2018 14:25:51 -0700
Subject: [PATCH] updated reports

- updated classes to calculate reports to include more stats and remove old ones
- code to compose WB descriptions moved from main to descriptions_rules
---
 genedescriptions/data_fetcher.py        |  20 +--
 genedescriptions/descriptions_rules.py  | 183 +++++++++++++++++++++---
 genedescriptions/descriptions_writer.py |  59 ++++++--
 generate_desc_from_raw_files.py         |  91 ++----------
 4 files changed, 218 insertions(+), 135 deletions(-)

diff --git a/genedescriptions/data_fetcher.py b/genedescriptions/data_fetcher.py
index 02c6944..bd4f19b 100644
--- a/genedescriptions/data_fetcher.py
+++ b/genedescriptions/data_fetcher.py
@@ -4,23 +4,15 @@
 import os
 import re
 
-from enum import Enum
-from collections import namedtuple, defaultdict
+from collections import defaultdict
 from typing import List, Iterable, Dict
 from ontobio import AssociationSetFactory
-from genedescriptions.descriptions_rules import SingleDescStats, set_all_depths_in_subgraph
+from genedescriptions.descriptions_rules import set_all_depths_in_subgraph, Gene, DataType
 from ontobio.ontol_factory import OntologyFactory
 from ontobio.ontol import Ontology
 from ontobio.assocmodel import AssociationSet
 from ontobio.io.gafparser import GafParser
 
-Gene = namedtuple('Gene', ['id', 'name', 'dead', 'pseudo'])
-
-
-class DataType(Enum):
-    GO = 1
-    DO = 2
-
 
 class DataFetcher(object):
     """retrieve data for gene descriptions from different sources"""
@@ -204,8 +196,7 @@ def get_annotations_for_gene(self, gene_id: str, annot_type: DataType = DataType
                                  include_obsolete: bool = False, include_negative_results: bool = False,
                                  priority_list: Iterable = ("EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "IC", "ISS",
                                                             "ISO", "ISA", "ISM", "IGC", "IBA", "IBD", "IKR", "IRD",
-                                                            "RCA", "IEA"),
-                                 desc_stats: SingleDescStats = None) -> List[Dict]:
+                                                            "RCA", "IEA")) -> List[Dict]:
         """
         retrieve go annotations for a given gene id and a given type. The annotations are unique for each pair
         <gene_id, term_id>. This means that when multiple annotations for the same pair are found in the go data, the
@@ -221,7 +212,6 @@ def get_annotations_for_gene(self, gene_id: str, annot_type: DataType = DataType
                 term are found, only the one with highest priority is returned. The first element in the list has the
                 highest priority, whereas the last has the lowest. Only annotations with evidence codes in the priority
                 list are returned. All other annotations are ignored
-            desc_stats (SingleDescStats): an object containing the description statistics where to save the total number
                 of annotations for the gene
         Returns:
             List[Dict]: the list of annotations for the given gene
@@ -239,8 +229,6 @@ def get_annotations_for_gene(self, gene_id: str, annot_type: DataType = DataType
                                                                                       not ontology.is_obsolete(
                                                                                           annotation["object"]["id"]))
                        and (include_negative_results or "NOT" not in annotation["qualifiers"])]
-        if desc_stats:
-            desc_stats.total_num_go_annotations = len(annotations)
         id_selected_annotation = {}
         for annotation in annotations:
             if annotation["evidence"]["type"] in priority_map.keys():
@@ -250,8 +238,6 @@ def get_annotations_for_gene(self, gene_id: str, annot_type: DataType = DataType
                         id_selected_annotation[annotation["object"]["id"]] = annotation
                 else:
                     id_selected_annotation[annotation["object"]["id"]] = annotation
-        if desc_stats:
-            desc_stats.num_prioritized_go_annotations = len(id_selected_annotation.keys())
         return [annotation for annotation in id_selected_annotation.values()]
 
     def set_gene_data(self, gene_data: List[Gene]):
diff --git a/genedescriptions/descriptions_rules.py b/genedescriptions/descriptions_rules.py
index eb24412..63a9fa1 100644
--- a/genedescriptions/descriptions_rules.py
+++ b/genedescriptions/descriptions_rules.py
@@ -1,13 +1,26 @@
 import json
+from collections import namedtuple
+from enum import Enum
 
 import inflect
 import re
 import urllib.request
 
 from namedlist import namedlist
+
+from genedescriptions.config_parser import GenedescConfigParser
 from genedescriptions.ontology_tools import *
 from ontobio.ontol import Ontology
 
+
+class DataType(Enum):  # annotation types accepted as annot_type by get_annotations_for_gene
+    GO = 1  # annotations turned into sentences with df.go_ontology
+    DO = 2  # annotations turned into sentences with df.do_ontology
+
+
+Gene = namedtuple('Gene', ['id', 'name', 'dead', 'pseudo'])  # compose_wormbase_description reads id and name
+
+
 Sentence = namedlist('Sentence', ['prefix', 'terms_ids', 'postfix', 'text', 'aspect', 'evidence_group', 'terms_merged',
                                   'additional_prefix', 'qualifier', 'ancestors_covering_multiple_terms'])
 
@@ -15,26 +28,33 @@
 class SingleDescStats(object):
     """statistics for a single gene description"""
     def __init__(self):
-        self.num_terms_notrim_nogroup_priority_nomerge = defaultdict(int)
-        self.num_terms_trim_nogroup_priority_nomerge = defaultdict(int)
-        self.num_terms_trim_group_priority_merge = defaultdict(int)
-        self.total_num_go_annotations = 0
-        self.num_prioritized_go_annotations = 0
-        self.terms_notrim_nogroup_priority_nomerge = defaultdict(list)
-        self.terms_trim_nogroup_priority_nomerge = defaultdict(list)
-        self.terms_trim_group_priority_merge = defaultdict(list)
+        self.total_number_go_annotations = 0  # set by compose_wormbase_description to len(go_annotations)
+        self.number_initial_go_terms_f = 0  # NOTE(review): compose_wormbase_description never updates the three
+        self.number_initial_go_terms_p = 0  # number_initial_go_terms_* fields; it stores a dict in a separate
+        self.number_initial_go_terms_c = 0  # number_initial_go_terms attribute instead
+        self.number_final_go_terms_f = 0  # the number_final_go_terms_* fields sum the terms_ids of the sentences
+        self.number_final_go_terms_p = 0  # generated for the 'F', 'P' and 'C' aspects respectively
+        self.number_final_go_terms_c = 0
+        self.total_number_do_annotations = 0  # set by compose_wormbase_description to len(do_annotations)
+        self.number_initial_do_terms = 0  # sum of the sizes of the terms groups of the DO sentence generator
+        self.number_final_do_terms = 0  # sum of the terms_ids of the generated disease sentences
+        self.number_final_do_term_covering_multiple_initial_do_terms_present = 0  # count of "(multiple)" markers
 
 
 class GeneDesc(object):
     """gene description"""
-    def __init__(self, gene_id: str, gene_name: str = "", description: str = "", go_description: str = "",
-                 disease_description: str = "", stats: SingleDescStats = None, publications: str = "", refs: str = "",
-                 species: str = "", release_version: str = ""):
+    def __init__(self, gene_id: str, gene_name: str = "", description: str = None, go_description: str = None,
+                 go_function_description: str = None, go_process_description: str = None,
+                 go_component_description: str = None, do_description: str = None, stats: SingleDescStats = None,
+                 publications: str = "", refs: str = "", species: str = "", release_version: str = ""):
         self.gene_id = gene_id
         self.gene_name = gene_name
         self.description = description
         self.go_description = go_description
-        self.disease_description = disease_description
+        self.go_function_description = go_function_description
+        self.go_process_description = go_process_description
+        self.go_component_description = go_component_description
+        self.do_description = do_description
         self.publications = publications
         self.refs = refs
         self.species = species
@@ -48,10 +68,28 @@ def __init__(self, gene_id: str, gene_name: str = "", description: str = "", go_
 class DescriptionsStats(object):
     """overall statistics for a set of gene descriptions"""
     def __init__(self):
-        self.num_genes_with_go_sentence = 0
-        self.average_num_go_terms_if_desc_trim_group_priority_merge = 0
-        self.average_num_go_terms_if_desc_trim_nogroup_priority_nomerge = 0
-        self.average_num_go_terms_if_desc_notrim_nogroup_priority_nomerge = 0
+        self.total_number_of_genes = 0  # must match the attribute written by DescriptionsWriter._calculate_stats
+        self.number_genes_with_non_null_description = 0
+        self.number_genes_with_non_null_go_description = 0
+        self.number_genes_with_non_null_go_function_description = 0
+        self.number_genes_with_non_null_go_process_description = 0
+        self.number_genes_with_non_null_go_component_description = 0
+        self.number_genes_with_null_go_description = 0
+        self.number_genes_with_more_than_3_initial_go_terms = 0
+        self.number_genes_with_non_null_do_description = 0
+        self.number_genes_with_null_do_description = 0
+        self.number_genes_with_more_than_3_initial_do_terms = 0
+        self.number_genes_with_final_do_terms_covering_multiple_initial_terms = 0
+        self.average_number_initial_go_terms_f = 0  # the averages are computed by _calculate_stats over the genes
+        self.average_number_initial_go_terms_p = 0  # whose description is not None
+        self.average_number_initial_go_terms_c = 0
+        self.average_number_final_go_terms_f = 0
+        self.average_number_final_go_terms_p = 0
+        self.average_number_final_go_terms_c = 0
+        self.average_number_initial_do_terms = 0
+        self.average_number_final_do_terms = 0
+        self.average_number_go_annotations = 0
+        self.average_number_do_annotations = 0
 
 
 class SentenceMerger(object):
@@ -116,7 +154,6 @@ def get_sentences(self, aspect: str, qualifier: str = '', keep_only_best_group:
                       merge_num_terms_threshold: int = 3, merge_min_distance_from_root: dict = None,
                       truncate_others_generic_word: str = "several",
                       truncate_others_aspect_words: Dict[str, str] = None,
-                      desc_stats: SingleDescStats = None,
                       remove_successive_overlapped_terms: bool = True,
                       exclude_terms_ids: List[str] = None,
                       add_multiple_if_covers_more_children: bool = False) -> List[Sentence]:
@@ -136,8 +173,6 @@ def get_sentences(self, aspect: str, qualifier: str = '', keep_only_best_group:
             merge_min_distance_from_root (dict): minimum distance from root terms for the selection of common ancestors
                 during merging operations. Three values must be provided in the form of a dictionary with keys 'F', 'P',
                 and 'C' for go aspect names and values integers indicating the threshold for each aspect
-            desc_stats (SingleDescStat): an object containing the description statistics where to save the total number
-                of annotations for the gene
             truncate_others_generic_word (str): a generic word to indicate that the set of terms reported in the
                 sentence is only a subset of the original terms, e.g., 'several'
             truncate_others_aspect_words (Dict[str, str]): one word for each aspect describing the kind of terms that
@@ -153,7 +188,7 @@ def get_sentences(self, aspect: str, qualifier: str = '', keep_only_best_group:
         if not merge_min_distance_from_root:
             merge_min_distance_from_root = {'F': 1, 'P': 1, 'C': 2, 'D': 3}
         if not truncate_others_aspect_words:
-            truncate_others_aspect_words = {'F': 'functions', 'P': 'processes', 'C': 'components'}
+            truncate_others_aspect_words = {'F': 'functions', 'P': 'processes', 'C': 'components', 'D': 'diseases'}
         sentences = []
         terms_already_covered = set()
         evidence_group_priority = {eg: p for p, eg in enumerate(self.evidence_groups_priority_list)}
@@ -165,9 +200,6 @@ def get_sentences(self, aspect: str, qualifier: str = '', keep_only_best_group:
                 terms -= terms_already_covered
             if exclude_terms_ids:
                 terms -= set(exclude_terms_ids)
-            if desc_stats:
-                desc_stats.num_terms_notrim_nogroup_priority_nomerge[aspect] += len(terms)
-                desc_stats.terms_notrim_nogroup_priority_nomerge[aspect].extend(terms)
             add_others = False
             if remove_parent_terms:
                 terms_no_ancestors = terms - set([ancestor for node_id in terms for ancestor in
@@ -457,3 +489,108 @@ def generate_ortholog_sentence(orthologs: List[List[str]], orthologs_sp_fullname
         else:
             orth_sentence = "is an ortholog of " + orthologs_sp_fullname + " " + " and ".join(orthologs_symbols)
     return orth_sentence
+
+
+def compose_wormbase_description(gene: Gene, conf_parser: GenedescConfigParser, species, organism, df,
+                                 orthologs_sp_fullname, go_sent_gen_common_props, go_sent_common_props,
+                                 human_genes_props, do_sent_gen_common_prop, do_sent_common_props, sister_sp_fullname,
+                                 sister_df, desc_writer):
+    """compose the description of a gene, collect its statistics and add the result to a descriptions writer
+
+    The description joins, in this order, the ortholog sentence, the GO function, process and component sentences,
+    the disease sentence and, when the data fetcher is "wb_data_fetcher" and a main sister species is set, the
+    process sentence of the best ortholog in that species. It is set to None when no sentence is generated.
+
+    Args:
+        gene (Gene): the gene to describe
+        conf_parser (GenedescConfigParser): configuration parser providing priority lists, data fetcher and release
+        species: species properties indexed by organism ('full_name', 'main_sister_species' and 'name' are read)
+        organism: key of the organism of the gene in the species properties
+        df: data fetcher providing annotations, best orthologs and the GO and DO ontologies
+        orthologs_sp_fullname: species full names passed to the data fetcher to select the best orthologs
+        go_sent_gen_common_props: keyword arguments passed to every GO SentenceGenerator
+        go_sent_common_props: keyword arguments passed to every get_sentences call on GO sentence generators
+        human_genes_props: properties passed to generate_ortholog_sentence
+        do_sent_gen_common_prop: keyword arguments passed to the DO SentenceGenerator
+        do_sent_common_props: keyword arguments passed to the get_sentences call on the DO sentence generator
+        sister_sp_fullname: full name of the main sister species
+        sister_df: data fetcher of the main sister species
+        desc_writer: writer to which the gene description is added
+    """
+    # publications and refs are built from the same annotations: fetch them once
+    evidence_annotations = df.get_annotations_for_gene(
+        gene.id, annot_type=DataType.GO, priority_list=conf_parser.get_go_evidence_groups_priority_list())
+    gene_desc = GeneDesc(gene_id=gene.id, gene_name=gene.name,
+                         publications=", ".join([annot["publication"] for annot in evidence_annotations]),
+                         refs=", ".join([annot["refs"] for annot in evidence_annotations]),
+                         species=species[organism]["full_name"],
+                         release_version=conf_parser.get_release("wb_data_fetcher"))
+    joined_sent = []
+
+    best_orthologs, selected_orth_name = df.get_best_orthologs_for_gene(
+        gene.id, orth_species_full_name=orthologs_sp_fullname)
+    if best_orthologs:
+        orth_sent = generate_ortholog_sentence(best_orthologs, selected_orth_name, human_genes_props)
+        if orth_sent:
+            joined_sent.append(orth_sent)
+    go_annotations = df.get_annotations_for_gene(gene_id=gene.id, annot_type=DataType.GO,
+                                                 priority_list=conf_parser.get_go_annotations_priority())
+    go_sent_generator = SentenceGenerator(annotations=go_annotations, ontology=df.go_ontology,
+                                          **go_sent_gen_common_props)
+    gene_desc.stats.total_number_go_annotations = len(go_annotations)
+    # NOTE(review): stored as a dict; the number_initial_go_terms_f/p/c fields of SingleDescStats are not set here
+    gene_desc.stats.number_initial_go_terms = {aspect: len(terms) for aspect, terms in
+                                               go_sent_generator.terms_groups.items()}
+    final_go_terms = {'F': 0, 'P': 0, 'C': 0}
+    # one sentence per (aspect, qualifier) pair, in the order in which they appear in the description
+    for aspect, qualifier in (('F', ''), ('F', 'contributes_to'), ('P', ''), ('C', ''), ('C', 'colocalizes_with')):
+        raw_sent = go_sent_generator.get_sentences(aspect=aspect, qualifier=qualifier, keep_only_best_group=True,
+                                                   merge_groups_with_same_prefix=True, **go_sent_common_props)
+        final_go_terms[aspect] += sum(len(sentence.terms_ids) for sentence in raw_sent)
+        go_sent = " and ".join(sentence.text for sentence in raw_sent)
+        if go_sent:
+            joined_sent.append(go_sent)
+    gene_desc.stats.number_final_go_terms_f += final_go_terms['F']
+    gene_desc.stats.number_final_go_terms_p += final_go_terms['P']
+    gene_desc.stats.number_final_go_terms_c += final_go_terms['C']
+    do_annotations = df.get_annotations_for_gene(gene_id=gene.id, annot_type=DataType.DO,
+                                                 priority_list=conf_parser.get_do_annotations_priority())
+    do_sentence_generator = SentenceGenerator(annotations=do_annotations, ontology=df.do_ontology,
+                                              **do_sent_gen_common_prop)
+    gene_desc.stats.total_number_do_annotations = len(do_annotations)
+    gene_desc.stats.number_initial_do_terms = sum(len(terms) for terms in
+                                                  do_sentence_generator.terms_groups.values())
+    raw_disease_sent = do_sentence_generator.get_sentences(
+        aspect='D', merge_groups_with_same_prefix=True, keep_only_best_group=False, **do_sent_common_props)
+    disease_sent = "; ".join(sentence.text for sentence in raw_disease_sent)
+    if disease_sent:
+        joined_sent.append(disease_sent)
+    gene_desc.stats.number_final_do_terms += sum(len(sentence.terms_ids) for sentence in raw_disease_sent)
+    # count() returns 0 when the marker is absent, so no membership test is needed
+    gene_desc.stats.number_final_do_term_covering_multiple_initial_do_terms_present = \
+        disease_sent.count("(multiple)")
+
+    sister_ecodes = ("EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP", "HGI", "HEP")
+    best_sister_orthologs = None
+    if conf_parser.get_data_fetcher() == "wb_data_fetcher" and "main_sister_species" in species[organism] and \
+            species[organism]["main_sister_species"]:
+        # looked up once: the result is both the condition and the source of the best ortholog
+        best_sister_orthologs = df.get_best_orthologs_for_gene(
+            gene.id, orth_species_full_name=[sister_sp_fullname], sister_species_data_fetcher=sister_df,
+            ecode_priority_list=list(sister_ecodes))[0]
+    if best_sister_orthologs:
+        best_ortholog = best_sister_orthologs[0]
+        sister_sentences_generator = SentenceGenerator(sister_df.get_annotations_for_gene(
+            annot_type=DataType.GO, gene_id="WB:" + best_ortholog[0], priority_list=sister_ecodes),
+            ontology=df.go_ontology, **go_sent_gen_common_props)
+        sister_proc_sent = " and ".join([sentence.text for sentence in sister_sentences_generator.get_sentences(
+            aspect='P', merge_groups_with_same_prefix=True, keep_only_best_group=True, **go_sent_common_props)])
+        if sister_proc_sent:
+            joined_sent.append("in " + species[species[organism]["main_sister_species"]]["name"] + ", " +
+                               best_ortholog[1] + " " + sister_proc_sent)
+    if joined_sent:
+        desc = "; ".join(joined_sent) + "."
+        gene_desc.description = desc[0].upper() + desc[1:]
+    else:
+        gene_desc.description = None
+    desc_writer.add_gene_desc(gene_desc)
\ No newline at end of file
diff --git a/genedescriptions/descriptions_writer.py b/genedescriptions/descriptions_writer.py
index 0b9a9d9..368ec50 100644
--- a/genedescriptions/descriptions_writer.py
+++ b/genedescriptions/descriptions_writer.py
@@ -10,8 +10,8 @@ class DescriptionsWriter(metaclass=ABCMeta):
 
     @abstractmethod
     def __init__(self):
-        self.data = []
         self.general_stats = DescriptionsStats()
+        self.data = []
 
     @abstractmethod
     def write(self):
@@ -27,17 +27,52 @@ def add_gene_desc(self, gene_description: GeneDesc):
 
     def _calculate_stats(self):
         """calculate overall stats and populate fields"""
-        self.general_stats.average_num_go_terms_if_desc_trim_group_priority_merge = np.average(
-            [sum(gene_desc.stats.num_terms_trim_group_priority_merge.values()) for
-             gene_desc in self.data if gene_desc.description != "No description available"])
-        self.general_stats.average_num_go_terms_if_desc_trim_nogroup_priority_nomerge = np.average(
-            [sum(gene_desc.stats.num_terms_trim_nogroup_priority_nomerge.values()) for gene_desc in self.data if
-             gene_desc.description != "No description available"])
-        self.general_stats.average_num_go_terms_if_desc_notrim_nogroup_priority_nomerge = np.average(
-            [sum(gene_desc.stats.num_terms_notrim_nogroup_priority_nomerge.values()) for gene_desc in self.data if
-             gene_desc.description != "No description available"])
-        self.general_stats.num_genes_with_go_sentence = len([gene_desc for gene_desc in self.data if
-                                                             gene_desc.description != "No description available"])
+        self.general_stats.average_number_initial_go_terms_f = np.average(
+            [gene_desc.stats.number_initial_go_terms_f for gene_desc in self.data if gene_desc.description is not None])
+        self.general_stats.average_number_initial_go_terms_p = np.average(
+            [gene_desc.stats.number_initial_go_terms_p for gene_desc in self.data if gene_desc.description is not None])
+        self.general_stats.average_number_initial_go_terms_c = np.average(
+            [gene_desc.stats.number_initial_go_terms_c for gene_desc in self.data if gene_desc.description is not None])
+        self.general_stats.average_number_final_go_terms_f = np.average(
+            [gene_desc.stats.number_final_go_terms_f for gene_desc in self.data if gene_desc.description is not None])
+        self.general_stats.average_number_final_go_terms_p = np.average(
+            [gene_desc.stats.number_final_go_terms_p for gene_desc in self.data if gene_desc.description is not None])
+        self.general_stats.average_number_final_go_terms_c = np.average(
+            [gene_desc.stats.number_final_go_terms_c for gene_desc in self.data if gene_desc.description is not None])
+        self.general_stats.average_number_initial_do_terms = np.average(
+            [gene_desc.stats.number_initial_do_terms for gene_desc in self.data if gene_desc.description is not None])
+        self.general_stats.average_number_final_do_terms = np.average(
+            [gene_desc.stats.number_final_do_terms for gene_desc in self.data if gene_desc.description is not None])
+        self.general_stats.total_number_of_genes = len(self.data)
+        self.general_stats.number_genes_with_non_null_description = len([gene_desc for gene_desc in self.data if
+                                                                         gene_desc.description is not None])
+        self.general_stats.number_genes_with_non_null_go_description = len([gene_desc for gene_desc in self.data if
+                                                                            gene_desc.go_description is not None])
+        self.general_stats.number_genes_with_null_go_description = len([gene_desc for gene_desc in self.data if
+                                                                        gene_desc.go_description is None])
+        self.general_stats.number_genes_with_non_null_go_function_description = \
+            len([gene_desc for gene_desc in self.data if gene_desc.go_function_description is not None])
+        self.general_stats.number_genes_with_non_null_go_process_description = \
+            len([gene_desc for gene_desc in self.data if gene_desc.go_process_description is not None])
+        self.general_stats.number_genes_with_non_null_go_component_description = \
+            len([gene_desc for gene_desc in self.data if gene_desc.go_component_description is not None])
+        self.general_stats.number_genes_with_more_than_3_initial_go_terms = \
+            len([gd for gd in self.data if any(num > 3 for num in gd.stats.number_initial_go_terms.values())])
+        self.general_stats.number_genes_with_non_null_do_description = len([gene_desc for gene_desc in self.data if
+                                                                            gene_desc.do_description is not None])
+        self.general_stats.number_genes_with_null_do_description = len([gene_desc for gene_desc in self.data if
+                                                                        gene_desc.do_description is None])
+        self.general_stats.number_genes_with_more_than_3_initial_do_terms = \
+            len([gene_desc for gene_desc in self.data if gene_desc.stats.number_initial_do_terms > 3])
+        self.general_stats.number_genes_with_final_do_terms_covering_multiple_initial_terms = \
+            sum([gene_desc.stats.number_final_do_term_covering_multiple_initial_do_terms_present
+                 for gene_desc in self.data if gene_desc.do_description is not None])
+        self.general_stats.average_number_go_annotations = np.average(
+            [gene_desc.stats.total_number_go_annotations for gene_desc in self.data if gene_desc.description is not
+             None])
+        self.general_stats.average_number_do_annotations = np.average(
+            [gene_desc.stats.total_number_do_annotations for gene_desc in self.data if gene_desc.description is not
+             None])
 
 
 class JsonGDWriter(DescriptionsWriter):
diff --git a/generate_desc_from_raw_files.py b/generate_desc_from_raw_files.py
index 208d602..884d7c5 100755
--- a/generate_desc_from_raw_files.py
+++ b/generate_desc_from_raw_files.py
@@ -89,89 +89,14 @@ def main():
         desc_writer = JsonGDWriter()
         for gene in df.get_gene_data():
             logging.debug("processing gene " + gene.name)
-            gene_desc = GeneDesc(gene_id=gene.id, gene_name=gene.name,
-                                 publications=", ".join([annot["publication"] for annot in df.get_annotations_for_gene(
-                                     gene.id, annot_type=DataType.GO,
-                                     priority_list=conf_parser.get_go_evidence_groups_priority_list())]),
-                                 refs=", ".join([annot["refs"] for annot in df.get_annotations_for_gene(
-                                     gene.id, annot_type=DataType.GO,
-                                     priority_list=conf_parser.get_go_evidence_groups_priority_list())]),
-                                 species=species[organism]["full_name"],
-                                 release_version=conf_parser.get_release("wb_data_fetcher"))
-            joined_sent = []
-
-            best_orthologs, selected_orth_name = df.get_best_orthologs_for_gene(
-                gene.id, orth_species_full_name=orthologs_sp_fullname)
-            if best_orthologs:
-                orth_sent = generate_ortholog_sentence(best_orthologs, selected_orth_name, human_genes_props)
-                if orth_sent:
-                    joined_sent.append(orth_sent)
-            go_sent_generator = SentenceGenerator(
-                annotations=df.get_annotations_for_gene(gene_id=gene.id, annot_type=DataType.GO,
-                                                        priority_list=conf_parser.get_go_annotations_priority(),
-                                                        desc_stats=gene_desc.stats),
-                ontology=df.go_ontology, **go_sent_gen_common_props)
-            func_sent = " and ".join([sentence.text for sentence in go_sent_generator.get_sentences(
-                aspect='F', merge_groups_with_same_prefix=True, keep_only_best_group=True, )])
-            if func_sent:
-                joined_sent.append(func_sent)
-            contributes_to_func_sent = " and ".join([sentence.text for sentence in go_sent_generator.get_sentences(
-                aspect='F', qualifier='contributes_to', merge_groups_with_same_prefix=True,
-                keep_only_best_group=True, desc_stats=gene_desc.stats, **go_sent_common_props)])
-            if contributes_to_func_sent:
-                joined_sent.append(contributes_to_func_sent)
-            proc_sent = " and ".join([sentence.text for sentence in go_sent_generator.get_sentences(
-                aspect='P', merge_groups_with_same_prefix=True, keep_only_best_group=True,
-                desc_stats=gene_desc.stats, **go_sent_common_props)])
-            if proc_sent:
-                joined_sent.append(proc_sent)
-            comp_sent = " and ".join([sentence.text for sentence in go_sent_generator.get_sentences(
-                aspect='C', merge_groups_with_same_prefix=True, keep_only_best_group=True,
-                desc_stats=gene_desc.stats, **go_sent_common_props)])
-            if comp_sent:
-                joined_sent.append(comp_sent)
-            colocalizes_with_comp_sent = " and ".join([sentence.text for sentence in go_sent_generator.get_sentences(
-                aspect='C', qualifier='colocalizes_with', merge_groups_with_same_prefix=True,
-                desc_stats=gene_desc.stats, keep_only_best_group=True, **go_sent_common_props)])
-            if colocalizes_with_comp_sent:
-                joined_sent.append(colocalizes_with_comp_sent)
-
-            do_sentence_generator = SentenceGenerator(
-                df.get_annotations_for_gene(gene_id=gene.id, annot_type=DataType.DO,
-                                            priority_list=conf_parser.get_do_annotations_priority(),
-                                            desc_stats=gene_desc.stats),
-                ontology=df.do_ontology, **do_sent_gen_common_prop)
-            disease_sent = "; ".join([sentence.text for sentence in do_sentence_generator.get_sentences(
-                aspect='D', merge_groups_with_same_prefix=True, keep_only_best_group=False, desc_stats=gene_desc.stats,
-                **do_sent_common_props)])
-            if disease_sent:
-                joined_sent.append(disease_sent)
-
-            if conf_parser.get_data_fetcher() == "wb_data_fetcher" and "main_sister_species" in species[organism] and \
-                    species[organism]["main_sister_species"] and df.get_best_orthologs_for_gene(
-                    gene.id, orth_species_full_name=[sister_sp_fullname], sister_species_data_fetcher=sister_df,
-                    ecode_priority_list=["EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP", "HGI",
-                                         "HEP"])[0]:
-                best_ortholog = df.get_best_orthologs_for_gene(
-                    gene.id, orth_species_full_name=[sister_sp_fullname], sister_species_data_fetcher=sister_df,
-                    ecode_priority_list=["EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP", "HGI",
-                                         "HEP"])[0][0]
-                sister_sentences_generator = SentenceGenerator(sister_df.get_annotations_for_gene(
-                    annot_type=DataType.GO, gene_id="WB:" + best_ortholog[0],
-                    priority_list=("EXP", "IDA", "IPI", "IMP", "IGI", "IEP", "HTP", "HDA", "HMP", "HGI", "HEP"),
-                    desc_stats=gene_desc.stats), ontology=df.go_ontology, **go_sent_gen_common_props)
-                sister_proc_sent = " and ".join([sentence.text for sentence in sister_sentences_generator.get_sentences(
-                    aspect='P', merge_groups_with_same_prefix=True, keep_only_best_group=True, **go_sent_common_props)])
-                if sister_proc_sent:
-                    joined_sent.append("in " + species[species[organism]["main_sister_species"]]["name"] + ", " +
-                                       best_ortholog[1] + " " + sister_proc_sent)
-            if len(joined_sent) > 0:
-                desc = "; ".join(joined_sent) + "."
-                if len(desc) > 0:
-                    gene_desc.description = desc[0].upper() + desc[1:]
-            else:
-                gene_desc.description = None
-            desc_writer.add_gene_desc(gene_desc)
+            compose_wormbase_description(gene=gene, conf_parser=conf_parser, species=species, organism=organism, df=df,
+                                         orthologs_sp_fullname=orthologs_sp_fullname,
+                                         go_sent_gen_common_props=go_sent_gen_common_props,
+                                         go_sent_common_props=go_sent_common_props, human_genes_props=human_genes_props,
+                                         do_sent_gen_common_prop=do_sent_gen_common_prop,  # DO props, not the GO ones
+                                         do_sent_common_props=do_sent_common_props,
+                                         sister_sp_fullname=sister_sp_fullname, sister_df=sister_df,
+                                         desc_writer=desc_writer)
         desc_writer.write(os.path.join(conf_parser.get_genedesc_output_dir(conf_parser.get_genedesc_writer()),
                                        organism + "_with_stats.json"), pretty=True, include_single_gene_stats=True)
         desc_writer.write(os.path.join(conf_parser.get_genedesc_output_dir(conf_parser.get_genedesc_writer()),