Merge branch 'develop'

alliance-genome · Jan 3, 2020 · 9db3559
2 parents ea69a72 + 3fbd06e
commit 9db3559
Show file tree

Hide file tree

Showing 13 changed files with 207,539 additions and 57 deletions.
diff --git a/genedescriptions/commons.py b/genedescriptions/commons.py
@@ -7,6 +7,7 @@
 @dataclass
 class Sentence:
     prefix: str
+    initial_terms_ids: List[str]
     terms_ids: List[str]
     postfix: str
     text: str

diff --git a/genedescriptions/descriptions_generator.py b/genedescriptions/descriptions_generator.py
@@ -21,15 +21,20 @@ def get_description(self):
         return " and ".join([sentence.text for sentence in self.sentences])
 
     def get_ids(self, experimental_only: bool = False):
-        return [term_id for sentence in self.sentences for term_id in sentence.terms_ids if not experimental_only or
-                sentence.evidence_group.startswith("EXPERIMENTAL")]
+        return list({term_id for sentence in self.sentences for term_id in sentence.terms_ids if not experimental_only or
+                     sentence.evidence_group.startswith("EXPERIMENTAL")})
+
+    def get_initial_ids(self, experimental_only: bool = False):
+        return list({term_id for sentence in self.sentences for term_id in sentence.initial_terms_ids if not
+                     experimental_only or sentence.evidence_group.startswith("EXPERIMENTAL")})
 
     def contains_sentences(self):
         return len(self.sentences) > 0
 
 
 class SentenceMerger(object):
     def __init__(self):
+        self.initial_terms_ids = set()
         self.postfix_list = []
         self.terms_ids = set()
         self.term_postfix_dict = {}
@@ -124,6 +129,7 @@ def get_module_sentences(self, aspect: str, qualifier: str = '',
                     and len(trimming_result.final_terms) > 0:
                 sentences.append(
                     _get_single_sentence(
+                        initial_terms_ids=list(terms),
                         node_ids=trimming_result.final_terms, ontology=self.ontology, aspect=aspect,
                         evidence_group=evidence_group, qualifier=qualifier,
                         prepostfix_sentences_map=self.prepostfix_sentences_map,
@@ -233,6 +239,7 @@ def merge_sentences_with_same_prefix(self, sentences: List[Sentence], remove_par
             merged_sentences[prefix].aspect = sentence.aspect
             merged_sentences[prefix].qualifier = sentence.qualifier
             merged_sentences[prefix].terms_ids.update(sentence.terms_ids)
+            merged_sentences[prefix].initial_terms_ids.update(sentence.initial_terms_ids)
             for term in sentence.terms_ids:
                 merged_sentences[prefix].term_postfix_dict[term] = self.prepostfix_sentences_map[
                     (sentence.aspect, sentence.evidence_group, sentence.qualifier)][1]
@@ -252,7 +259,8 @@ def merge_sentences_with_same_prefix(self, sentences: List[Sentence], remove_par
                     logger.debug("Removed " + str(len(sent_merger.terms_ids) - len(terms_no_ancestors)) +
                                  " parents from terms while merging sentences with same prefix")
                     sent_merger.terms_ids = terms_no_ancestors
-        return [Sentence(prefix=prefix, terms_ids=list(sent_merger.terms_ids),
+        return [Sentence(prefix=prefix, initial_terms_ids=list(sent_merger.initial_terms_ids),
+                         terms_ids=list(sent_merger.terms_ids),
                          postfix=OntologySentenceGenerator.merge_postfix_phrases(sent_merger.postfix_list),
                          text=compose_sentence(prefix=prefix,
                                                term_names=[self.ontology.label(node, id_if_null=True) for node in

diff --git a/genedescriptions/gene_description.py b/genedescriptions/gene_description.py
@@ -2,7 +2,7 @@
 
 from typing import List
 
-from genedescriptions.commons import Module
+from genedescriptions.commons import Module, Sentence
 from genedescriptions.config_parser import GenedescConfigParser
 from genedescriptions.descriptions_generator import OntologySentenceGenerator, ModuleSentences
 from genedescriptions.sentence_generation_functions import concatenate_words_with_oxford_comma
@@ -185,52 +185,49 @@ def _get_module_initial_set(aspect: str, sentence_generator: OntologySentenceGen
                     (aspect, additional_qualifier)].items() for elem in sets if (aspect, key, additional_qualifier) in
                  sentence_generator.prepostfix_sentences_map]))
 
-    def set_initial_stats(self, module: Module, sentence_generator: OntologySentenceGenerator,
-                          sentence_generator_exp_only: OntologySentenceGenerator = None):
+    def set_or_update_initial_stats(self, module: Module, sent_generator: OntologySentenceGenerator,
+                                    module_sentences: ModuleSentences):
         """set initial stats for a specific module
 
         Args:
+            sent_generator: the main sentence generator
             module: the module
-            sentence_generator: the main sentence generator
-            sentence_generator_exp_only: sentence generator with experimental evidence codes only
+            module_sentences (ModuleSentences): the module sentences
 
         Returns:
 
         """
         if module == Module.GO_FUNCTION:
-            self.stats.set_initial_go_ids_f = self._get_module_initial_set(
-                aspect="F", additional_qualifier="contributes_to", sentence_generator=sentence_generator)
-            self.stats.set_initial_experimental_go_ids_f = self._get_module_initial_set(
-                aspect="F", additional_qualifier="contributes_to", sentence_generator=sentence_generator_exp_only)
+            self.stats.set_initial_go_ids_f = self._get_merged_ids(module_sentences.get_initial_ids(),
+                                                                   self.stats.set_initial_go_ids_f)
+            self.stats.set_initial_experimental_go_ids_f = self._get_merged_ids(
+                module_sentences.get_initial_ids(experimental_only=True), self.stats.set_initial_experimental_go_ids_f)
         elif module == Module.GO_COMPONENT:
-            self.stats.set_initial_go_ids_c = self._get_module_initial_set(
-                aspect="C", additional_qualifier="colocalizes_with", sentence_generator=sentence_generator)
-            self.stats.set_initial_experimental_go_ids_c = self._get_module_initial_set(
-                aspect="C", additional_qualifier="colocalizes_with", sentence_generator=sentence_generator_exp_only)
+            self.stats.set_initial_go_ids_c = self._get_merged_ids(module_sentences.get_initial_ids(),
+                                                                   self.stats.set_initial_go_ids_c)
+            self.stats.set_initial_experimental_go_ids_c = self._get_merged_ids(
+                module_sentences.get_initial_ids(experimental_only=True), self.stats.set_initial_experimental_go_ids_c)
         elif module == Module.GO_PROCESS:
-            self.stats.set_initial_go_ids_p = self._get_module_initial_set(
-                aspect="P", sentence_generator=sentence_generator)
-            self.stats.set_initial_experimental_go_ids_p = self._get_module_initial_set(
-                aspect="P", sentence_generator=sentence_generator_exp_only)
+            self.stats.set_initial_go_ids_p = self._get_merged_ids(module_sentences.get_initial_ids(),
+                                                                   self.stats.set_initial_go_ids_p)
+            self.stats.set_initial_experimental_go_ids_p = self._get_merged_ids(
+                module_sentences.get_initial_ids(experimental_only=True), self.stats.set_initial_experimental_go_ids_p)
         elif module == Module.EXPRESSION:
-            self.stats.set_initial_expression_ids = self._get_module_initial_set(
-                aspect="A", main_qualifier="Verified", sentence_generator=sentence_generator)
+            self.stats.set_initial_expression_ids = self._get_merged_ids(module_sentences.get_initial_ids(),
+                                                                         self.stats.set_initial_expression_ids)
         elif module == Module.DO_EXPERIMENTAL:
-            self.stats.total_number_do_exp_bio_annotations += len(sentence_generator.gene_annots)
-            self.stats.set_initial_do_ids = self._get_merged_ids(
-                [term_id for terms in sentence_generator.terms_groups.values() for tvalues in terms.values() for
-                 term_id in tvalues], self.stats.set_initial_do_ids)
+            self.stats.total_number_do_exp_bio_annotations += len(sent_generator.gene_annots)
+            self.stats.set_initial_do_ids = self._get_merged_ids(module_sentences.get_initial_ids(),
+                                                                 self.stats.set_initial_do_ids)
         elif module == Module.DO_BIOMARKER:
-            self.stats.total_number_do_exp_bio_annotations += len(sentence_generator.gene_annots)
-            self.stats.set_initial_do_ids = self._get_merged_ids(
-                [term_id for terms in sentence_generator.terms_groups.values() for tvalues in terms.values() for term_id
-                 in tvalues], self.stats.set_initial_do_ids)
+            self.stats.total_number_do_exp_bio_annotations += len(sent_generator.gene_annots)
+            self.stats.set_initial_do_ids = self._get_merged_ids(module_sentences.get_initial_ids(),
+                                                                 self.stats.set_initial_do_ids)
         elif module == Module.DO_ORTHOLOGY:
-            self.stats.total_number_do_via_orth_annotations = len(sentence_generator.gene_annots)
-            self.stats.set_initial_do_ids = self._get_merged_ids(
-                [term_id for terms in sentence_generator.terms_groups.values() for tvalues in terms.values() for term_id
-                 in tvalues], self.stats.set_initial_do_ids)
+            self.stats.total_number_do_via_orth_annotations += len(sent_generator.gene_annots)
+            self.stats.set_initial_do_ids = self._get_merged_ids(module_sentences.get_initial_ids(),
+                                                                 self.stats.set_initial_do_ids)
         self.stats.total_number_do_annotations = self.stats.total_number_do_exp_bio_annotations + \
                                                  self.stats.total_number_do_via_orth_annotations
         if module == Module.GO_PROCESS or module == Module.GO_FUNCTION or module == Module.GO_COMPONENT:
-            self.stats.total_number_go_annotations = len(sentence_generator.gene_annots)
+            self.stats.total_number_go_annotations = len(sent_generator.gene_annots)
diff --git a/genedescriptions/ontology_tools.py b/genedescriptions/ontology_tools.py
@@ -62,7 +62,7 @@ def get_all_common_ancestors(node_ids: List[str], ontology: Ontology, min_distan
                 for basic_prop_val in onto_anc["meta"]["basicPropertyValues"]:
                     if basic_prop_val["pred"] == "OIO:hasOBONamespace":
                         onto_anc_root = basic_prop_val["val"]
-            if onto_anc["depth"] >= min_distance_from_root and (
+            if (ancestor in node_ids or onto_anc["depth"] >= min_distance_from_root) and (
                 not onto_anc_root or onto_anc_root == common_root) and (not nodeids_blacklist or ancestor not in
                                                                         nodeids_blacklist):
                 ancestors[ancestor].append(node_id)
@@ -104,6 +104,7 @@ def set_all_depths_in_subgraph(ontology: Ontology, root_id: str, relations: List
 
 
 def set_ic_ontology_struct(ontology: Ontology, relations: List[str] = None):
+    logger.info("Setting information content values based on ontology structure")
     roots = ontology.get_roots(relations=relations)
     for root_id in roots:
         if "num_subsumers" not in ontology.node(root_id) and ("type" not in ontology.node(root_id) or
@@ -121,6 +122,7 @@ def set_ic_ontology_struct(ontology: Ontology, relations: List[str] = None):
         if "type" not in ontology.node(root_id) or ontology.node_type(root_id) == "CLASS":
             _set_information_content_in_subgraph(ontology=ontology, root_id=root_id,
                                                  maxleaves=ontology.node(root_id)["num_leaves"], relations=relations)
+    logger.info("Finished setting information content values")
 
 
 def set_ic_annot_freq(ontology: Ontology, annotations: AssociationSet):

diff --git a/genedescriptions/precanned_modules.py b/genedescriptions/precanned_modules.py
@@ -50,12 +50,16 @@ def set_gene_ontology_module(dm: DataManager, conf_parser: GenedescConfigParser,
             module_sentences=comp_module_sentences, module=Module.GO_COMPONENT)
     gene_desc.set_or_extend_module_description_and_final_stats(module_sentences=colocalizes_with_module_sentences,
                                                                module=Module.GO_COMPONENT)
-    gene_desc.set_initial_stats(module=Module.GO_FUNCTION, sentence_generator=go_sent_generator,
-                                sentence_generator_exp_only=go_sent_generator_exp)
-    gene_desc.set_initial_stats(module=Module.GO_PROCESS, sentence_generator=go_sent_generator,
-                                sentence_generator_exp_only=go_sent_generator_exp)
-    gene_desc.set_initial_stats(module=Module.GO_COMPONENT, sentence_generator=go_sent_generator,
-                                sentence_generator_exp_only=go_sent_generator_exp)
+    gene_desc.set_or_update_initial_stats(module=Module.GO_FUNCTION, sent_generator=go_sent_generator,
+                                          module_sentences=contributes_to_module_sentences)
+    gene_desc.set_or_update_initial_stats(module=Module.GO_FUNCTION, sent_generator=go_sent_generator,
+                                          module_sentences=func_module_sentences)
+    gene_desc.set_or_update_initial_stats(module=Module.GO_PROCESS, sent_generator=go_sent_generator,
+                                          module_sentences=proc_module_sentences)
+    gene_desc.set_or_update_initial_stats(module=Module.GO_COMPONENT, sent_generator=go_sent_generator,
+                                          module_sentences=colocalizes_with_module_sentences)
+    gene_desc.set_or_update_initial_stats(module=Module.GO_COMPONENT, sent_generator=go_sent_generator,
+                                          module_sentences=comp_module_sentences)
 
 
 def set_disease_module(df: DataManager, conf_parser: GenedescConfigParser, gene_desc: GeneDescription, gene: Gene,
@@ -81,9 +85,12 @@ def set_disease_module(df: DataManager, conf_parser: GenedescConfigParser, gene_
         aspect='D', merge_groups_with_same_prefix=True, keep_only_best_group=False)
     gene_desc.set_or_extend_module_description_and_final_stats(module=Module.DO_ORTHOLOGY,
                                                                module_sentences=disease_via_orth_module_sentences)
-    gene_desc.set_initial_stats(module=Module.DO_EXPERIMENTAL, sentence_generator=do_sentence_exp_generator)
-    gene_desc.set_initial_stats(module=Module.DO_BIOMARKER, sentence_generator=do_sentence_bio_generator)
-    gene_desc.set_initial_stats(module=Module.DO_ORTHOLOGY, sentence_generator=do_via_orth_sentence_generator)
+    gene_desc.set_or_update_initial_stats(module=Module.DO_EXPERIMENTAL, sent_generator=do_sentence_exp_generator,
+                                          module_sentences=disease_exp_module_sentences)
+    gene_desc.set_or_update_initial_stats(module=Module.DO_BIOMARKER, sent_generator=do_sentence_bio_generator,
+                                          module_sentences=disease_bio_module_sentences)
+    gene_desc.set_or_update_initial_stats(module=Module.DO_ORTHOLOGY, sent_generator=do_via_orth_sentence_generator,
+                                          module_sentences=disease_via_orth_module_sentences)
 
 
 def set_expression_module(df: DataManager, conf_parser: GenedescConfigParser, gene_desc: GeneDescription, gene: Gene):
@@ -93,7 +100,8 @@ def set_expression_module(df: DataManager, conf_parser: GenedescConfigParser, ge
         aspect='A', qualifier="Verified", merge_groups_with_same_prefix=True, keep_only_best_group=False)
     gene_desc.set_or_extend_module_description_and_final_stats(module_sentences=expression_module_sentences,
                                                                module=Module.EXPRESSION)
-    gene_desc.set_initial_stats(module=Module.EXPRESSION, sentence_generator=expr_sentence_generator)
+    gene_desc.set_or_update_initial_stats(module=Module.EXPRESSION, sent_generator=expr_sentence_generator,
+                                          module_sentences=expression_module_sentences)
 
 
 def set_alliance_human_orthology_module(orthologs: List[List[str]], gene_desc: GeneDescription,

diff --git a/genedescriptions/sentence_generation_functions.py b/genedescriptions/sentence_generation_functions.py
@@ -53,8 +53,9 @@ def compose_sentence(prefix: str, additional_prefix: str, term_names: List[str],
                                                             separator=config.get_terms_delimiter()) + postfix
 
 
-def _get_single_sentence(node_ids: List[str], ontology: Ontology, aspect: str, evidence_group: str, qualifier: str,
-                         prepostfix_sentences_map: Dict[Tuple[str, str, str], Tuple[str, str]],
+def _get_single_sentence(initial_terms_ids: List[str], node_ids: List[str], ontology: Ontology, aspect: str,
+                         evidence_group: str, qualifier: str, prepostfix_sentences_map: Dict[Tuple[str, str, str],
+                                                                                             Tuple[str, str]],
                          config: GenedescConfigParser, terms_merged: bool = False, add_others: bool = False,
                          truncate_others_generic_word: str = "several",
                          truncate_others_aspect_words: Dict[str, str] = None,
@@ -94,7 +95,7 @@ def _get_single_sentence(node_ids: List[str], ontology: Ontology, aspect: str, e
         term_labels = [ontology.label(node_id, id_if_null=True) for node_id in node_ids]
         if ancestors_with_multiple_children is None:
             ancestors_with_multiple_children = set()
-        return Sentence(prefix=prefix, terms_ids=node_ids, postfix=postfix,
+        return Sentence(prefix=prefix, initial_terms_ids=initial_terms_ids, terms_ids=node_ids, postfix=postfix,
                         text=compose_sentence(prefix=prefix, term_names=term_labels, postfix=postfix,
                                               additional_prefix=additional_prefix,
                                               ancestors_with_multiple_children=ancestors_with_multiple_children,

diff --git a/genedescriptions/stats.py b/genedescriptions/stats.py
@@ -38,11 +38,10 @@ def __init__(self):
     @staticmethod
     def _get_num_covered_nodes(set_initial_terms, set_final_terms, ontology):
         num_covered_nodes = 0
-        final_t_ancestors = {final_term: ontology.ancestors(final_term) for final_term in set_final_terms}
         for initial_term in set_initial_terms:
             initial_t_ancestors = set(ontology.ancestors(initial_term, reflexive=True))
             for final_term in set_final_terms:
-                if final_term in initial_t_ancestors or initial_term in final_t_ancestors[final_term]:
+                if final_term in initial_t_ancestors:
                     num_covered_nodes += 1
                     break
         return num_covered_nodes

diff --git a/genedescriptions/trimming.py b/genedescriptions/trimming.py
@@ -137,8 +137,8 @@ def trim(self, node_ids: List[str], max_num_nodes: int = 3, min_distance_from_ro
                     best_cands = [(cand_id, candidates_dict[cand_id][1])]
                 for best_cand in best_cands:
                     selected_cands_ids.append(best_cand[0])
-                    for node_id in candidates_dict[best_cand[0]][1]:
-                        cands_ids_to_process -= set(node_to_cands_map[node_id])
+                    cands_ids_to_process = {cand_id for cand_id in cands_ids_to_process if best_cand[0] not in
+                                            self.ontology.ancestors(cand_id, reflexive=True)}
             else:
                 selected_cands_ids.append(cand_id)
         if len(selected_cands_ids) <= max_num_nodes:

diff --git a/tests/config_test.yml b/tests/config_test.yml
@@ -668,12 +668,12 @@ expression_sentences_options:
     - "WBbt:0007849"
     - "WBbt:0005738"
   rename_terms:
-    "^neuron$": "nervous system"
+    "^neuron$": "neurons"
     "(.*)([a-z]+ )neuron$": "\\1\\2neurons"
-    "^germ cell$": "germ cells"
-    "^hypodermis$": "the hypodermis"
     "(.*)hermaphrodite-specific$": "\\1hermaphrodite"
     "(.*)male-specific$": "\\1male"
+    "^ganglion$": "ganglia"
+    "^neuroblast$": "neuroblasts"
   evidence_codes:
     IDA:
       group: EXPERIMENTAL