fix(templates): new templates for GAF 2.2

alliance-genome · Dec 10, 2020 · 79760e1 · 79760e1
1 parent 76c753e
commit 79760e1
Showing 4 changed files with 116 additions and 50 deletions.
diff --git a/genedescriptions/config_parser.py b/genedescriptions/config_parser.py
@@ -109,19 +109,21 @@ def _get_module_property_name(prop: ConfigModuleProperty):
     def get_prepostfix_sentence_map(self, module: Module, special_cases_only: bool = False, humans: bool = False):
         module_name = self._get_module_name(module)
         if special_cases_only:
-            return {(prepost["aspect"], prepost["group"], prepost["qualifier"]): [
+            return {prepost["aspect"] + "|" + prepost["group"] + "|" + prepost["qualifier"]: [  # key: "aspect|group|qualifier"
                 (sp_case["id"], sp_case["match_regex"], sp_case["prefix"], sp_case["postfix"])
                 for sp_case in prepost["special_cases"]]
                 for prepost in self.config[module_name]["prepostfix_sentences_map"] if
                 "special_cases" in prepost and prepost["special_cases"]}
         else:
-            prepost_map = {(prepost["aspect"], prepost["group"], prepost["qualifier"]): (
+            prepost_map = {prepost["aspect"] + "|" + prepost["group"] + "|" + prepost["qualifier"]: (
                 prepost["prefix"], prepost["postfix"]) for prepost in self.config[module_name][
                 "prepostfix_sentences_map_humans" if humans else "prepostfix_sentences_map"]}
             special_cases_only = self.get_prepostfix_sentence_map(module=module, special_cases_only=True, humans=humans)
             for key, scs in special_cases_only.items():
+                key_arr = key.split("|")  # [aspect, group, qualifier]
                 for special_case in scs:
-                    prepost_map[(key[0], key[1] + str(special_case[0]), key[2])] = (special_case[2], special_case[3])
+                    prepost_map[key_arr[0] + "|" + key_arr[1] + str(special_case[0]) + "|" + key_arr[2]] = \
+                        (special_case[2], special_case[3])  # (prefix, postfix) under group name + special-case id
             return prepost_map
 
     def get_annotations_priority(self, module: Module) -> List[str]:

diff --git a/genedescriptions/descriptions_generator.py b/genedescriptions/descriptions_generator.py
@@ -18,7 +18,7 @@ def __init__(self, sentences):
         self.sentences = sentences
 
     def get_description(self):
-        return " and ".join([sentence.text for sentence in self.sentences])
+        return ". ".join([sentence.text[0].upper() + sentence.text[1:] for sentence in self.sentences])
 
     def get_ids(self, experimental_only: bool = False):
         return list({term_id for sentence in self.sentences for term_id in sentence.terms_ids if not experimental_only or
@@ -88,11 +88,10 @@ def set_terms_groups(self, module, config, limit_to_group, humans):
                     aspect = annotation["aspect"]
                     ev_group = evidence_codes_groups_map[annotation["evidence"]["type"]]
                     qualifier = "_".join(sorted(annotation["qualifiers"])) if "qualifiers" in annotation else ""
-                    if prepostfix_special_cases_sent_map and (aspect, ev_group, qualifier) in \
-                        prepostfix_special_cases_sent_map:
-                        for special_case in prepostfix_special_cases_sent_map[(aspect, ev_group, qualifier)]:
-                            if re.match(re.escape(special_case[1]), self.ontology.label(annotation["object"]["id"],
-                                                                                        id_if_null=True)):
+                    if prepostfix_special_cases_sent_map and aspect + "|" + ev_group + "|" + qualifier in \
+                       prepostfix_special_cases_sent_map:
+                        for special_case in prepostfix_special_cases_sent_map[aspect + "|" + ev_group + "|" + qualifier]:
+                            if re.match(special_case[1], self.ontology.label(annotation["object"]["id"], id_if_null=True)):
                                 ev_group = evidence_codes_groups_map[annotation["evidence"]["type"]] + \
                                            str(special_case[0])
                                 if ev_group not in self.evidence_groups_priority_list:
@@ -121,29 +120,31 @@ def get_module_sentences(self, aspect: str, qualifier: str = '',
         dist_root = self.config.get_module_property(module=self.module, prop=ConfigModuleProperty.DISTANCE_FROM_ROOT)
         add_mul_comanc = self.config.get_module_property(module=self.module,
                                                          prop=ConfigModuleProperty.ADD_MULTIPLE_TO_COMMON_ANCEST)
+        best_group = ""
         for terms, evidence_group, priority in sorted([(t, eg, evidence_group_priority[eg]) for eg, t in
                                                        self.terms_groups[(aspect, qualifier)].items()],
                                                       key=lambda x: x[2]):
-            trimming_result = self.reduce_num_terms(terms=terms, min_distance_from_root=dist_root[aspect])
-            if (aspect, evidence_group, qualifier) in self.prepostfix_sentences_map \
-                    and len(trimming_result.final_terms) > 0:
-                sentences.append(
-                    _get_single_sentence(
-                        initial_terms_ids=list(terms),
-                        node_ids=trimming_result.final_terms, ontology=self.ontology, aspect=aspect,
-                        evidence_group=evidence_group, qualifier=qualifier,
-                        prepostfix_sentences_map=self.prepostfix_sentences_map,
-                        terms_merged=False, trimmed=trimming_result.trimming_applied,
-                        add_others=trimming_result.partial_coverage,
-                        truncate_others_generic_word=self.config.get_module_property(
-                            module=self.module, prop=ConfigModuleProperty.CUTOFF_SEVERAL_WORD),
-                        truncate_others_aspect_words=self.config.get_module_property(
-                            module=self.module, prop=ConfigModuleProperty.CUTOFF_SEVERAL_CATEGORY_WORD),
-                        ancestors_with_multiple_children=trimming_result.multicovering_nodes if add_mul_comanc else
-                        None, rename_cell=rename_cell, config=self.config,
-                        put_anatomy_male_at_end=True if aspect == 'A' else False))
-                if keep_only_best_group:
-                    return ModuleSentences(sentences)
+            if not best_group or re.match(best_group + r"([0-9]*)?", evidence_group):
+                trimming_result = self.reduce_num_terms(terms=terms, min_distance_from_root=dist_root[aspect])
+                if aspect + "|" + evidence_group + "|" + qualifier in self.prepostfix_sentences_map \
+                        and len(trimming_result.final_terms) > 0:
+                    sentences.append(
+                        _get_single_sentence(
+                            initial_terms_ids=list(terms),
+                            node_ids=trimming_result.final_terms, ontology=self.ontology, aspect=aspect,
+                            evidence_group=evidence_group, qualifier=qualifier,
+                            prepostfix_sentences_map=self.prepostfix_sentences_map,
+                            terms_merged=False, trimmed=trimming_result.trimming_applied,
+                            add_others=trimming_result.partial_coverage,
+                            truncate_others_generic_word=self.config.get_module_property(
+                                module=self.module, prop=ConfigModuleProperty.CUTOFF_SEVERAL_WORD),
+                            truncate_others_aspect_words=self.config.get_module_property(
+                                module=self.module, prop=ConfigModuleProperty.CUTOFF_SEVERAL_CATEGORY_WORD),
+                            ancestors_with_multiple_children=trimming_result.multicovering_nodes if add_mul_comanc else
+                            None, rename_cell=rename_cell, config=self.config,
+                            put_anatomy_male_at_end=True if aspect == 'A' else False))
+                    if keep_only_best_group and not best_group:
+                        best_group = evidence_group
         if merge_groups_with_same_prefix:
             remove_parents = self.config.get_module_property(module=self.module,
                                                              prop=ConfigModuleProperty.DEL_PARENTS_IF_CHILD)
@@ -232,17 +233,17 @@ def merge_sentences_with_same_prefix(self, sentences: List[Sentence], remove_par
         """
         merged_sentences = defaultdict(SentenceMerger)
         for sentence in sentences:
-            prefix = self.prepostfix_sentences_map[(sentence.aspect, sentence.evidence_group, sentence.qualifier)][0]
-            merged_sentences[prefix].postfix_list.append(self.prepostfix_sentences_map[(sentence.aspect,
-                                                                                        sentence.evidence_group,
-                                                                                        sentence.qualifier)][1])
+            prefix = self.prepostfix_sentences_map[sentence.aspect + "|" + sentence.evidence_group + "|" + sentence.qualifier][0]
+            merged_sentences[prefix].postfix_list.append(self.prepostfix_sentences_map[sentence.aspect + "|" +
+                                                                                       sentence.evidence_group + "|" +
+                                                                                       sentence.qualifier][1])
             merged_sentences[prefix].aspect = sentence.aspect
             merged_sentences[prefix].qualifier = sentence.qualifier
             merged_sentences[prefix].terms_ids.update(sentence.terms_ids)
             merged_sentences[prefix].initial_terms_ids.update(sentence.initial_terms_ids)
             for term in sentence.terms_ids:
                 merged_sentences[prefix].term_postfix_dict[term] = self.prepostfix_sentences_map[
-                    (sentence.aspect, sentence.evidence_group, sentence.qualifier)][1]
+                    sentence.aspect + "|" + sentence.evidence_group + "|" + sentence.qualifier][1]
             merged_sentences[prefix].evidence_groups.append(sentence.evidence_group)
             for term in sentence.terms_ids:
                 merged_sentences[prefix].term_evgroup_dict[term] = sentence.evidence_group

diff --git a/genedescriptions/sentence_generation_functions.py b/genedescriptions/sentence_generation_functions.py
@@ -54,8 +54,7 @@ def compose_sentence(prefix: str, additional_prefix: str, term_names: List[str],
 
 
 def _get_single_sentence(initial_terms_ids: List[str], node_ids: List[str], ontology: Ontology, aspect: str,
-                         evidence_group: str, qualifier: str, prepostfix_sentences_map: Dict[Tuple[str, str, str],
-                                                                                             Tuple[str, str]],
+                         evidence_group: str, qualifier: str, prepostfix_sentences_map: Dict[str, Tuple[str, str]],
                          config: GenedescConfigParser, terms_merged: bool = False, add_others: bool = False,
                          truncate_others_generic_word: str = "several",
                          truncate_others_aspect_words: Dict[str, str] = None,
@@ -69,7 +68,7 @@ def _get_single_sentence(initial_terms_ids: List[str], node_ids: List[str], onto
         aspect (str): aspect
         evidence_group (str): evidence group
         qualifier (str): qualifier
-        prepostfix_sentences_map (Dict[Tuple[str, str, str], Tuple[str, str]]): map for prefix and postfix phrases
+        prepostfix_sentences_map (Dict[str, Tuple[str, str]]): map for prefix and postfix phrases
         config (GenedescConfigParser): a gene description configuration object
         terms_merged (bool): whether the terms set has been merged to reduce its size
         add_others (bool): whether to say that there are other terms which have been omitted from the sentence
@@ -84,14 +83,14 @@ def _get_single_sentence(initial_terms_ids: List[str], node_ids: List[str], onto
         Union[Sentence,None]: the combined go sentence
     """
     if len(node_ids) > 0:
-        prefix = prepostfix_sentences_map[(aspect, evidence_group, qualifier)][0]
+        prefix = prepostfix_sentences_map[aspect + "|" + evidence_group + "|" + qualifier][0]
         additional_prefix = ""
         others_word = "entities"
         if aspect in truncate_others_aspect_words:
             others_word = truncate_others_aspect_words[aspect]
         if add_others:
             additional_prefix += truncate_others_generic_word + " " + others_word + ", including"
-        postfix = prepostfix_sentences_map[(aspect, evidence_group, qualifier)][1]
+        postfix = prepostfix_sentences_map[aspect + "|" + evidence_group + "|" + qualifier][1]
         term_labels = [ontology.label(node_id, id_if_null=True) for node_id in node_ids]
         if ancestors_with_multiple_children is None:
             ancestors_with_multiple_children = set()

diff --git a/wormbase/config_wb.yml b/wormbase/config_wb.yml
@@ -9,8 +9,8 @@ generic:
 wb_options:
   release: "WS279"
   raw_files_source: "ftp://ftp.ebi.ac.uk/pub/databases/wormbase/releases"
-  agr_human_go_associations: "http://download.alliancegenome.org/3.1.0/GAF/HUMAN/GAF_HUMAN_3.gaf"
-  agr_go_ontology: "http://download.alliancegenome.org/3.1.0/ONTOLOGY/GO/ONTOLOGY_GO_2.obo"
+  agr_human_go_associations: "http://download.alliancegenome.org/3.2.0/GAF/HUMAN/GAF_HUMAN_1.gaf"
+  agr_go_ontology: "http://download.alliancegenome.org/3.2.0/ONTOLOGY/GO/ONTOLOGY_GO_2.obo"
   organisms_to_process:
     # add the species to be processed to the following list. Entries must have the same text as in 'organisms'
     # option
@@ -211,6 +211,10 @@ go_sentences_options:
           match_regex: "structural constituent"
           prefix: "is a"
           postfix: ""
+        - id: 2
+          match_regex: "^extracellular matrix structural constituent"
+          prefix: "is an"
+          postfix: ""
     - aspect: F
       group: EXPERIMENTAL
       qualifier: "contributes_to"
@@ -221,6 +225,10 @@ go_sentences_options:
           match_regex: "structural constituent"
           prefix: "contributes as a"
           postfix: ""
+        - id: 2
+          match_regex: "^extracellular matrix structural constituent"
+          prefix: "contributes as an"
+          postfix: ""
     - aspect: F
       group: HIGH_THROUGHPUT_EXPERIMENTAL
       qualifier: "enables"
@@ -231,6 +239,10 @@ go_sentences_options:
           match_regex: "structural constituent"
           prefix: "is a"
           postfix: ""
+        - id: 2
+          match_regex: "^extracellular matrix structural constituent"
+          prefix: "is an"
+          postfix: ""
     - aspect: F
       group: HIGH_THROUGHPUT_EXPERIMENTAL
       qualifier: "contributes_to"
@@ -239,7 +251,11 @@ go_sentences_options:
       special_cases:
         - id: 1
           match_regex: "structural constituent"
-          prefix: "is a"
+          prefix: "contributes as a"
+          postfix: ""
+        - id: 2
+          match_regex: "^extracellular matrix structural constituent"
+          prefix: "contributes as an"
           postfix: ""
     - aspect: F
       group: PHYLOGENETIC_ANALYSIS_AND_SEQUENCE_BASED_ANALYSIS
@@ -251,6 +267,10 @@ go_sentences_options:
           match_regex: "structural constituent"
           prefix: "is predicted to be a"
           postfix: ""
+        - id: 2
+          match_regex: "^extracellular matrix structural constituent"
+          prefix: "is predicted to be an"
+          postfix: ""
     - aspect: F
       group: PHYLOGENETIC_ANALYSIS_AND_SEQUENCE_BASED_ANALYSIS
       qualifier: "contributes_to"
@@ -261,6 +281,10 @@ go_sentences_options:
           match_regex: "structural constituent"
           prefix: "is predicted to contribute as a"
           postfix: ""
+        - id: 2
+          match_regex: "^extracellular matrix structural constituent"
+          prefix: "is predicted to contribute as an"
+          postfix: ""
     - aspect: F
       group: INFERRED_BY_CURATORS_AND_AUTHORS
       qualifier: "enables"
@@ -271,6 +295,10 @@ go_sentences_options:
           match_regex: "structural constituent"
           prefix: "is predicted to be a"
           postfix: ""
+        - id: 2
+          match_regex: "^extracellular matrix structural constituent"
+          prefix: "is predicted to be an"
+          postfix: ""
     - aspect: F
       group: INFERRED_BY_CURATORS_AND_AUTHORS
       qualifier: "contributes_to"
@@ -281,6 +309,10 @@ go_sentences_options:
           match_regex: "structural constituent"
           prefix: "is predicted to contribute as a"
           postfix: ""
+        - id: 2
+          match_regex: "^extracellular matrix structural constituent"
+          prefix: "is predicted to contribute as an"
+          postfix: ""
     - aspect: F
       group: ELECTRONIC_AND_COMPUTATIONAL_ANALYSIS
       qualifier: "enables"
@@ -291,6 +323,10 @@ go_sentences_options:
           match_regex: "structural constituent"
           prefix: "is predicted to be a"
           postfix: ""
+        - id: 2
+          match_regex: "^extracellular matrix structural constituent"
+          prefix: "is predicted to be an"
+          postfix: ""
     - aspect: F
       group: ELECTRONIC_AND_COMPUTATIONAL_ANALYSIS
       qualifier: "contributes_to"
@@ -301,6 +337,10 @@ go_sentences_options:
           match_regex: "structural constituent"
           prefix: "is predicted to contribute as a"
           postfix: ""
+        - id: 2
+          match_regex: "^extracellular matrix structural constituent"
+          prefix: "is predicted to contribute as an"
+          postfix: ""
     - aspect: P
       group: EXPERIMENTAL
       qualifier: "involved_in"
@@ -316,25 +356,25 @@ go_sentences_options:
     - aspect: P
       group: EXPERIMENTAL
       qualifier: "acts_upstream_of_positive_effect"
-      prefix: "acts upstream of and positively affects"
+      prefix: "acts upstream of with a positive effect on"
       postfix: ""
       special_cases:
     - aspect: P
       group: HIGH_THROUGHPUT_EXPERIMENTAL
       qualifier: "acts_upstream_of_positive_effect"
-      prefix: "acts upstream of and positively affects"
+      prefix: "acts upstream of with a positive effect on"
       postfix: ""
       special_cases:
     - aspect: P
       group: EXPERIMENTAL
       qualifier: "acts_upstream_of_negative_effect"
-      prefix: "acts upstream of and negatively affects"
+      prefix: "acts upstream of with a negative effect on"
       postfix: ""
       special_cases:
     - aspect: P
       group: HIGH_THROUGHPUT_EXPERIMENTAL
       qualifier: "acts_upstream_of_negative_effect"
-      prefix: "acts upstream of and negatively affects"
+      prefix: "acts upstream of with a negative effect on"
       postfix: ""
       special_cases:
     - aspect: P
@@ -352,25 +392,25 @@ go_sentences_options:
     - aspect: P
       group: EXPERIMENTAL
       qualifier: "acts_upstream_of_or_within_negative_effect"
-      prefix: "acts upstream of or within and negatively affects"
+      prefix: "acts upstream of or within with a negative effect on"
       postfix: ""
       special_cases:
     - aspect: P
       group: HIGH_THROUGHPUT_EXPERIMENTAL
       qualifier: "acts_upstream_of_or_within_negative_effect"
-      prefix: "acts upstream of or within and negatively affects"
+      prefix: "acts upstream of or within with a negative effect on"
       postfix: ""
       special_cases:
     - aspect: P
       group: EXPERIMENTAL
       qualifier: "acts_upstream_of_or_within_positive_effect"
-      prefix: "acts upstream of or within and positively affects"
+      prefix: "acts upstream of or within with a positive effect on"
       postfix: ""
       special_cases:
     - aspect: P
       group: HIGH_THROUGHPUT_EXPERIMENTAL
       qualifier: "acts_upstream_of_or_within_positive_effect"
-      prefix: "acts upstream of or within and positively affects"
+      prefix: "acts upstream of or within with a positive effect on"
       postfix: ""
       special_cases:
     - aspect: C
@@ -383,6 +423,10 @@ go_sentences_options:
           match_regex: "intracellular$"
           prefix: "is"
           postfix: ""
+        - id: 2
+          match_regex: ".*component of.*"
+          prefix: "is"
+          postfix: ""
     - aspect: C
       group: EXPERIMENTAL
       qualifier: "colocalizes_with"
@@ -393,6 +437,10 @@ go_sentences_options:
           match_regex: "intracellular$"
           prefix: "is"
           postfix: ""
+        - id: 2
+          match_regex: ".*component of.*"
+          prefix: "is"
+          postfix: ""
     - aspect: C
       group: EXPERIMENTAL
       qualifier: "part_of"
@@ -403,6 +451,10 @@ go_sentences_options:
           match_regex: "intracellular$"
           prefix: "is"
           postfix: ""
+        - id: 2
+          match_regex: ".*component of.*"
+          prefix: "is"
+          postfix: ""
     - aspect: C
       group: HIGH_THROUGHPUT_EXPERIMENTAL
       qualifier: "located_in"
@@ -413,6 +465,10 @@ go_sentences_options:
           match_regex: "intracellular$"
           prefix: "is"
           postfix: ""
+        - id: 2
+          match_regex: ".*component of.*"
+          prefix: "is"
+          postfix: ""
     - aspect: C
       group: HIGH_THROUGHPUT_EXPERIMENTAL
       qualifier: "colocalizes_with"
@@ -423,6 +479,10 @@ go_sentences_options:
           match_regex: "intracellular$"
           prefix: "is"
           postfix: ""
+        - id: 2
+          match_regex: ".*component of.*"
+          prefix: "is"
+          postfix: ""
     - aspect: C
       group: HIGH_THROUGHPUT_EXPERIMENTAL
       qualifier: "part_of"
@@ -433,6 +493,10 @@ go_sentences_options:
           match_regex: "intracellular$"
           prefix: "is"
           postfix: ""
+        - id: 2
+          match_regex: ".*component of.*"
+          prefix: "is"
+          postfix: ""
 
   truncate_others_aggregation_word: several
   truncate_others_terms: