Skip to content

Commit

Permalink
fix(templates): new templates for GAF 2.2
Browse files Browse the repository at this point in the history
valearna committed Dec 10, 2020
1 parent 76c753e commit 79760e1
Showing 4 changed files with 116 additions and 50 deletions.
8 changes: 5 additions & 3 deletions genedescriptions/config_parser.py
Original file line number Diff line number Diff line change
@@ -109,19 +109,21 @@ def _get_module_property_name(prop: ConfigModuleProperty):
def get_prepostfix_sentence_map(self, module: "Module", special_cases_only: bool = False, humans: bool = False):
    """Build the lookup of prefix/postfix phrases for the sentences of a module.

    Keys are strings of the form ``"<aspect>|<group>|<qualifier>"``.

    Args:
        module (Module): the module whose configuration section is read
        special_cases_only (bool): if True, return only the special cases, i.e. a map from key to a list of
            ``(id, match_regex, prefix, postfix)`` tuples, one per special case of that entry
        humans (bool): if True, take the regular prefix/postfix pairs from ``prepostfix_sentences_map_humans``
            instead of ``prepostfix_sentences_map``

    Returns:
        dict: when ``special_cases_only`` is True, the special cases map described above; otherwise a map from
            key to ``(prefix, postfix)``, extended with one extra entry per special case whose group part is the
            entry group followed by the special case id (e.g. ``"F|EXPERIMENTAL1|enables"``)
    """
    module_name = self._get_module_name(module)
    # Special cases are always read from the default map, regardless of `humans`.
    # Entries are indexed by their components so the derived keys never have to be
    # recovered by splitting an already-joined string.
    special_cases = {}
    for entry in self.config[module_name]["prepostfix_sentences_map"]:
        cases = entry.get("special_cases")
        if cases:
            special_cases[(entry["aspect"], entry["group"], entry["qualifier"])] = [
                (case["id"], case["match_regex"], case["prefix"], case["postfix"]) for case in cases]
    if special_cases_only:
        return {"|".join(parts): cases for parts, cases in special_cases.items()}
    map_name = "prepostfix_sentences_map_humans" if humans else "prepostfix_sentences_map"
    prepost_map = {"|".join((entry["aspect"], entry["group"], entry["qualifier"])): (entry["prefix"], entry["postfix"])
                   for entry in self.config[module_name][map_name]}
    for (aspect, group, qualifier), cases in special_cases.items():
        for case_id, _, prefix, postfix in cases:
            # each special case is addressed through a dedicated pseudo-group: group name + special case id
            prepost_map["|".join((aspect, group + str(case_id), qualifier))] = (prefix, postfix)
    return prepost_map

def get_annotations_priority(self, module: Module) -> List[str]:
63 changes: 32 additions & 31 deletions genedescriptions/descriptions_generator.py
Original file line number Diff line number Diff line change
@@ -18,7 +18,7 @@ def __init__(self, sentences):
self.sentences = sentences

def get_description(self):
    """Combine the sentences of the module into a single description.

    Each sentence text gets its first character upper-cased and the results are joined with ". ".
    Sentences with an empty text are skipped, so they neither raise an IndexError nor leave an
    empty fragment between two separators.

    Returns:
        str: the joined description, or an empty string if there is no non-empty sentence
    """
    return ". ".join(sentence.text[0].upper() + sentence.text[1:]
                     for sentence in self.sentences if sentence.text)

def get_ids(self, experimental_only: bool = False):
return list({term_id for sentence in self.sentences for term_id in sentence.terms_ids if not experimental_only or
@@ -88,11 +88,10 @@ def set_terms_groups(self, module, config, limit_to_group, humans):
aspect = annotation["aspect"]
ev_group = evidence_codes_groups_map[annotation["evidence"]["type"]]
qualifier = "_".join(sorted(annotation["qualifiers"])) if "qualifiers" in annotation else ""
if prepostfix_special_cases_sent_map and (aspect, ev_group, qualifier) in \
prepostfix_special_cases_sent_map:
for special_case in prepostfix_special_cases_sent_map[(aspect, ev_group, qualifier)]:
if re.match(re.escape(special_case[1]), self.ontology.label(annotation["object"]["id"],
id_if_null=True)):
if prepostfix_special_cases_sent_map and aspect + "|" + ev_group + "|" + qualifier in \
prepostfix_special_cases_sent_map:
for special_case in prepostfix_special_cases_sent_map[aspect + "|" + ev_group + "|" + qualifier]:
if re.match(special_case[1], self.ontology.label(annotation["object"]["id"], id_if_null=True)):
ev_group = evidence_codes_groups_map[annotation["evidence"]["type"]] + \
str(special_case[0])
if ev_group not in self.evidence_groups_priority_list:
@@ -121,29 +120,31 @@ def get_module_sentences(self, aspect: str, qualifier: str = '',
dist_root = self.config.get_module_property(module=self.module, prop=ConfigModuleProperty.DISTANCE_FROM_ROOT)
add_mul_comanc = self.config.get_module_property(module=self.module,
prop=ConfigModuleProperty.ADD_MULTIPLE_TO_COMMON_ANCEST)
best_group = ""
for terms, evidence_group, priority in sorted([(t, eg, evidence_group_priority[eg]) for eg, t in
self.terms_groups[(aspect, qualifier)].items()],
key=lambda x: x[2]):
trimming_result = self.reduce_num_terms(terms=terms, min_distance_from_root=dist_root[aspect])
if (aspect, evidence_group, qualifier) in self.prepostfix_sentences_map \
and len(trimming_result.final_terms) > 0:
sentences.append(
_get_single_sentence(
initial_terms_ids=list(terms),
node_ids=trimming_result.final_terms, ontology=self.ontology, aspect=aspect,
evidence_group=evidence_group, qualifier=qualifier,
prepostfix_sentences_map=self.prepostfix_sentences_map,
terms_merged=False, trimmed=trimming_result.trimming_applied,
add_others=trimming_result.partial_coverage,
truncate_others_generic_word=self.config.get_module_property(
module=self.module, prop=ConfigModuleProperty.CUTOFF_SEVERAL_WORD),
truncate_others_aspect_words=self.config.get_module_property(
module=self.module, prop=ConfigModuleProperty.CUTOFF_SEVERAL_CATEGORY_WORD),
ancestors_with_multiple_children=trimming_result.multicovering_nodes if add_mul_comanc else
None, rename_cell=rename_cell, config=self.config,
put_anatomy_male_at_end=True if aspect == 'A' else False))
if keep_only_best_group:
return ModuleSentences(sentences)
if not best_group or re.match(best_group + r"([0-9]*)?", evidence_group):
trimming_result = self.reduce_num_terms(terms=terms, min_distance_from_root=dist_root[aspect])
if aspect + "|" + evidence_group + "|" + qualifier in self.prepostfix_sentences_map \
and len(trimming_result.final_terms) > 0:
sentences.append(
_get_single_sentence(
initial_terms_ids=list(terms),
node_ids=trimming_result.final_terms, ontology=self.ontology, aspect=aspect,
evidence_group=evidence_group, qualifier=qualifier,
prepostfix_sentences_map=self.prepostfix_sentences_map,
terms_merged=False, trimmed=trimming_result.trimming_applied,
add_others=trimming_result.partial_coverage,
truncate_others_generic_word=self.config.get_module_property(
module=self.module, prop=ConfigModuleProperty.CUTOFF_SEVERAL_WORD),
truncate_others_aspect_words=self.config.get_module_property(
module=self.module, prop=ConfigModuleProperty.CUTOFF_SEVERAL_CATEGORY_WORD),
ancestors_with_multiple_children=trimming_result.multicovering_nodes if add_mul_comanc else
None, rename_cell=rename_cell, config=self.config,
put_anatomy_male_at_end=True if aspect == 'A' else False))
if keep_only_best_group and not best_group:
best_group = evidence_group
if merge_groups_with_same_prefix:
remove_parents = self.config.get_module_property(module=self.module,
prop=ConfigModuleProperty.DEL_PARENTS_IF_CHILD)
@@ -232,17 +233,17 @@ def merge_sentences_with_same_prefix(self, sentences: List[Sentence], remove_par
"""
merged_sentences = defaultdict(SentenceMerger)
for sentence in sentences:
prefix = self.prepostfix_sentences_map[(sentence.aspect, sentence.evidence_group, sentence.qualifier)][0]
merged_sentences[prefix].postfix_list.append(self.prepostfix_sentences_map[(sentence.aspect,
sentence.evidence_group,
sentence.qualifier)][1])
prefix = self.prepostfix_sentences_map[sentence.aspect + "|" + sentence.evidence_group + "|" + sentence.qualifier][0]
merged_sentences[prefix].postfix_list.append(self.prepostfix_sentences_map[sentence.aspect + "|" +
sentence.evidence_group + "|" +
sentence.qualifier][1])
merged_sentences[prefix].aspect = sentence.aspect
merged_sentences[prefix].qualifier = sentence.qualifier
merged_sentences[prefix].terms_ids.update(sentence.terms_ids)
merged_sentences[prefix].initial_terms_ids.update(sentence.initial_terms_ids)
for term in sentence.terms_ids:
merged_sentences[prefix].term_postfix_dict[term] = self.prepostfix_sentences_map[
(sentence.aspect, sentence.evidence_group, sentence.qualifier)][1]
sentence.aspect + "|" + sentence.evidence_group + "|" + sentence.qualifier][1]
merged_sentences[prefix].evidence_groups.append(sentence.evidence_group)
for term in sentence.terms_ids:
merged_sentences[prefix].term_evgroup_dict[term] = sentence.evidence_group
9 changes: 4 additions & 5 deletions genedescriptions/sentence_generation_functions.py
Original file line number Diff line number Diff line change
@@ -54,8 +54,7 @@ def compose_sentence(prefix: str, additional_prefix: str, term_names: List[str],


def _get_single_sentence(initial_terms_ids: List[str], node_ids: List[str], ontology: Ontology, aspect: str,
evidence_group: str, qualifier: str, prepostfix_sentences_map: Dict[Tuple[str, str, str],
Tuple[str, str]],
evidence_group: str, qualifier: str, prepostfix_sentences_map: Dict[str, Tuple[str, str]],
config: GenedescConfigParser, terms_merged: bool = False, add_others: bool = False,
truncate_others_generic_word: str = "several",
truncate_others_aspect_words: Dict[str, str] = None,
@@ -69,7 +68,7 @@ def _get_single_sentence(initial_terms_ids: List[str], node_ids: List[str], onto
aspect (str): aspect
evidence_group (str): evidence group
qualifier (str): qualifier
prepostfix_sentences_map (Dict[Tuple[str, str, str], Tuple[str, str]]): map for prefix and postfix phrases
prepostfix_sentences_map (Dict[str, Tuple[str, str]]): map for prefix and postfix phrases
config (GenedescConfigParser): a gene description configuration object
terms_merged (bool): whether the terms set has been merged to reduce its size
add_others (bool): whether to say that there are other terms which have been omitted from the sentence
@@ -84,14 +83,14 @@ def _get_single_sentence(initial_terms_ids: List[str], node_ids: List[str], onto
Union[Sentence,None]: the combined go sentence
"""
if len(node_ids) > 0:
prefix = prepostfix_sentences_map[(aspect, evidence_group, qualifier)][0]
prefix = prepostfix_sentences_map[aspect + "|" + evidence_group + "|" + qualifier][0]
additional_prefix = ""
others_word = "entities"
if aspect in truncate_others_aspect_words:
others_word = truncate_others_aspect_words[aspect]
if add_others:
additional_prefix += truncate_others_generic_word + " " + others_word + ", including"
postfix = prepostfix_sentences_map[(aspect, evidence_group, qualifier)][1]
postfix = prepostfix_sentences_map[aspect + "|" + evidence_group + "|" + qualifier][1]
term_labels = [ontology.label(node_id, id_if_null=True) for node_id in node_ids]
if ancestors_with_multiple_children is None:
ancestors_with_multiple_children = set()
86 changes: 75 additions & 11 deletions wormbase/config_wb.yml
Original file line number Diff line number Diff line change
@@ -9,8 +9,8 @@ generic:
wb_options:
release: "WS279"
raw_files_source: "ftp://ftp.ebi.ac.uk/pub/databases/wormbase/releases"
agr_human_go_associations: "http://download.alliancegenome.org/3.1.0/GAF/HUMAN/GAF_HUMAN_3.gaf"
agr_go_ontology: "http://download.alliancegenome.org/3.1.0/ONTOLOGY/GO/ONTOLOGY_GO_2.obo"
agr_human_go_associations: "http://download.alliancegenome.org/3.2.0/GAF/HUMAN/GAF_HUMAN_1.gaf"
agr_go_ontology: "http://download.alliancegenome.org/3.2.0/ONTOLOGY/GO/ONTOLOGY_GO_2.obo"
organisms_to_process:
# add the species to be processed to the following list. Entries must have the same text as in 'organisms'
# option
@@ -211,6 +211,10 @@ go_sentences_options:
match_regex: "structural constituent"
prefix: "is a"
postfix: ""
- id: 2
match_regex: "^extracellular matrix structural constituent"
prefix: "is an"
postfix: ""
- aspect: F
group: EXPERIMENTAL
qualifier: "contributes_to"
@@ -221,6 +225,10 @@ go_sentences_options:
match_regex: "structural constituent"
prefix: "contributes as a"
postfix: ""
- id: 2
match_regex: "^extracellular matrix structural constituent"
prefix: "contributes as an"
postfix: ""
- aspect: F
group: HIGH_THROUGHPUT_EXPERIMENTAL
qualifier: "enables"
@@ -231,6 +239,10 @@ go_sentences_options:
match_regex: "structural constituent"
prefix: "is a"
postfix: ""
- id: 2
match_regex: "^extracellular matrix structural constituent"
prefix: "is an"
postfix: ""
- aspect: F
group: HIGH_THROUGHPUT_EXPERIMENTAL
qualifier: "contributes_to"
@@ -239,7 +251,11 @@ go_sentences_options:
special_cases:
- id: 1
match_regex: "structural constituent"
prefix: "is a"
prefix: "contributes as a"
postfix: ""
- id: 2
match_regex: "^extracellular matrix structural constituent"
prefix: "contributes as an"
postfix: ""
- aspect: F
group: PHYLOGENETIC_ANALYSIS_AND_SEQUENCE_BASED_ANALYSIS
@@ -251,6 +267,10 @@ go_sentences_options:
match_regex: "structural constituent"
prefix: "is predicted to be a"
postfix: ""
- id: 2
match_regex: "^extracellular matrix structural constituent"
prefix: "is predicted to be an"
postfix: ""
- aspect: F
group: PHYLOGENETIC_ANALYSIS_AND_SEQUENCE_BASED_ANALYSIS
qualifier: "contributes_to"
@@ -261,6 +281,10 @@ go_sentences_options:
match_regex: "structural constituent"
prefix: "is predicted to contribute as a"
postfix: ""
- id: 2
match_regex: "^extracellular matrix structural constituent"
prefix: "is predicted to contribute as an"
postfix: ""
- aspect: F
group: INFERRED_BY_CURATORS_AND_AUTHORS
qualifier: "enables"
@@ -271,6 +295,10 @@ go_sentences_options:
match_regex: "structural constituent"
prefix: "is predicted to be a"
postfix: ""
- id: 2
match_regex: "^extracellular matrix structural constituent"
prefix: "is predicted to be an"
postfix: ""
- aspect: F
group: INFERRED_BY_CURATORS_AND_AUTHORS
qualifier: "contributes_to"
@@ -281,6 +309,10 @@ go_sentences_options:
match_regex: "structural constituent"
prefix: "is predicted to contribute as a"
postfix: ""
- id: 2
match_regex: "^extracellular matrix structural constituent"
prefix: "is predicted to contribute as an"
postfix: ""
- aspect: F
group: ELECTRONIC_AND_COMPUTATIONAL_ANALYSIS
qualifier: "enables"
@@ -291,6 +323,10 @@ go_sentences_options:
match_regex: "structural constituent"
prefix: "is predicted to be a"
postfix: ""
- id: 2
match_regex: "^extracellular matrix structural constituent"
prefix: "is predicted to be an"
postfix: ""
- aspect: F
group: ELECTRONIC_AND_COMPUTATIONAL_ANALYSIS
qualifier: "contributes_to"
@@ -301,6 +337,10 @@ go_sentences_options:
match_regex: "structural constituent"
prefix: "is predicted to contribute as a"
postfix: ""
- id: 2
match_regex: "^extracellular matrix structural constituent"
prefix: "is predicted to contribute as an"
postfix: ""
- aspect: P
group: EXPERIMENTAL
qualifier: "involved_in"
@@ -316,25 +356,25 @@ go_sentences_options:
- aspect: P
group: EXPERIMENTAL
qualifier: "acts_upstream_of_positive_effect"
prefix: "acts upstream of and positively affects"
prefix: "acts upstream of with a positive effect on"
postfix: ""
special_cases:
- aspect: P
group: HIGH_THROUGHPUT_EXPERIMENTAL
qualifier: "acts_upstream_of_positive_effect"
prefix: "acts upstream of and positively affects"
prefix: "acts upstream of with a positive effect on"
postfix: ""
special_cases:
- aspect: P
group: EXPERIMENTAL
qualifier: "acts_upstream_of_negative_effect"
prefix: "acts upstream of and negatively affects"
prefix: "acts upstream of with a negative effect on"
postfix: ""
special_cases:
- aspect: P
group: HIGH_THROUGHPUT_EXPERIMENTAL
qualifier: "acts_upstream_of_negative_effect"
prefix: "acts upstream of and negatively affects"
prefix: "acts upstream of with a negative effect on"
postfix: ""
special_cases:
- aspect: P
@@ -352,25 +392,25 @@ go_sentences_options:
- aspect: P
group: EXPERIMENTAL
qualifier: "acts_upstream_of_or_within_negative_effect"
prefix: "acts upstream of or within and negatively affects"
prefix: "acts upstream of or within with a negative effect on"
postfix: ""
special_cases:
- aspect: P
group: HIGH_THROUGHPUT_EXPERIMENTAL
qualifier: "acts_upstream_of_or_within_negative_effect"
prefix: "acts upstream of or within and negatively affects"
prefix: "acts upstream of or within with a negative effect on"
postfix: ""
special_cases:
- aspect: P
group: EXPERIMENTAL
qualifier: "acts_upstream_of_or_within_positive_effect"
prefix: "acts upstream of or within and positively affects"
prefix: "acts upstream of or within with a positive effect on"
postfix: ""
special_cases:
- aspect: P
group: HIGH_THROUGHPUT_EXPERIMENTAL
qualifier: "acts_upstream_of_or_within_positive_effect"
prefix: "acts upstream of or within and positively affects"
prefix: "acts upstream of or within with a positive effect on"
postfix: ""
special_cases:
- aspect: C
@@ -383,6 +423,10 @@ go_sentences_options:
match_regex: "intracellular$"
prefix: "is"
postfix: ""
- id: 2
match_regex: ".*component of.*"
prefix: "is"
postfix: ""
- aspect: C
group: EXPERIMENTAL
qualifier: "colocalizes_with"
@@ -393,6 +437,10 @@ go_sentences_options:
match_regex: "intracellular$"
prefix: "is"
postfix: ""
- id: 2
match_regex: ".*component of.*"
prefix: "is"
postfix: ""
- aspect: C
group: EXPERIMENTAL
qualifier: "part_of"
@@ -403,6 +451,10 @@ go_sentences_options:
match_regex: "intracellular$"
prefix: "is"
postfix: ""
- id: 2
match_regex: ".*component of.*"
prefix: "is"
postfix: ""
- aspect: C
group: HIGH_THROUGHPUT_EXPERIMENTAL
qualifier: "located_in"
@@ -413,6 +465,10 @@ go_sentences_options:
match_regex: "intracellular$"
prefix: "is"
postfix: ""
- id: 2
match_regex: ".*component of.*"
prefix: "is"
postfix: ""
- aspect: C
group: HIGH_THROUGHPUT_EXPERIMENTAL
qualifier: "colocalizes_with"
@@ -423,6 +479,10 @@ go_sentences_options:
match_regex: "intracellular$"
prefix: "is"
postfix: ""
- id: 2
match_regex: ".*component of.*"
prefix: "is"
postfix: ""
- aspect: C
group: HIGH_THROUGHPUT_EXPERIMENTAL
qualifier: "part_of"
@@ -433,6 +493,10 @@ go_sentences_options:
match_regex: "intracellular$"
prefix: "is"
postfix: ""
- id: 2
match_regex: ".*component of.*"
prefix: "is"
postfix: ""

truncate_others_aggregation_word: several
truncate_others_terms:

0 comments on commit 79760e1

Please sign in to comment.