Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
valearna committed Jan 3, 2020
2 parents ea69a72 + 3fbd06e commit 9db3559
Show file tree
Hide file tree
Showing 13 changed files with 207,539 additions and 57 deletions.
1 change: 1 addition & 0 deletions genedescriptions/commons.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
@dataclass
class Sentence:
prefix: str
initial_terms_ids: List[str]
terms_ids: List[str]
postfix: str
text: str
Expand Down
14 changes: 11 additions & 3 deletions genedescriptions/descriptions_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,20 @@ def get_description(self):
return " and ".join([sentence.text for sentence in self.sentences])

def get_ids(self, experimental_only: bool = False):
return [term_id for sentence in self.sentences for term_id in sentence.terms_ids if not experimental_only or
sentence.evidence_group.startswith("EXPERIMENTAL")]
return list({term_id for sentence in self.sentences for term_id in sentence.terms_ids if not experimental_only or
sentence.evidence_group.startswith("EXPERIMENTAL")})

def get_initial_ids(self, experimental_only: bool = False):
return list({term_id for sentence in self.sentences for term_id in sentence.initial_terms_ids if not
experimental_only or sentence.evidence_group.startswith("EXPERIMENTAL")})

def contains_sentences(self):
return len(self.sentences) > 0


class SentenceMerger(object):
def __init__(self):
self.initial_terms_ids = set()
self.postfix_list = []
self.terms_ids = set()
self.term_postfix_dict = {}
Expand Down Expand Up @@ -124,6 +129,7 @@ def get_module_sentences(self, aspect: str, qualifier: str = '',
and len(trimming_result.final_terms) > 0:
sentences.append(
_get_single_sentence(
initial_terms_ids=list(terms),
node_ids=trimming_result.final_terms, ontology=self.ontology, aspect=aspect,
evidence_group=evidence_group, qualifier=qualifier,
prepostfix_sentences_map=self.prepostfix_sentences_map,
Expand Down Expand Up @@ -233,6 +239,7 @@ def merge_sentences_with_same_prefix(self, sentences: List[Sentence], remove_par
merged_sentences[prefix].aspect = sentence.aspect
merged_sentences[prefix].qualifier = sentence.qualifier
merged_sentences[prefix].terms_ids.update(sentence.terms_ids)
merged_sentences[prefix].initial_terms_ids.update(sentence.initial_terms_ids)
for term in sentence.terms_ids:
merged_sentences[prefix].term_postfix_dict[term] = self.prepostfix_sentences_map[
(sentence.aspect, sentence.evidence_group, sentence.qualifier)][1]
Expand All @@ -252,7 +259,8 @@ def merge_sentences_with_same_prefix(self, sentences: List[Sentence], remove_par
logger.debug("Removed " + str(len(sent_merger.terms_ids) - len(terms_no_ancestors)) +
" parents from terms while merging sentences with same prefix")
sent_merger.terms_ids = terms_no_ancestors
return [Sentence(prefix=prefix, terms_ids=list(sent_merger.terms_ids),
return [Sentence(prefix=prefix, initial_terms_ids=list(sent_merger.initial_terms_ids),
terms_ids=list(sent_merger.terms_ids),
postfix=OntologySentenceGenerator.merge_postfix_phrases(sent_merger.postfix_list),
text=compose_sentence(prefix=prefix,
term_names=[self.ontology.label(node, id_if_null=True) for node in
Expand Down
61 changes: 29 additions & 32 deletions genedescriptions/gene_description.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from typing import List

from genedescriptions.commons import Module
from genedescriptions.commons import Module, Sentence
from genedescriptions.config_parser import GenedescConfigParser
from genedescriptions.descriptions_generator import OntologySentenceGenerator, ModuleSentences
from genedescriptions.sentence_generation_functions import concatenate_words_with_oxford_comma
Expand Down Expand Up @@ -185,52 +185,49 @@ def _get_module_initial_set(aspect: str, sentence_generator: OntologySentenceGen
(aspect, additional_qualifier)].items() for elem in sets if (aspect, key, additional_qualifier) in
sentence_generator.prepostfix_sentences_map]))

def set_initial_stats(self, module: Module, sentence_generator: OntologySentenceGenerator,
sentence_generator_exp_only: OntologySentenceGenerator = None):
def set_or_update_initial_stats(self, module: Module, sent_generator: OntologySentenceGenerator,
module_sentences: ModuleSentences):
"""set initial stats for a specific module
Args:
sent_generator: the main sentence generator
module: the module
sentence_generator: the main sentence generator
sentence_generator_exp_only: sentence generator with experimental evidence codes only
module_sentences (ModuleSentences): the module sentences
Returns:
"""
if module == Module.GO_FUNCTION:
self.stats.set_initial_go_ids_f = self._get_module_initial_set(
aspect="F", additional_qualifier="contributes_to", sentence_generator=sentence_generator)
self.stats.set_initial_experimental_go_ids_f = self._get_module_initial_set(
aspect="F", additional_qualifier="contributes_to", sentence_generator=sentence_generator_exp_only)
self.stats.set_initial_go_ids_f = self._get_merged_ids(module_sentences.get_initial_ids(),
self.stats.set_initial_go_ids_f)
self.stats.set_initial_experimental_go_ids_f = self._get_merged_ids(
module_sentences.get_initial_ids(experimental_only=True), self.stats.set_initial_experimental_go_ids_f)
elif module == Module.GO_COMPONENT:
self.stats.set_initial_go_ids_c = self._get_module_initial_set(
aspect="C", additional_qualifier="colocalizes_with", sentence_generator=sentence_generator)
self.stats.set_initial_experimental_go_ids_c = self._get_module_initial_set(
aspect="C", additional_qualifier="colocalizes_with", sentence_generator=sentence_generator_exp_only)
self.stats.set_initial_go_ids_c = self._get_merged_ids(module_sentences.get_initial_ids(),
self.stats.set_initial_go_ids_c)
self.stats.set_initial_experimental_go_ids_c = self._get_merged_ids(
module_sentences.get_initial_ids(experimental_only=True), self.stats.set_initial_experimental_go_ids_c)
elif module == Module.GO_PROCESS:
self.stats.set_initial_go_ids_p = self._get_module_initial_set(
aspect="P", sentence_generator=sentence_generator)
self.stats.set_initial_experimental_go_ids_p = self._get_module_initial_set(
aspect="P", sentence_generator=sentence_generator_exp_only)
self.stats.set_initial_go_ids_p = self._get_merged_ids(module_sentences.get_initial_ids(),
self.stats.set_initial_go_ids_p)
self.stats.set_initial_experimental_go_ids_p = self._get_merged_ids(
module_sentences.get_initial_ids(experimental_only=True), self.stats.set_initial_experimental_go_ids_p)
elif module == Module.EXPRESSION:
self.stats.set_initial_expression_ids = self._get_module_initial_set(
aspect="A", main_qualifier="Verified", sentence_generator=sentence_generator)
self.stats.set_initial_expression_ids = self._get_merged_ids(module_sentences.get_initial_ids(),
self.stats.set_initial_expression_ids)
elif module == Module.DO_EXPERIMENTAL:
self.stats.total_number_do_exp_bio_annotations += len(sentence_generator.gene_annots)
self.stats.set_initial_do_ids = self._get_merged_ids(
[term_id for terms in sentence_generator.terms_groups.values() for tvalues in terms.values() for
term_id in tvalues], self.stats.set_initial_do_ids)
self.stats.total_number_do_exp_bio_annotations += len(sent_generator.gene_annots)
self.stats.set_initial_do_ids = self._get_merged_ids(module_sentences.get_initial_ids(),
self.stats.set_initial_do_ids)
elif module == Module.DO_BIOMARKER:
self.stats.total_number_do_exp_bio_annotations += len(sentence_generator.gene_annots)
self.stats.set_initial_do_ids = self._get_merged_ids(
[term_id for terms in sentence_generator.terms_groups.values() for tvalues in terms.values() for term_id
in tvalues], self.stats.set_initial_do_ids)
self.stats.total_number_do_exp_bio_annotations += len(sent_generator.gene_annots)
self.stats.set_initial_do_ids = self._get_merged_ids(module_sentences.get_initial_ids(),
self.stats.set_initial_do_ids)
elif module == Module.DO_ORTHOLOGY:
self.stats.total_number_do_via_orth_annotations = len(sentence_generator.gene_annots)
self.stats.set_initial_do_ids = self._get_merged_ids(
[term_id for terms in sentence_generator.terms_groups.values() for tvalues in terms.values() for term_id
in tvalues], self.stats.set_initial_do_ids)
self.stats.total_number_do_via_orth_annotations += len(sent_generator.gene_annots)
self.stats.set_initial_do_ids = self._get_merged_ids(module_sentences.get_initial_ids(),
self.stats.set_initial_do_ids)
self.stats.total_number_do_annotations = self.stats.total_number_do_exp_bio_annotations + \
self.stats.total_number_do_via_orth_annotations
if module == Module.GO_PROCESS or module == Module.GO_FUNCTION or module == Module.GO_COMPONENT:
self.stats.total_number_go_annotations = len(sentence_generator.gene_annots)
self.stats.total_number_go_annotations = len(sent_generator.gene_annots)
4 changes: 3 additions & 1 deletion genedescriptions/ontology_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def get_all_common_ancestors(node_ids: List[str], ontology: Ontology, min_distan
for basic_prop_val in onto_anc["meta"]["basicPropertyValues"]:
if basic_prop_val["pred"] == "OIO:hasOBONamespace":
onto_anc_root = basic_prop_val["val"]
if onto_anc["depth"] >= min_distance_from_root and (
if (ancestor in node_ids or onto_anc["depth"] >= min_distance_from_root) and (
not onto_anc_root or onto_anc_root == common_root) and (not nodeids_blacklist or ancestor not in
nodeids_blacklist):
ancestors[ancestor].append(node_id)
Expand Down Expand Up @@ -104,6 +104,7 @@ def set_all_depths_in_subgraph(ontology: Ontology, root_id: str, relations: List


def set_ic_ontology_struct(ontology: Ontology, relations: List[str] = None):
logger.info("Setting information content values based on ontology structure")
roots = ontology.get_roots(relations=relations)
for root_id in roots:
if "num_subsumers" not in ontology.node(root_id) and ("type" not in ontology.node(root_id) or
Expand All @@ -121,6 +122,7 @@ def set_ic_ontology_struct(ontology: Ontology, relations: List[str] = None):
if "type" not in ontology.node(root_id) or ontology.node_type(root_id) == "CLASS":
_set_information_content_in_subgraph(ontology=ontology, root_id=root_id,
maxleaves=ontology.node(root_id)["num_leaves"], relations=relations)
logger.info("Finished setting information content values")


def set_ic_annot_freq(ontology: Ontology, annotations: AssociationSet):
Expand Down
28 changes: 18 additions & 10 deletions genedescriptions/precanned_modules.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,16 @@ def set_gene_ontology_module(dm: DataManager, conf_parser: GenedescConfigParser,
module_sentences=comp_module_sentences, module=Module.GO_COMPONENT)
gene_desc.set_or_extend_module_description_and_final_stats(module_sentences=colocalizes_with_module_sentences,
module=Module.GO_COMPONENT)
gene_desc.set_initial_stats(module=Module.GO_FUNCTION, sentence_generator=go_sent_generator,
sentence_generator_exp_only=go_sent_generator_exp)
gene_desc.set_initial_stats(module=Module.GO_PROCESS, sentence_generator=go_sent_generator,
sentence_generator_exp_only=go_sent_generator_exp)
gene_desc.set_initial_stats(module=Module.GO_COMPONENT, sentence_generator=go_sent_generator,
sentence_generator_exp_only=go_sent_generator_exp)
gene_desc.set_or_update_initial_stats(module=Module.GO_FUNCTION, sent_generator=go_sent_generator,
module_sentences=contributes_to_module_sentences)
gene_desc.set_or_update_initial_stats(module=Module.GO_FUNCTION, sent_generator=go_sent_generator,
module_sentences=func_module_sentences)
gene_desc.set_or_update_initial_stats(module=Module.GO_PROCESS, sent_generator=go_sent_generator,
module_sentences=proc_module_sentences)
gene_desc.set_or_update_initial_stats(module=Module.GO_COMPONENT, sent_generator=go_sent_generator,
module_sentences=colocalizes_with_module_sentences)
gene_desc.set_or_update_initial_stats(module=Module.GO_COMPONENT, sent_generator=go_sent_generator,
module_sentences=comp_module_sentences)


def set_disease_module(df: DataManager, conf_parser: GenedescConfigParser, gene_desc: GeneDescription, gene: Gene,
Expand All @@ -81,9 +85,12 @@ def set_disease_module(df: DataManager, conf_parser: GenedescConfigParser, gene_
aspect='D', merge_groups_with_same_prefix=True, keep_only_best_group=False)
gene_desc.set_or_extend_module_description_and_final_stats(module=Module.DO_ORTHOLOGY,
module_sentences=disease_via_orth_module_sentences)
gene_desc.set_initial_stats(module=Module.DO_EXPERIMENTAL, sentence_generator=do_sentence_exp_generator)
gene_desc.set_initial_stats(module=Module.DO_BIOMARKER, sentence_generator=do_sentence_bio_generator)
gene_desc.set_initial_stats(module=Module.DO_ORTHOLOGY, sentence_generator=do_via_orth_sentence_generator)
gene_desc.set_or_update_initial_stats(module=Module.DO_EXPERIMENTAL, sent_generator=do_sentence_exp_generator,
module_sentences=disease_exp_module_sentences)
gene_desc.set_or_update_initial_stats(module=Module.DO_BIOMARKER, sent_generator=do_sentence_bio_generator,
module_sentences=disease_bio_module_sentences)
gene_desc.set_or_update_initial_stats(module=Module.DO_ORTHOLOGY, sent_generator=do_via_orth_sentence_generator,
module_sentences=disease_via_orth_module_sentences)


def set_expression_module(df: DataManager, conf_parser: GenedescConfigParser, gene_desc: GeneDescription, gene: Gene):
Expand All @@ -93,7 +100,8 @@ def set_expression_module(df: DataManager, conf_parser: GenedescConfigParser, ge
aspect='A', qualifier="Verified", merge_groups_with_same_prefix=True, keep_only_best_group=False)
gene_desc.set_or_extend_module_description_and_final_stats(module_sentences=expression_module_sentences,
module=Module.EXPRESSION)
gene_desc.set_initial_stats(module=Module.EXPRESSION, sentence_generator=expr_sentence_generator)
gene_desc.set_or_update_initial_stats(module=Module.EXPRESSION, sent_generator=expr_sentence_generator,
module_sentences=expression_module_sentences)


def set_alliance_human_orthology_module(orthologs: List[List[str]], gene_desc: GeneDescription,
Expand Down
7 changes: 4 additions & 3 deletions genedescriptions/sentence_generation_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,9 @@ def compose_sentence(prefix: str, additional_prefix: str, term_names: List[str],
separator=config.get_terms_delimiter()) + postfix


def _get_single_sentence(node_ids: List[str], ontology: Ontology, aspect: str, evidence_group: str, qualifier: str,
prepostfix_sentences_map: Dict[Tuple[str, str, str], Tuple[str, str]],
def _get_single_sentence(initial_terms_ids: List[str], node_ids: List[str], ontology: Ontology, aspect: str,
evidence_group: str, qualifier: str, prepostfix_sentences_map: Dict[Tuple[str, str, str],
Tuple[str, str]],
config: GenedescConfigParser, terms_merged: bool = False, add_others: bool = False,
truncate_others_generic_word: str = "several",
truncate_others_aspect_words: Dict[str, str] = None,
Expand Down Expand Up @@ -94,7 +95,7 @@ def _get_single_sentence(node_ids: List[str], ontology: Ontology, aspect: str, e
term_labels = [ontology.label(node_id, id_if_null=True) for node_id in node_ids]
if ancestors_with_multiple_children is None:
ancestors_with_multiple_children = set()
return Sentence(prefix=prefix, terms_ids=node_ids, postfix=postfix,
return Sentence(prefix=prefix, initial_terms_ids=initial_terms_ids, terms_ids=node_ids, postfix=postfix,
text=compose_sentence(prefix=prefix, term_names=term_labels, postfix=postfix,
additional_prefix=additional_prefix,
ancestors_with_multiple_children=ancestors_with_multiple_children,
Expand Down
3 changes: 1 addition & 2 deletions genedescriptions/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,10 @@ def __init__(self):
@staticmethod
def _get_num_covered_nodes(set_initial_terms, set_final_terms, ontology):
num_covered_nodes = 0
final_t_ancestors = {final_term: ontology.ancestors(final_term) for final_term in set_final_terms}
for initial_term in set_initial_terms:
initial_t_ancestors = set(ontology.ancestors(initial_term, reflexive=True))
for final_term in set_final_terms:
if final_term in initial_t_ancestors or initial_term in final_t_ancestors[final_term]:
if final_term in initial_t_ancestors:
num_covered_nodes += 1
break
return num_covered_nodes
Expand Down
4 changes: 2 additions & 2 deletions genedescriptions/trimming.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,8 +137,8 @@ def trim(self, node_ids: List[str], max_num_nodes: int = 3, min_distance_from_ro
best_cands = [(cand_id, candidates_dict[cand_id][1])]
for best_cand in best_cands:
selected_cands_ids.append(best_cand[0])
for node_id in candidates_dict[best_cand[0]][1]:
cands_ids_to_process -= set(node_to_cands_map[node_id])
cands_ids_to_process = {cand_id for cand_id in cands_ids_to_process if best_cand[0] not in
self.ontology.ancestors(cand_id, reflexive=True)}
else:
selected_cands_ids.append(cand_id)
if len(selected_cands_ids) <= max_num_nodes:
Expand Down
6 changes: 3 additions & 3 deletions tests/config_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -668,12 +668,12 @@ expression_sentences_options:
- "WBbt:0007849"
- "WBbt:0005738"
rename_terms:
"^neuron$": "nervous system"
"^neuron$": "neurons"
"(.*)([a-z]+ )neuron$": "\\1\\2neurons"
"^germ cell$": "germ cells"
"^hypodermis$": "the hypodermis"
"(.*)hermaphrodite-specific$": "\\1hermaphrodite"
"(.*)male-specific$": "\\1male"
"^ganglion$": "ganglia"
"^neuroblast$": "neuroblasts"
evidence_codes:
IDA:
group: EXPERIMENTAL
Expand Down
Loading

0 comments on commit 9db3559

Please sign in to comment.