Skip to content

Commit

Permalink
feat(*): updated WB pipeline and added option to load IBA from gaf files
Browse files: browse the repository at this point in the history
  • Loading branch information
valearna committed Mar 26, 2021
1 parent 7db8e06 commit caaeb1a
Show file tree
Hide file tree
Showing 6 changed files with 35 additions and 15 deletions.
4 changes: 2 additions & 2 deletions genedescriptions/api_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def __init__(self, textpresso_api_token):
self.textpresso_api_token = textpresso_api_token
self.tpc_cache = {}
self.class_cache = {}
- self.tpc_api_endpoint = "https://textpressocentral.org:18080/v1/textpresso/api/get_documents_count"
+ self.tpc_api_endpoint = "http://textpressocentral.org:9001/v1/textpresso/api/get_documents_count"
if not os.environ.get('PYTHONHTTPSVERIFY', '') and getattr(ssl, '_create_unverified_context', None):
ssl._create_default_https_context = ssl._create_unverified_context

Expand All @@ -29,7 +29,7 @@ def get_textpresso_popularity(self, keyword: str):
return self.tpc_cache[keyword]
else:
data = json.dumps({"token": self.textpresso_api_token, "query": {
- "keywords": keyword, "type": "document", "corpora": ["C. elegans"]}})
+ "keywords": keyword, "type": "document", "corpora": ["C. elegans and Suppl"]}})
data = data.encode('utf-8')
req = urllib.request.Request(self.tpc_api_endpoint, data, headers={'Content-type': 'application/json',
'Accept': 'application/json'})
Expand Down
8 changes: 4 additions & 4 deletions genedescriptions/config_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,10 @@ class GenedescConfigParser(object):
def __init__(self, file_path):
with open(file_path) as conf_file:
self.config = yaml.safe_load(conf_file)
- # self.add_go_do_not_annotate_to_blacklist(
- # 'http://current.geneontology.org/ontology/subsets/gocheck_do_not_annotate.json')
- # self.add_go_do_not_annotate_to_blacklist(
- # 'http://current.geneontology.org/ontology/subsets/gocheck_do_not_manually_annotate.json')
+ self.add_go_do_not_annotate_to_blacklist(
+ 'http://current.geneontology.org/ontology/subsets/gocheck_do_not_annotate.json')
+ self.add_go_do_not_annotate_to_blacklist(
+ 'http://current.geneontology.org/ontology/subsets/gocheck_do_not_manually_annotate.json')

def add_go_do_not_annotate_to_blacklist(self, slim_url):
response = urllib.request.urlopen(slim_url)
Expand Down
9 changes: 6 additions & 3 deletions genedescriptions/data_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from collections import defaultdict
from typing import List, Iterable, Dict
from ontobio import AssociationSetFactory
+ from ontobio.io.assocparser import AssocParserConfig
+ from ontobio.io.gafparser import GafParser
from ontobio.ontol_factory import OntologyFactory
from ontobio.ontol import Ontology
from ontobio.assocmodel import AssociationSet
Expand Down Expand Up @@ -312,9 +314,10 @@ def load_associations_from_file(self, associations_type: DataType, associations_
associations_cache_path (str): path to cache file for the associations
config (GenedescConfigParser): configuration object where to read properties
"""
- assocs = AssociationSetFactory().create_from_file(file=self._get_cached_file(
- cache_path=associations_cache_path, file_source_url=associations_url),
- ontology=self.get_ontology(associations_type), skim=False)
+ assoc_config = AssocParserConfig(remove_double_prefixes=True, paint=True)
+ assocs = AssociationSetFactory().create_from_assocs(assocs=GafParser(config=assoc_config).parse(
+ file=self._get_cached_file(cache_path=associations_cache_path, file_source_url=associations_url),
+ skipheader=True), ontology=self.get_ontology(associations_type))
self.set_associations(associations_type=associations_type, associations=assocs, config=config)

def get_annotations_for_gene(self, gene_id: str, annot_type: DataType = DataType.GO,
Expand Down
25 changes: 21 additions & 4 deletions wormbase/config_wb.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ generic:

# options for WormBase gene description generation - used when data_fetcher is set to 'wb_data_fetcher'
wb_options:
- release: "WS279"
+ release: "WS280"
raw_files_source: "ftp://ftp.ebi.ac.uk/pub/databases/wormbase/releases"
agr_human_go_associations: "http://download.alliancegenome.org/3.2.0/GAF/HUMAN/GAF_HUMAN_1.gaf"
agr_go_ontology: "http://download.alliancegenome.org/3.2.0/ONTOLOGY/GO/ONTOLOGY_GO_2.obo"
Expand Down Expand Up @@ -114,6 +114,23 @@ go_sentences_options:
- "GO:0040014"
- "GO:0040015"
- "GO:0040018"
+ - "GO:0048522"
+ - "GO:0048523"
+ - "GO:0050794"
+ - "GO:0050789"
+ - "GO:0048519"
+ - "GO:0048518"
+ - "GO:0044057"
+ - "GO:0042221"
+ - "GO:0050795"
+ - "GO:0048856"
+ - "GO:0044237"
+ - "GO:0031323"
+ - "GO:0065009"
+ - "GO:0044092"
+ - "GO:0044093"
+ - "GO:0065008"
+
remap_terms:
"GO:0018996": "GO:0042303"
"GO:0007591": "GO:0042303"
Expand Down Expand Up @@ -511,9 +528,9 @@ go_sentences_options:
trimming_algorithm: ic
max_num_terms: 3
trim_min_distance_from_root:
- F: 3
- P: 5
- C: 5
+ F: 2
+ P: 2
+ C: 2
add_multiple_if_covers_more_children: false
remove_overlapped_terms: true
slim_url: http://current.geneontology.org/ontology/subsets/goslim_generic.obo
Expand Down
2 changes: 1 addition & 1 deletion wormbase/wb_data_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def __init__(self, config: GenedescConfigParser, species: str, go_relations: Lis
"disease_associations.by_orthology." + release_version +
".tsv.txt")
self.do_associations_url = raw_files_source + '/' + release_version + \
- '/ONTOLOGY/disease_associations.by_orthology.' + release_version + '.tsv.txt'
+ '/ONTOLOGY/disease_association.by_orthology.' + release_version + '.tsv.txt'
self.do_associations_new_cache_path = os.path.join(cache_location, "wormbase", release_version, 'ONTOLOGY',
'disease_association.' + release_version + '.daf.txt')
self.do_associations_new_url = raw_files_source + '/' + release_version + '/ONTOLOGY/disease_association.' + \
Expand Down
2 changes: 1 addition & 1 deletion wormbase/wormbase_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ def main():
if "json" in args.output_formats:
logger.info("Writing descriptions to json")
desc_writer.write_json(os.path.join(conf_parser.get_out_dir(), date_prefix + "_" + organism + ".json"),
- pretty=True, include_single_gene_stats=True, data_manager=dm)
+ include_single_gene_stats=True, data_manager=dm)
if "txt" in args.output_formats:
logger.info("Writing descriptions to txt")
desc_writer.write_plain_text(os.path.join(conf_parser.get_out_dir(), date_prefix + "_" + organism + ".txt"))
Expand Down

0 comments on commit caaeb1a

Please sign in to comment.