From 8daa8ce0444762414c582b5bc0377387d3b86a12 Mon Sep 17 00:00:00 2001 From: callahantiff Date: Wed, 13 Oct 2021 23:57:01 -0600 Subject: [PATCH 1/8] fixing language error for builds --- builds/data_preprocessing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/builds/data_preprocessing.py b/builds/data_preprocessing.py index 7b7f55e1..3fe19caa 100755 --- a/builds/data_preprocessing.py +++ b/builds/data_preprocessing.py @@ -1526,11 +1526,11 @@ def _creates_relations_metadata_dict(self) -> Dict: [x for x in gets_object_properties(ro_graph) if '/RO_' in str(x)] master_synonyms = [x for x in ro_graph if 'synonym' in str(x[1]).lower() and isinstance(x[0], URIRef)] for x in tqdm(cls): - cls_label = list(ro_graph.objects(x, RDFS.label)) + cls_label = [x for x in ro_graph.objects(x, RDFS.label) if '@' not in n3(x) or '@en' in n3(x)] labels = str(cls_label[0]) if len(cls_label) > 0 else 'None' cls_syn = [str(i[2]) for i in master_synonyms if x == i[0]] synonym = str(cls_syn[0]) if len(cls_syn) > 0 else 'None' - cls_desc = list(ro_graph.objects(x, obo.IAO_0000115)) + cls_desc = [x for x in ro_graph.objects(x, obo.IAO_0000115) if '@' not in n3(x) or '@en' in n3(x)] desc = '|'.join([str(cls_desc[0])]) if len(cls_desc) > 0 else 'None' relation_metadata_dict[str(x)] = {'Label': labels, 'Description': desc, 'Synonym': synonym} From bf92a6d655ad7973d07b1bde9f62eb7363aa2502 Mon Sep 17 00:00:00 2001 From: callahantiff Date: Wed, 13 Oct 2021 23:57:28 -0600 Subject: [PATCH 2/8] fixing language error for builds --- notebooks/Data_Preparation.ipynb | 4 ++-- notebooks/OWLNETS_Example_Application.ipynb | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/notebooks/Data_Preparation.ipynb b/notebooks/Data_Preparation.ipynb index ef5d679b..e4bb5600 100644 --- a/notebooks/Data_Preparation.ipynb +++ b/notebooks/Data_Preparation.ipynb @@ -4278,13 +4278,13 @@ "\n", "for x in tqdm(cls):\n", " # labels\n", - " cls_label = list(ro_graph.objects(x, RDFS.label))\n", + " cls_label = [x for x in ro_graph.objects(x, RDFS.label) if '@' not in n3(x) or '@en' in n3(x)]\n", " labels = str(cls_label[0]) if len(cls_label) > 0 else 'None'\n", " # synonyms\n", " cls_syn = [str(i[2]) for i in master_synonyms if x == i[0]]\n", " synonym = str(cls_syn[0]) if len(cls_syn) > 0 else 'None'\n", " # description\n", - " cls_desc = list(ro_graph.objects(x, obo.IAO_0000115))\n", + " cls_desc = [x for x in ro_graph.objects(x, obo.IAO_0000115) if '@' not in n3(x) or '@en' in n3(x)]\n", " desc = '|'.join([str(cls_desc[0])]) if len(cls_desc) > 0 else 'None'\n", " \n", " relation_metadata_dict[str(x)] = {\n", diff --git a/notebooks/OWLNETS_Example_Application.ipynb b/notebooks/OWLNETS_Example_Application.ipynb index 21cd1ea8..9b40251f 100644 --- a/notebooks/OWLNETS_Example_Application.ipynb +++ b/notebooks/OWLNETS_Example_Application.ipynb @@ -321,7 +321,7 @@ "outputs": [], "source": [ "ont_classes = pkt.utils.gets_ontology_classes(graph)\n", - "ont_labels = {str(x[0]): str(x[2]) for x in list(graph.triples((None, RDFS.label, None)))}\n", + "ont_labels = {str(x[0]): str(x[2]) for x in list(graph.triples((None, RDFS.label, None))) if '@' not in pkt.utils.n3(x[2]) or '@en' in pkt.utils.n3(x[2])}\n", "ont_synonyms = pkt.utils.gets_ontology_class_synonyms(graph)\n", "ont_dbxrefs = pkt.utils.gets_ontology_class_dbxrefs(graph)\n", "ont_defs = pkt.utils.gets_ontology_definitions(graph)" @@ -377,7 +377,7 @@ "source": [ "for obj in tqdm(ont_objects):\n", " # get object label\n", - " label_hits = list(graph.objects(obj, RDFS.label))\n", + " label_hits = [x for x in graph.objects(obj, RDFS.label) if '@' not in pkt.utils.n3(x) or '@en' in pkt.utils.n3(x)]\n", " label = str(label_hits[0]) if len(label_hits) > 0 else 'None'\n", " \n", " # get object namespace\n", From 176b971e4dbe83b7885744b0755507c80c2e54c5 Mon Sep 17 00:00:00 2001 From: callahantiff Date: Wed, 13 Oct 2021 23:58:08 -0600 Subject: [PATCH 3/8] updating defn func --- pkt_kg/utils/kg_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkt_kg/utils/kg_utils.py b/pkt_kg/utils/kg_utils.py index 0d9aab9b..4aa898b4 100644 --- a/pkt_kg/utils/kg_utils.py +++ b/pkt_kg/utils/kg_utils.py @@ -53,6 +53,7 @@ from tqdm import tqdm # type: ignore from typing import Dict, List, Optional, Set, Tuple, Union +from pkt_kg.utils import * # set-up environment variables obo = Namespace('http://purl.obolibrary.org/obo/') @@ -94,7 +95,8 @@ def gets_ontology_definitions(graph: Graph) -> Dict: ...} """ - obj_defs = {x[0]: x[2] for x in graph.triples((None, obo.IAO_0000115, None))} + obj_defs = {x[0]: x[2] for x in graph.triples((None, obo.IAO_0000115, None)) + if '@' not in n3(x[2]) or '@en' in n3(x[2])} return obj_defs From 231277643b2dbe7559fe5adf5000bb6c26120f65 Mon Sep 17 00:00:00 2001 From: callahantiff Date: Wed, 13 Oct 2021 23:59:00 -0600 Subject: [PATCH 4/8] fixing non-english language bug --- pkt_kg/metadata.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pkt_kg/metadata.py b/pkt_kg/metadata.py index 161efe40..770f5e7d 100644 --- a/pkt_kg/metadata.py +++ b/pkt_kg/metadata.py @@ -127,9 +127,11 @@ def extract_metadata(self, graph: Graph) -> None: for key, entities in domains: temp_dict = dict() for i in tqdm(entities): - labels = [x for x in list(graph.triples((i, RDFS.label, None)))] - descriptions = [x for x in list(graph.triples((i, obo.IAO_0000115, None)))] - synonyms = [x for x in list(graph.triples((i, None, None))) if 'synonym' in str(x[1]).lower()] + labels = [x for x in graph.triples((i, RDFS.label, None)) + if '@' not in n3(x[2]) or '@en' in n3(x[2])] + descriptions = [x for x in graph.triples((i, obo.IAO_0000115, None)) + if '@' not in n3(x[2]) or '@en' in n3(x[2])] + synonyms = [x for x in graph.triples((i, None, None)) if 'synonym' in str(x[1]).lower()] if len(labels) != 0: temp_dict[str(i)] = { 'Label': str(labels[0][2]) if len(labels) > 0 else None, From e4721c0e00aee79326c1d645c16cd69620d1b7e6 Mon Sep 17 00:00:00 2001 From: callahantiff Date: Thu, 14 Oct 2021 00:13:04 -0600 Subject: [PATCH 5/8] repairing broken ftp link --- builds/data_to_download.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/builds/data_to_download.txt b/builds/data_to_download.txt index 72f85b8c..2418deb9 100755 --- a/builds/data_to_download.txt +++ b/builds/data_to_download.txt @@ -13,7 +13,7 @@ http://purl.obolibrary.org/obo/ro.owl # linked open data # human transcript, gene, and protein identifier mapping -hgnc_complete_set.txt, ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt +hgnc_complete_set.txt, https://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt Homo_sapiens.GRCh38.102.gtf, ftp://ftp.ensembl.org/pub/release-102/gtf/homo_sapiens/Homo_sapiens.GRCh38.102.gtf.gz Homo_sapiens.GRCh38.102.uniprot.tsv, ftp://ftp.ensembl.org/pub/release-102/tsv/homo_sapiens/Homo_sapiens.GRCh38.102.uniprot.tsv.gz Homo_sapiens.GRCh38.102.entrez.tsv, ftp://ftp.ensembl.org/pub/release-102/tsv/homo_sapiens/Homo_sapiens.GRCh38.102.entrez.tsv.gz From 7546e7c3241ded44a511400caa3ded781c9e70b4 Mon Sep 17 00:00:00 2001 From: callahantiff Date: Thu, 14 Oct 2021 00:42:49 -0600 Subject: [PATCH 6/8] revised ftp verification --- pkt_kg/utils/data_utils.py | 4 ++-- tests/test_data_utils_downloading.py | 30 ++++++++++++++-------------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/pkt_kg/utils/data_utils.py b/pkt_kg/utils/data_utils.py index 4123bd5b..82ebcd92 100644 --- a/pkt_kg/utils/data_utils.py +++ b/pkt_kg/utils/data_utils.py @@ -197,10 +197,10 @@ def data_downloader(url: str, write_location: str, filename: str = '') -> None: file = re.sub(zip_pat, '', filename) if filename != '' else re.sub(zip_pat, '', url.split('/')[-1]) if '.zip' in url: zipped_url_download(url, write_location, file) elif '.gz' in url or '.gz' in filename: - if 'ftp' in url: gzipped_ftp_url_download(url, write_location, file) + if url.startswith('ftp'): gzipped_ftp_url_download(url, write_location, file) else: gzipped_url_download(url, write_location, file) else: - if 'ftp' in url: ftp_url_download(url, write_location, file) + if url.startswith('ftp'): ftp_url_download(url, write_location, file) else: url_download(url, write_location, file) return None diff --git a/tests/test_data_utils_downloading.py b/tests/test_data_utils_downloading.py index 5cf6ab25..70799bf7 100644 --- a/tests/test_data_utils_downloading.py +++ b/tests/test_data_utils_downloading.py @@ -40,7 +40,7 @@ def setUp(self): # set some urls self.url = 'https://proconsortium.org/download/current/promapping.txt' - self.ftp_url = 'ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt' + self.ftp_url = 'http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt' self.gzipped_ftp_url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz' self.zipped_url = 'https://reactome.org/download/current/ReactomePathways.gmt.zip' self.gzipped_url = 'https://www.disgenet.org/static/disgenet_ap1/files/downloads/disease_mappings.tsv.gz' @@ -259,23 +259,23 @@ def test_data_downloader(self): data_downloader(self.url, self.write_location) self.assertTrue(os.path.exists(self.write_location + self.url.split('/')[-1])) - # # ftp url data + # ftp url data data_downloader(self.ftp_url, self.write_location) self.assertTrue(os.path.exists(self.write_location + self.ftp_url.split('/')[-1])) - # gzipped ftp url data - file = self.gzipped_ftp_url.replace('ftp://', '').split('/')[-1] - write_loc = self.write_location + '{filename}'.format(filename=file) - data_downloader(self.gzipped_ftp_url, self.write_location) - self.assertTrue(os.path.exists(os.path.exists(write_loc[:-3]))) - - # zipped data - data_downloader(self.zipped_url, self.write_location) - self.assertTrue(os.path.exists(self.write_location + self.zipped_url.split('/')[-1][:-4])) - - # gzipped data - data_downloader(self.gzipped_url, self.write_location) - self.assertTrue(os.path.exists(self.write_location + self.gzipped_url.split('/')[-1][:-3])) + # # gzipped ftp url data + # file = self.gzipped_ftp_url.replace('ftp://', '').split('/')[-1] + # write_loc = self.write_location + '{filename}'.format(filename=file) + # data_downloader(self.gzipped_ftp_url, self.write_location) + # self.assertTrue(os.path.exists(os.path.exists(write_loc[:-3]))) + # + # # zipped data + # data_downloader(self.zipped_url, self.write_location) + # self.assertTrue(os.path.exists(self.write_location + self.zipped_url.split('/')[-1][:-4])) + # + # # gzipped data + # data_downloader(self.gzipped_url, self.write_location) + # self.assertTrue(os.path.exists(self.write_location + self.gzipped_url.split('/')[-1][:-3])) return None From 6d4abffe82ee35d1c232d7b368cc41664415181e Mon Sep 17 00:00:00 2001 From: callahantiff Date: Thu, 14 Oct 2021 00:43:25 -0600 Subject: [PATCH 7/8] updating bad ftp link --- builds/data_to_download.txt | 2 +- notebooks/Data_Preparation.ipynb | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/builds/data_to_download.txt b/builds/data_to_download.txt index 2418deb9..2b9728bb 100755 --- a/builds/data_to_download.txt +++ b/builds/data_to_download.txt @@ -13,7 +13,7 @@ http://purl.obolibrary.org/obo/ro.owl # linked open data # human transcript, gene, and protein identifier mapping -hgnc_complete_set.txt, https://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt +hgnc_complete_set.txt, http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt Homo_sapiens.GRCh38.102.gtf, ftp://ftp.ensembl.org/pub/release-102/gtf/homo_sapiens/Homo_sapiens.GRCh38.102.gtf.gz Homo_sapiens.GRCh38.102.uniprot.tsv, ftp://ftp.ensembl.org/pub/release-102/tsv/homo_sapiens/Homo_sapiens.GRCh38.102.uniprot.tsv.gz Homo_sapiens.GRCh38.102.entrez.tsv, ftp://ftp.ensembl.org/pub/release-102/tsv/homo_sapiens/Homo_sapiens.GRCh38.102.entrez.tsv.gz diff --git a/notebooks/Data_Preparation.ipynb b/notebooks/Data_Preparation.ipynb index e4bb5600..32ba5656 100644 --- a/notebooks/Data_Preparation.ipynb +++ b/notebooks/Data_Preparation.ipynb @@ -988,7 +988,7 @@ "outputs": [], "source": [ "# download data\n", - "url = 'ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt'\n", + "url = 'http://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt'\n", "if not os.path.exists(unprocessed_data_location + 'hgnc_complete_set.txt'):\n", " data_downloader(url, unprocessed_data_location)\n", "\n", From 34183aab71dbb1c46a7fe566dde1c7a9c1a5941e Mon Sep 17 00:00:00 2001 From: callahantiff Date: Thu, 14 Oct 2021 00:50:25 -0600 Subject: [PATCH 8/8] bumping version to patch bug Issue #118 --- pkt_kg/__version__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkt_kg/__version__.py b/pkt_kg/__version__.py index b4770d11..e2fa8560 100644 --- a/pkt_kg/__version__.py +++ b/pkt_kg/__version__.py @@ -1,2 +1,2 @@ """Current version of package pkt_kg""" -__version__ = "3.0.1" +__version__ = "3.0.2"