Merge pull request #119 from callahantiff/issue_118
Issue 118
callahantiff authored Oct 14, 2021
2 parents c393c15 + 34183aa commit 5c2f2a5
Showing 9 changed files with 34 additions and 30 deletions.
4 changes: 2 additions & 2 deletions builds/data_preprocessing.py
@@ -1526,11 +1526,11 @@ def _creates_relations_metadata_dict(self) -> Dict:
             [x for x in gets_object_properties(ro_graph) if '/RO_' in str(x)]
         master_synonyms = [x for x in ro_graph if 'synonym' in str(x[1]).lower() and isinstance(x[0], URIRef)]
         for x in tqdm(cls):
-            cls_label = list(ro_graph.objects(x, RDFS.label))
+            cls_label = [x for x in ro_graph.objects(x, RDFS.label) if '@' not in n3(x) or '@en' in n3(x)]
             labels = str(cls_label[0]) if len(cls_label) > 0 else 'None'
             cls_syn = [str(i[2]) for i in master_synonyms if x == i[0]]
             synonym = str(cls_syn[0]) if len(cls_syn) > 0 else 'None'
-            cls_desc = list(ro_graph.objects(x, obo.IAO_0000115))
+            cls_desc = [x for x in ro_graph.objects(x, obo.IAO_0000115) if '@' not in n3(x) or '@en' in n3(x)]
             desc = '|'.join([str(cls_desc[0])]) if len(cls_desc) > 0 else 'None'
             relation_metadata_dict[str(x)] = {'Label': labels, 'Description': desc, 'Synonym': synonym}

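The change repeated throughout this commit adds a language filter: a literal is kept when it carries no language tag at all, or when its tag is English. A minimal, self-contained sketch of the same guard in plain rdflib; the n3 function here is a stand-in for the helper this commit pulls in from pkt_kg.utils (rdflib terms expose equivalent rendering via their .n3() method):

from rdflib import Graph, Literal, URIRef, RDFS

def n3(term):
    # stand-in for the pkt_kg.utils helper: render a term in N3 notation,
    # e.g. '"interacts with"@en' for an English-tagged literal
    return term.n3()

g = Graph()
s = URIRef('http://purl.obolibrary.org/obo/RO_0002434')
g.add((s, RDFS.label, Literal('interacts with', lang='en')))   # kept: tagged @en
g.add((s, RDFS.label, Literal('interagit avec', lang='fr')))   # dropped: tagged @fr
g.add((s, RDFS.label, Literal('interacts_with')))              # kept: no tag

labels = [x for x in g.objects(s, RDFS.label)
          if '@' not in n3(x) or '@en' in n3(x)]
print(sorted(str(x) for x in labels))  # ['interacts with', 'interacts_with']

One caveat of the substring test: a plain literal whose text itself contains '@' (say, an email address) would be treated as language-tagged and dropped; for ontology labels and definitions that case is rare.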
2 changes: 1 addition & 1 deletion builds/data_to_download.txt
@@ -13,7 +13,7 @@ http://purl.obolibrary.org/obo/ro.owl

 # linked open data
 # human transcript, gene, and protein identifier mapping
-hgnc_complete_set.txt, ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt
+hgnc_complete_set.txt, http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt
 Homo_sapiens.GRCh38.102.gtf, ftp://ftp.ensembl.org/pub/release-102/gtf/homo_sapiens/Homo_sapiens.GRCh38.102.gtf.gz
 Homo_sapiens.GRCh38.102.uniprot.tsv, ftp://ftp.ensembl.org/pub/release-102/tsv/homo_sapiens/Homo_sapiens.GRCh38.102.uniprot.tsv.gz
 Homo_sapiens.GRCh38.102.entrez.tsv, ftp://ftp.ensembl.org/pub/release-102/tsv/homo_sapiens/Homo_sapiens.GRCh38.102.entrez.tsv.gz
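For context, each non-comment line in data_to_download.txt pairs a target filename with its source URL, separated by a comma. A hedged sketch of how such a line could be split; the comma parsing shown is illustrative, not necessarily pkt_kg's actual loader:

line = 'hgnc_complete_set.txt, http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt'

if not line.startswith('#') and line.strip():   # '#' lines are comments
    filename, url = (part.strip() for part in line.split(',', 1))
    print(filename)  # hgnc_complete_set.txt
    print(url)       # http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt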
6 changes: 3 additions & 3 deletions notebooks/Data_Preparation.ipynb
@@ -988,7 +988,7 @@
    "outputs": [],
    "source": [
     "# download data\n",
-    "url = 'ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt'\n",
+    "url = 'http://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt'\n",
     "if not os.path.exists(unprocessed_data_location + 'hgnc_complete_set.txt'):\n",
     "    data_downloader(url, unprocessed_data_location)\n",
     "\n",
@@ -4278,13 +4278,13 @@
     "\n",
     "for x in tqdm(cls):\n",
     "    # labels\n",
-    "    cls_label = list(ro_graph.objects(x, RDFS.label))\n",
+    "    cls_label = [x for x in ro_graph.objects(x, RDFS.label) if '@' not in n3(x) or '@en' in n3(x)]\n",
     "    labels = str(cls_label[0]) if len(cls_label) > 0 else 'None'\n",
     "    # synonyms\n",
     "    cls_syn = [str(i[2]) for i in master_synonyms if x == i[0]]\n",
     "    synonym = str(cls_syn[0]) if len(cls_syn) > 0 else 'None'\n",
     "    # description\n",
-    "    cls_desc = list(ro_graph.objects(x, obo.IAO_0000115))\n",
+    "    cls_desc = [x for x in ro_graph.objects(x, obo.IAO_0000115) if '@' not in n3(x) or '@en' in n3(x)]\n",
     "    desc = '|'.join([str(cls_desc[0])]) if len(cls_desc) > 0 else 'None'\n",
     "    \n",
     "    relation_metadata_dict[str(x)] = {\n",
4 changes: 2 additions & 2 deletions notebooks/OWLNETS_Example_Application.ipynb
@@ -321,7 +321,7 @@
    "outputs": [],
    "source": [
     "ont_classes = pkt.utils.gets_ontology_classes(graph)\n",
-    "ont_labels = {str(x[0]): str(x[2]) for x in list(graph.triples((None, RDFS.label, None)))}\n",
+    "ont_labels = {str(x[0]): str(x[2]) for x in list(graph.triples((None, RDFS.label, None))) if '@' not in pkt.utils.n3(x[2]) or '@en' in pkt.utils.n3(x[2])}\n",
     "ont_synonyms = pkt.utils.gets_ontology_class_synonyms(graph)\n",
     "ont_dbxrefs = pkt.utils.gets_ontology_class_dbxrefs(graph)\n",
     "ont_defs = pkt.utils.gets_ontology_definitions(graph)"
@@ -377,7 +377,7 @@
    "source": [
     "for obj in tqdm(ont_objects):\n",
     "    # get object label\n",
-    "    label_hits = list(graph.objects(obj, RDFS.label))\n",
+    "    label_hits = [x for x in graph.objects(obj, RDFS.label) if '@' not in pkt.utils.n3(x) or '@en' in pkt.utils.n3(x)]\n",
     "    label = str(label_hits[0]) if len(label_hits) > 0 else 'None'\n",
     "    \n",
     "    # get object namespace\n",
2 changes: 1 addition & 1 deletion pkt_kg/__version__.py
@@ -1,2 +1,2 @@
 """Current version of package pkt_kg"""
-__version__ = "3.0.1"
+__version__ = "3.0.2"
8 changes: 5 additions & 3 deletions pkt_kg/metadata.py
@@ -127,9 +127,11 @@ def extract_metadata(self, graph: Graph) -> None:
         for key, entities in domains:
             temp_dict = dict()
             for i in tqdm(entities):
-                labels = [x for x in list(graph.triples((i, RDFS.label, None)))]
-                descriptions = [x for x in list(graph.triples((i, obo.IAO_0000115, None)))]
-                synonyms = [x for x in list(graph.triples((i, None, None))) if 'synonym' in str(x[1]).lower()]
+                labels = [x for x in graph.triples((i, RDFS.label, None))
+                          if '@' not in n3(x[2]) or '@en' in n3(x[2])]
+                descriptions = [x for x in graph.triples((i, obo.IAO_0000115, None))
+                                if '@' not in n3(x[2]) or '@en' in n3(x[2])]
+                synonyms = [x for x in graph.triples((i, None, None)) if 'synonym' in str(x[1]).lower()]
                 if len(labels) != 0:
                     temp_dict[str(i)] = {
                         'Label': str(labels[0][2]) if len(labels) > 0 else None,
4 changes: 2 additions & 2 deletions pkt_kg/utils/data_utils.py
@@ -197,10 +197,10 @@ def data_downloader(url: str, write_location: str, filename: str = '') -> None:
     file = re.sub(zip_pat, '', filename) if filename != '' else re.sub(zip_pat, '', url.split('/')[-1])
     if '.zip' in url: zipped_url_download(url, write_location, file)
     elif '.gz' in url or '.gz' in filename:
-        if 'ftp' in url: gzipped_ftp_url_download(url, write_location, file)
+        if url.startswith('ftp'): gzipped_ftp_url_download(url, write_location, file)
         else: gzipped_url_download(url, write_location, file)
     else:
-        if 'ftp' in url: ftp_url_download(url, write_location, file)
+        if url.startswith('ftp'): ftp_url_download(url, write_location, file)
         else: url_download(url, write_location, file)

     return None
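The dispatch fix above matters because the old substring check misrouted HTTP URLs whose hostname merely contains 'ftp', such as the updated HGNC link. A quick illustration of the difference between the two checks:

http_url = 'http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt'
ftp_url = 'ftp://ftp.ensembl.org/pub/release-102/gtf/homo_sapiens/Homo_sapiens.GRCh38.102.gtf.gz'

# old check: the substring match fires on the hostname, so this HTTP file
# was wrongly handed to the FTP downloader
print('ftp' in http_url)             # True

# new check: only a genuine ftp:// scheme selects the FTP code path
print(http_url.startswith('ftp'))    # False
print(ftp_url.startswith('ftp'))     # True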
4 changes: 3 additions & 1 deletion pkt_kg/utils/kg_utils.py
@@ -53,6 +53,7 @@

 from tqdm import tqdm  # type: ignore
 from typing import Dict, List, Optional, Set, Tuple, Union
+from pkt_kg.utils import *

 # set-up environment variables
 obo = Namespace('http://purl.obolibrary.org/obo/')
@@ -94,7 +95,8 @@ def gets_ontology_definitions(graph: Graph) -> Dict:
         ...}
     """

-    obj_defs = {x[0]: x[2] for x in graph.triples((None, obo.IAO_0000115, None))}
+    obj_defs = {x[0]: x[2] for x in graph.triples((None, obo.IAO_0000115, None))
+                if '@' not in n3(x[2]) or '@en' in n3(x[2])}

     return obj_defs

30 changes: 15 additions & 15 deletions tests/test_data_utils_downloading.py
@@ -40,7 +40,7 @@ def setUp(self):

         # set some urls
         self.url = 'https://proconsortium.org/download/current/promapping.txt'
-        self.ftp_url = 'ftp://ftp.ebi.ac.uk/pub/databases/genenames/new/tsv/hgnc_complete_set.txt'
+        self.ftp_url = 'http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt'
         self.gzipped_ftp_url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz'
         self.zipped_url = 'https://reactome.org/download/current/ReactomePathways.gmt.zip'
         self.gzipped_url = 'https://www.disgenet.org/static/disgenet_ap1/files/downloads/disease_mappings.tsv.gz'
@@ -259,23 +259,23 @@ def test_data_downloader(self):
         data_downloader(self.url, self.write_location)
         self.assertTrue(os.path.exists(self.write_location + self.url.split('/')[-1]))

-        # # ftp url data
+        # ftp url data
         data_downloader(self.ftp_url, self.write_location)
         self.assertTrue(os.path.exists(self.write_location + self.ftp_url.split('/')[-1]))

-        # gzipped ftp url data
-        file = self.gzipped_ftp_url.replace('ftp://', '').split('/')[-1]
-        write_loc = self.write_location + '{filename}'.format(filename=file)
-        data_downloader(self.gzipped_ftp_url, self.write_location)
-        self.assertTrue(os.path.exists(os.path.exists(write_loc[:-3])))
-
-        # zipped data
-        data_downloader(self.zipped_url, self.write_location)
-        self.assertTrue(os.path.exists(self.write_location + self.zipped_url.split('/')[-1][:-4]))
-
-        # gzipped data
-        data_downloader(self.gzipped_url, self.write_location)
-        self.assertTrue(os.path.exists(self.write_location + self.gzipped_url.split('/')[-1][:-3]))
+        # # gzipped ftp url data
+        # file = self.gzipped_ftp_url.replace('ftp://', '').split('/')[-1]
+        # write_loc = self.write_location + '{filename}'.format(filename=file)
+        # data_downloader(self.gzipped_ftp_url, self.write_location)
+        # self.assertTrue(os.path.exists(os.path.exists(write_loc[:-3])))
+        #
+        # # zipped data
+        # data_downloader(self.zipped_url, self.write_location)
+        # self.assertTrue(os.path.exists(self.write_location + self.zipped_url.split('/')[-1][:-4]))
+        #
+        # # gzipped data
+        # data_downloader(self.gzipped_url, self.write_location)
+        # self.assertTrue(os.path.exists(self.write_location + self.gzipped_url.split('/')[-1][:-3]))

         return None

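With the zipped and gzipped download cases commented out above, the routing logic can still be exercised offline. The sketch below is not part of this commit: it patches the downloader helpers with unittest.mock to assert the dispatch that the startswith fix establishes, and it assumes those helpers resolve as names inside pkt_kg.utils.data_utils (the module this diff edits):

import unittest
from unittest import mock

from pkt_kg.utils.data_utils import data_downloader


class TestDownloaderDispatch(unittest.TestCase):

    @mock.patch('pkt_kg.utils.data_utils.url_download')
    @mock.patch('pkt_kg.utils.data_utils.ftp_url_download')
    def test_http_host_named_ftp(self, ftp_dl, plain_dl):
        # hostname contains 'ftp' but the scheme is http, so after the fix
        # the plain url_download branch should run
        url = 'http://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/tsv/hgnc_complete_set.txt'
        data_downloader(url, '/tmp/')
        plain_dl.assert_called_once()
        ftp_dl.assert_not_called()

    @mock.patch('pkt_kg.utils.data_utils.gzipped_url_download')
    @mock.patch('pkt_kg.utils.data_utils.gzipped_ftp_url_download')
    def test_gzipped_ftp_scheme(self, gz_ftp_dl, gz_plain_dl):
        # a real ftp:// scheme with a .gz suffix should take the gzipped FTP path
        url = 'ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz'
        data_downloader(url, '/tmp/')
        gz_ftp_dl.assert_called_once()
        gz_plain_dl.assert_not_called()


if __name__ == '__main__':
    unittest.main()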
