From e0749b4ee3c8bf61aaf66c8b614b20a18df39c52 Mon Sep 17 00:00:00 2001 From: Adriano Rutz Date: Mon, 13 May 2024 12:38:34 +0200 Subject: [PATCH] WIP #50 --- update/config.py | 99 +++++++++++++++++++ update/queries/structures_ids_cas.rq | 2 +- update/queries/structures_ids_drugbank.rq | 6 ++ .../queries/structures_ids_dsstox_compound.rq | 6 ++ .../structures_ids_dsstox_substance.rq | 6 ++ update/queries/structures_ids_ec.rq | 6 ++ update/queries/structures_ids_echa.rq | 6 ++ update/queries/structures_ids_lipidmaps.rq | 6 ++ update/queries/structures_ids_nsc.rq | 6 ++ .../structures_ids_probes_and_drugs.rq | 6 ++ update/queries/structures_ids_swisslipids.rq | 6 ++ update/queries/structures_ids_unichem.rq | 6 ++ update/queries/structures_ids_unii.rq | 6 ++ update/queries/urls_formatters.rq | 13 ++- 14 files changed, 178 insertions(+), 2 deletions(-) create mode 100644 update/queries/structures_ids_drugbank.rq create mode 100644 update/queries/structures_ids_dsstox_compound.rq create mode 100644 update/queries/structures_ids_dsstox_substance.rq create mode 100644 update/queries/structures_ids_ec.rq create mode 100644 update/queries/structures_ids_echa.rq create mode 100644 update/queries/structures_ids_lipidmaps.rq create mode 100644 update/queries/structures_ids_nsc.rq create mode 100644 update/queries/structures_ids_probes_and_drugs.rq create mode 100644 update/queries/structures_ids_swisslipids.rq create mode 100644 update/queries/structures_ids_unichem.rq create mode 100644 update/queries/structures_ids_unii.rq diff --git a/update/config.py b/update/config.py index 8404042..02b2acd 100644 --- a/update/config.py +++ b/update/config.py @@ -137,6 +137,51 @@ "output_file": "structures_ids_csd.csv", }, ), + Task( + name="structures_ids_drugbank", + f=download_query_as_csv.run, + group=DownloadGroup, + params={ + "query_file": "update/queries/structures_ids_drugbank.rq", + "output_file": "structures_ids_drugbank.csv", + }, + ), + Task( + 
name="structures_ids_dsstox_compound", + f=download_query_as_csv.run, + group=DownloadGroup, + params={ + "query_file": "update/queries/structures_ids_dsstox_compound.rq", + "output_file": "structures_ids_dsstox_compound.csv", + }, + ), + Task( + name="structures_ids_dsstox_substance", + f=download_query_as_csv.run, + group=DownloadGroup, + params={ + "query_file": "update/queries/structures_ids_dsstox_substance.rq", + "output_file": "structures_ids_dsstox_substance.csv", + }, + ), + Task( + name="structures_ids_ec", + f=download_query_as_csv.run, + group=DownloadGroup, + params={ + "query_file": "update/queries/structures_ids_ec.rq", + "output_file": "structures_ids_ec.csv", + }, + ), + Task( + name="structures_ids_echa", + f=download_query_as_csv.run, + group=DownloadGroup, + params={ + "query_file": "update/queries/structures_ids_echa.rq", + "output_file": "structures_ids_echa.csv", + }, + ), Task( name="structures_ids_hmdb", f=download_query_as_csv.run, @@ -164,6 +209,15 @@ "output_file": "structures_ids_knapsack.csv", }, ), + Task( + name="structures_ids_lipidmaps", + f=download_query_as_csv.run, + group=DownloadGroup, + params={ + "query_file": "update/queries/structures_ids_lipidmaps.rq", + "output_file": "structures_ids_lipidmaps.csv", + }, + ), Task( name="structures_ids_massbank", f=download_query_as_csv.run, @@ -191,6 +245,15 @@ "output_file": "structures_ids_npatlas.csv", }, ), + Task( + name="structures_ids_nsc", + f=download_query_as_csv.run, + group=DownloadGroup, + params={ + "query_file": "update/queries/structures_ids_nsc.rq", + "output_file": "structures_ids_nsc.csv", + }, + ), Task( name="structures_ids_pdb_ligand", f=download_query_as_csv.run, @@ -209,6 +272,15 @@ "output_file": "structures_ids_pdb_structure.csv", }, ), + Task( + name="structures_ids_probes_and_drugs", + f=download_query_as_csv.run, + group=DownloadGroup, + params={ + "query_file": "update/queries/structures_ids_probes_and_drugs.rq", + "output_file": 
"structures_ids_probes_and_drugs.csv", + }, + ), Task( name="structures_ids_pubchem", f=download_query_as_csv.run, @@ -236,6 +308,33 @@ "output_file": "structures_ids_surechembl.csv", }, ), + Task( + name="structures_ids_swisslipids", + f=download_query_as_csv.run, + group=DownloadGroup, + params={ + "query_file": "update/queries/structures_ids_swisslipids.rq", + "output_file": "structures_ids_swisslipids.csv", + }, + ), + Task( + name="structures_ids_unichem", + f=download_query_as_csv.run, + group=DownloadGroup, + params={ + "query_file": "update/queries/structures_ids_unichem.rq", + "output_file": "structures_ids_unichem.csv", + }, + ), + Task( + name="structures_ids_unii", + f=download_query_as_csv.run, + group=DownloadGroup, + params={ + "query_file": "update/queries/structures_ids_unii.rq", + "output_file": "structures_ids_unii.csv", + }, + ), Task( name="structures_ids_zinc", f=download_query_as_csv.run, diff --git a/update/queries/structures_ids_cas.rq b/update/queries/structures_ids_cas.rq index 1f59277..88b933a 100644 --- a/update/queries/structures_ids_cas.rq +++ b/update/queries/structures_ids_cas.rq @@ -2,6 +2,6 @@ PREFIX hint: <http://www.bigdata.com/queryHints#> PREFIX wdt: <http://www.wikidata.org/prop/direct/> SELECT ?structure ?structure_id_cas WHERE { - ?structure wdt:P231 ?structure_id_cas. hint:Prior hint:rangeSafe TRUE. # int + ?structure wdt:P231 ?structure_id_cas. hint:Prior hint:rangeSafe TRUE. # str } # LIMIT 2000000 diff --git a/update/queries/structures_ids_drugbank.rq b/update/queries/structures_ids_drugbank.rq new file mode 100644 index 0000000..5a661fb --- /dev/null +++ b/update/queries/structures_ids_drugbank.rq @@ -0,0 +1,6 @@ +PREFIX hint: <http://www.bigdata.com/queryHints#> +PREFIX wdt: <http://www.wikidata.org/prop/direct/> + +SELECT ?structure ?structure_id_drugbank WHERE { + ?structure wdt:P715 ?structure_id_drugbank. hint:Prior hint:rangeSafe TRUE. 
# str +} diff --git a/update/queries/structures_ids_dsstox_compound.rq b/update/queries/structures_ids_dsstox_compound.rq new file mode 100644 index 0000000..932a79d --- /dev/null +++ b/update/queries/structures_ids_dsstox_compound.rq @@ -0,0 +1,6 @@ +PREFIX hint: <http://www.bigdata.com/queryHints#> +PREFIX wdt: <http://www.wikidata.org/prop/direct/> + +SELECT ?structure ?structure_id_dsstox_compound WHERE { + ?structure wdt:P8494 ?structure_id_dsstox_compound. hint:Prior hint:rangeSafe TRUE. # str +} diff --git a/update/queries/structures_ids_dsstox_substance.rq b/update/queries/structures_ids_dsstox_substance.rq new file mode 100644 index 0000000..b45c204 --- /dev/null +++ b/update/queries/structures_ids_dsstox_substance.rq @@ -0,0 +1,6 @@ +PREFIX hint: <http://www.bigdata.com/queryHints#> +PREFIX wdt: <http://www.wikidata.org/prop/direct/> + +SELECT ?structure ?structure_id_dsstox_substance WHERE { + ?structure wdt:P3117 ?structure_id_dsstox_substance. hint:Prior hint:rangeSafe TRUE. # str +} diff --git a/update/queries/structures_ids_ec.rq b/update/queries/structures_ids_ec.rq new file mode 100644 index 0000000..f57d8e6 --- /dev/null +++ b/update/queries/structures_ids_ec.rq @@ -0,0 +1,6 @@ +PREFIX hint: <http://www.bigdata.com/queryHints#> +PREFIX wdt: <http://www.wikidata.org/prop/direct/> + +SELECT ?structure ?structure_id_ec WHERE { + ?structure wdt:P232 ?structure_id_ec. hint:Prior hint:rangeSafe TRUE. # str +} diff --git a/update/queries/structures_ids_echa.rq b/update/queries/structures_ids_echa.rq new file mode 100644 index 0000000..e7d887b --- /dev/null +++ b/update/queries/structures_ids_echa.rq @@ -0,0 +1,6 @@ +PREFIX hint: <http://www.bigdata.com/queryHints#> +PREFIX wdt: <http://www.wikidata.org/prop/direct/> + +SELECT ?structure ?structure_id_echa WHERE { + ?structure wdt:P2566 ?structure_id_echa. hint:Prior hint:rangeSafe TRUE. # str +} diff --git a/update/queries/structures_ids_lipidmaps.rq b/update/queries/structures_ids_lipidmaps.rq new file mode 100644 index 0000000..431b8d1 --- /dev/null +++ b/update/queries/structures_ids_lipidmaps.rq @@ -0,0 +1,6 @@ +PREFIX hint: <http://www.bigdata.com/queryHints#> +PREFIX wdt: <http://www.wikidata.org/prop/direct/> + +SELECT ?structure ?structure_id_lipidmaps WHERE { + ?structure wdt:P2063 ?structure_id_lipidmaps. hint:Prior hint:rangeSafe TRUE. 
# str +} diff --git a/update/queries/structures_ids_nsc.rq b/update/queries/structures_ids_nsc.rq new file mode 100644 index 0000000..a77f0d1 --- /dev/null +++ b/update/queries/structures_ids_nsc.rq @@ -0,0 +1,6 @@ +PREFIX hint: <http://www.bigdata.com/queryHints#> +PREFIX wdt: <http://www.wikidata.org/prop/direct/> + +SELECT ?structure ?structure_id_nsc WHERE { + ?structure wdt:P2840 ?structure_id_nsc. hint:Prior hint:rangeSafe TRUE. # int +} diff --git a/update/queries/structures_ids_probes_and_drugs.rq b/update/queries/structures_ids_probes_and_drugs.rq new file mode 100644 index 0000000..0e13d88 --- /dev/null +++ b/update/queries/structures_ids_probes_and_drugs.rq @@ -0,0 +1,6 @@ +PREFIX hint: <http://www.bigdata.com/queryHints#> +PREFIX wdt: <http://www.wikidata.org/prop/direct/> + +SELECT ?structure ?structure_id_probes_and_drugs WHERE { + ?structure wdt:P11199 ?structure_id_probes_and_drugs. hint:Prior hint:rangeSafe TRUE. # str +} diff --git a/update/queries/structures_ids_swisslipids.rq b/update/queries/structures_ids_swisslipids.rq new file mode 100644 index 0000000..130bded --- /dev/null +++ b/update/queries/structures_ids_swisslipids.rq @@ -0,0 +1,6 @@ +PREFIX hint: <http://www.bigdata.com/queryHints#> +PREFIX wdt: <http://www.wikidata.org/prop/direct/> + +SELECT ?structure ?structure_id_swisslipids WHERE { + ?structure wdt:P8691 ?structure_id_swisslipids. hint:Prior hint:rangeSafe TRUE. # str +} diff --git a/update/queries/structures_ids_unichem.rq b/update/queries/structures_ids_unichem.rq new file mode 100644 index 0000000..67f10cc --- /dev/null +++ b/update/queries/structures_ids_unichem.rq @@ -0,0 +1,6 @@ +PREFIX hint: <http://www.bigdata.com/queryHints#> +PREFIX wdt: <http://www.wikidata.org/prop/direct/> + +SELECT ?structure ?structure_id_unichem WHERE { + ?structure wdt:P11089 ?structure_id_unichem. hint:Prior hint:rangeSafe TRUE. # int +} diff --git a/update/queries/structures_ids_unii.rq b/update/queries/structures_ids_unii.rq new file mode 100644 index 0000000..862fe93 --- /dev/null +++ b/update/queries/structures_ids_unii.rq @@ -0,0 +1,6 @@ +PREFIX hint: <http://www.bigdata.com/queryHints#> +PREFIX wdt: <http://www.wikidata.org/prop/direct/> + +SELECT ?structure ?structure_id_unii WHERE { + ?structure wdt:P652 ?structure_id_unii. hint:Prior hint:rangeSafe TRUE. 
# str +} diff --git a/update/queries/urls_formatters.rq b/update/queries/urls_formatters.rq index e86c016..c249cc6 100644 --- a/update/queries/urls_formatters.rq +++ b/update/queries/urls_formatters.rq @@ -6,17 +6,20 @@ SELECT * WHERE { # All properties we use with a formatter URL VALUES ?property { wd:P231 # STRUCTURE CAS + wd:P232 # STRUCTURE EC number wd:P233 # STRUCTURE SMILES (canonical) wd:P234 # STRUCTURE InChI wd:P235 # STRUCTURE InChIKey wd:P356 # REFERENCE DOI wd:P592 # STRUCTURE ChEMBL wd:P638 # STRUCTURE PDB structure + wd:P652 # STRUCTURE UNII wd:P661 # STRUCTURE ChemSpider wd:P662 # STRUCTURE PubChem CID - wd:P683 # STRUCTURE ChEBI wd:P665 # STRUCTURE KEGG + wd:P683 # STRUCTURE ChEBI wd:P685 # TAXON NCBI + wd:P715 # STRUCTURE DrugBank wd:P815 # TAXON ITIS wd:P830 # TAXON EOL wd:P846 # TAXON GBIF @@ -25,9 +28,13 @@ SELECT * WHERE { wd:P961 # TAXON IPNI wd:P2017 # STRUCTURE SMILES (isomeric) wd:P2057 # STRUCTURE HMDB + wd:P2063 # STRUCTURE LIPID MAPS wd:P2064 # STRUCTURE KNApSAcK wd:P2084 # STRUCTURE ZINC + wd:P2566 # STRUCTURE ECHA + wd:P2840 # STRUCTURE NSC wd:P2877 # STRUCTURE SureChEMBL + wd:P3117 # STRUCTURE DSSTox substance wd:P3151 # TAXON iNat wd:P3636 # STRUCTURE PDB ligand wd:P4964 # STRUCTURE SPLASH @@ -36,10 +43,14 @@ SELECT * WHERE { wd:P6689 # STRUCTURE MassBank wd:P7715 # TAXON WFO wd:P7746 # STRUCTURE NPAtlas + wd:P8494 # STRUCTURE DSSTox compound wd:P8533 # STRUCTURE SMARTS + wd:P8691 # STRUCTURE SwissLipids wd:P9157 # TAXON OTL wd:P9405 # STRUCTURE NMRShiftDB wd:P10718 # STRUCTURE CXSMILES + wd:P11089 # STRUCTURE UniChem + wd:P11199 # STRUCTURE Probes And Drugs wd:P11375 # STRUCTURE CSD } ?property wdt:P1630 ?formatter. hint:Prior hint:rangeSafe TRUE.