Skip to content

Commit

Permalink
add archived_datasets folder with old mappings
Browse files Browse the repository at this point in the history
  • Loading branch information
vemonet committed Jan 18, 2021
1 parent c2a2a26 commit f440139
Show file tree
Hide file tree
Showing 129 changed files with 28,437 additions and 18 deletions.
42 changes: 42 additions & 0 deletions .github/workflows/sparql-map-uniprot.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Manually triggered workflow (workflow_dispatch) with two steps:
#   1. run the SPARQL mapping queries stored in datasets/uniprot/mapping
#   2. compute and insert HCLS descriptive metadata for the target graph
name: UniProt to BioLink RDF
# TODO: Add step to manage versioning (delete previous graph, load new graph, generate metadata)
on:
  workflow_dispatch:
    inputs:
      # SPARQL endpoint the queries are sent to.
      endpoint:
        description: 'Upload to SPARQL endpoint'
        required: true
        default: 'https://graphdb.dumontierlab.com/repositories/ncats-red-kg/statements'
      # Named graph used as the output of the mapping step.
      graph:
        description: 'In the Graph'
        required: true
        default: 'https://w3id.org/d2s/graph/uniprot'

jobs:
  run-sparql:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2

      # inputvar / outputvar / servicevar presumably fill the <?_input>,
      # <?_output> and <?_service> placeholders of the query files; confirm
      # against the sparql-operations-action documentation.
      - name: Run SPARQL queries to convert UniProt
        uses: vemonet/sparql-operations-action@v1
        with:
          file: datasets/uniprot/mapping
          endpoint: ${{ github.event.inputs.endpoint }}
          user: ${{ secrets.GRAPHDB_USER }}
          password: ${{ secrets.GRAPHDB_PASSWORD }}
          inputvar: https://sparql.uniprot.org
          outputvar: ${{ github.event.inputs.graph }}
          servicevar: https://sparql.uniprot.org

      # The graph written by the previous step is the input here.
      # NOTE(review): servicevar reuses the endpoint input, whose default ends
      # in /statements, while the commented alternative below does not; verify
      # which URL form the SERVICE clause needs.
      - name: Compute and insert HCLS descriptive metadata
        uses: vemonet/sparql-operations-action@v1
        with:
          file: https://github.com/MaastrichtU-IDS/d2s-scripts-repository/tree/master/sparql/compute-hcls-stats
          endpoint: ${{ github.event.inputs.endpoint }}
          user: ${{ secrets.GRAPHDB_USER }}
          password: ${{ secrets.GRAPHDB_PASSWORD }}
          inputvar: ${{ github.event.inputs.graph }}
          outputvar: https://w3id.org/d2s/metadata
          servicevar: ${{ github.event.inputs.endpoint }}
          # servicevar: http://localhost:7200/repositories/ncats-red-kg
42 changes: 42 additions & 0 deletions .github/workflows/sparql-map-wikipathways.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Manually triggered workflow (workflow_dispatch) with two steps:
#   1. run the SPARQL mapping queries stored in datasets/wikipathways/mapping
#   2. compute and insert HCLS descriptive metadata for the target graph
name: WikiPathways to BioLink RDF
# TODO: Add step to manage versioning (delete previous graph, load new graph, generate metadata)
on:
  workflow_dispatch:
    inputs:
      # SPARQL endpoint the queries are sent to.
      endpoint:
        description: 'Upload to SPARQL endpoint'
        required: true
        default: 'https://graphdb.dumontierlab.com/repositories/ncats-red-kg/statements'
      # Named graph used as the output of the mapping step.
      graph:
        description: 'In the Graph'
        required: true
        default: 'https://w3id.org/d2s/graph/wikipathways'

jobs:
  run-sparql:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v2

      # inputvar / outputvar / servicevar presumably fill the <?_input>,
      # <?_output> and <?_service> placeholders of the query files; confirm
      # against the sparql-operations-action documentation.
      - name: Run SPARQL queries to convert Wikipathways
        uses: vemonet/sparql-operations-action@v1
        with:
          file: datasets/wikipathways/mapping
          endpoint: ${{ github.event.inputs.endpoint }}
          user: ${{ secrets.GRAPHDB_USER }}
          password: ${{ secrets.GRAPHDB_PASSWORD }}
          inputvar: http://rdf.wikipathways.org/
          outputvar: ${{ github.event.inputs.graph }}
          servicevar: http://sparql.wikipathways.org/sparql

      # The graph written by the previous step is the input here.
      # NOTE(review): servicevar reuses the endpoint input, whose default ends
      # in /statements, while the commented alternative below does not; verify
      # which URL form the SERVICE clause needs.
      - name: Compute and insert HCLS descriptive metadata
        uses: vemonet/sparql-operations-action@v1
        with:
          file: https://github.com/MaastrichtU-IDS/d2s-scripts-repository/tree/master/sparql/compute-hcls-stats
          endpoint: ${{ github.event.inputs.endpoint }}
          user: ${{ secrets.GRAPHDB_USER }}
          password: ${{ secrets.GRAPHDB_PASSWORD }}
          inputvar: ${{ github.event.inputs.graph }}
          outputvar: https://w3id.org/d2s/metadata
          servicevar: ${{ github.event.inputs.endpoint }}
          # servicevar: http://localhost:7200/repositories/ncats-red-kg
55 changes: 55 additions & 0 deletions archived-datasets/date/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@

# Workflow parameters for the archived "date" dataset.

## Tabular-file workflows: name of the dataset to process
dataset_to_process: "date"

## XML workflows: directory to process (disabled)
# dir_to_process:
#   class: Directory
#   path: ../../workspace/input/date

# Named graph that receives the final RDF
sparql_final_graph_uri: "https://w3id.org/d2s/graph/date"


# Final SPARQL endpoint where the BioLink RDF is loaded.
# RDF4J-based servers such as GraphDB need the /statements suffix.
sparql_final_triplestore_url: "https://graphdb.dumontierlab.com/repositories/ncats-red-kg/statements"
# sparql_final_triplestore_url: "http://graphdb:7200/repositories/ncats-red-kg/statements"

# NOTE(review): credentials are stored in plain text in this file; confirm
# they are development defaults only.
sparql_final_triplestore_username: "import_user"
sparql_final_triplestore_password: "dba"

# R2RML params
input_data_jdbc: "jdbc:drill:drillbit=drill:31010"

# Temporary triplestore (e.g. Virtuoso). TODO: improve
sparql_tmp_service_url: "http://tmp-virtuoso:8890/sparql"
# sparql_tmp_service_url: "http://blazegraph:8082/bigdata/sparql"

sparql_tmp_triplestore_url: "http://tmp-virtuoso:8890/sparql"
# sparql_tmp_triplestore_url: "http://blazegraph:8082/bigdata/sparql"
sparql_tmp_triplestore_username: "dba"
sparql_tmp_triplestore_password: "dba"

# Graph used inside the temporary triplestore, plus the container name and
# load directory of that triplestore
sparql_tmp_graph_uri: "https://w3id.org/d2s/graph/date"
# sparql_tmp_graph_uri: "https://w3id.org/d2s/graph/xml2rdf"
tmp_triplestore_container_id: "d2s-tmp-virtuoso"
tmp_triplestore_load_dir: "/usr/local/virtuoso-opensource/var/lib/virtuoso/db"

# Split params (disabled)
# split_property: "https://w3id.org/biolink/vocab/has_participant"
# split_class: "https://w3id.org/biolink/vocab/GeneGrouping"
# split_delimiter: ","
# split_quote: '"'

# Paths of the SPARQL transformation and metadata query folders
sparql_transform_queries_path: "mapping"
sparql_insert_metadata_path: "metadata"

# CWL workflow file to run
cwl_workflow_filename: "csv-virtuoso.cwl"

# Directory holding this configuration
config_dir:
  class: Directory
  path: .

# Directory holding the CWL workflow definitions
cwl_dir:
  class: Directory
  path: ../../d2s-core
15 changes: 15 additions & 0 deletions archived-datasets/date/download/download.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#!/bin/bash
# Download the DATE resource archive, unpack it in the current directory and
# convert the unfiltered drug/target/pathway TSV into a quoted CSV (date.csv).

archive_url="http://tatonettilab.org/resources/DATE/date_resource.zip"

# -N: only download again when the remote file is newer than the local copy
wget -N "$archive_url"

# Unpack every zip in this directory, overwriting existing files.
# The pattern is quoted so that unzip, not the shell, expands it.
unzip -o '*.zip'

# TSV -> CSV, one sed expression per step:
#   1. backslash-escape the double quotes already present
#   2. turn each tab into ","  (closing quote, comma, opening quote)
#   3. open the first field with a double quote
#   4. close the last field with a double quote
#   5. drop the first carriage return on the line (CRLF input)
sed \
  -e 's/"/\\"/g' \
  -e 's/\t/","/g' \
  -e 's/^/"/' \
  -e 's/$/"/' \
  -e 's/\r//' \
  date_resource/Drug_target_reactome_pathway.tsv > date.csv

# rm date_resource/Drug_target_reactome_pathway_filtered.tsv
# The archive should contain 2 TSV files:
# date_resource/Drug_target_reactome_pathway.tsv
# date_resource/Drug_target_reactome_pathway_filtered.tsv
91 changes: 91 additions & 0 deletions archived-datasets/date/mapping/insert-date.rq
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# Maps the generic RDF rows of the DATE dataset (d2smodel:* properties) to
# BioLink entities. <?_input>, <?_output> and <?_service> are placeholders,
# presumably substituted by the workflow runner before execution.
PREFIX d2smodel: <https://w3id.org/d2s/model/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
PREFIX dc: <http://purl.org/dc/elements/1.1/>
PREFIX dcterms: <http://purl.org/dc/terms/>
PREFIX bl: <https://w3id.org/biolink/vocab/>
PREFIX w3idvocab: <https://w3id.org/d2s/vocab/>
INSERT
{
  GRAPH <?_output> {
    # Drug, identified by its PubChem compound IRI
    ?drugUri a bl:Drug ;
      bl:id ?drugId ;
      bl:name ?drugName .

    # Protein targeted by the drug
    ?targetUniprotUri a bl:Protein .

    # Gene whose product is the target protein
    ?targetSymbolUri a bl:Gene ;
      bl:id ?targetSymbol ;
      bl:has_gene_product ?targetUniprotUri ;
      bl:systematic_synonym ?targetSymbol .
      # expressed_in tissue?

    # Tissue-specific molecular pathway
    ?pathwayUri a bl:Pathway ;
      bl:name ?pathwayName ;
      bl:part_of ?tissueUri .

    # Tissue
    ?tissueUri a bl:GrossAnatomicalStructure ;
      bl:name ?tissueName .


    # Drug/target interaction, attached to the pathway
    ?interactionUri a bl:ChemicalToGeneAssociation ;
      bl:relation bl:interacts_with ;
      bl:subject ?drugUri ;
      bl:object ?targetUniprotUri ;
      bl:part_of ?pathwayUri .

    # TODO: We also could create a bl:ChemicalToPathwayAssociation. What would be the best choice?

    # TODO: CellLine https://biolink.github.io/biolink-model/docs/CellLineToDiseaseOrPhenotypicFeatureAssociation.html
    #?cellLineUri a bl:CellLine ;
    #bl:id ?cellLineId .
  }
}
WHERE {
  SERVICE <?_service> {
    GRAPH <?_input> {
      # To filter out Drug_target_reactome_pathway_filtered.tsv file
      # TODO: Be careful the file path in the type can change:
      # ?row a <http://data2services/data/ncats/date/Drug_target_reactome_pathway.tsv> ;
      ?row d2smodel:Drug_idStitch ?drugId ; # eg: CID000004927
        d2smodel:Drug_name ?drugName .
      # Strip the "CID" prefix to build the PubChem compound IRI
      BIND( iri(concat("https://identifiers.org/pubchem.compound/", replace(?drugId, "CID", "") ) ) AS ?drugUri )

      ?row d2smodel:TargetUniprot ?targetUniprotId .
      BIND( iri(concat("https://identifiers.org/uniprot/", ?targetUniprotId ) ) AS ?targetUniprotUri )

      # Pathways only have a name; the IRI is minted from its MD5 hash
      ?row d2smodel:Pathway ?pathwayName . # eg: Retrograde neurotrophin signalling
      BIND( iri(concat("https://w3id.org/d2s/data/pathway/", md5(?pathwayName) ) ) AS ?pathwayUri )

      # Association IRIs are keyed on drug + target + pathway.
      # ?pathwayAssociationUri is bound but not used by the INSERT template.
      BIND( iri(concat("https://w3id.org/d2s/data/protein/interaction/", md5(concat(?drugId, ?targetUniprotId, ?pathwayName)) ) ) AS ?interactionUri )
      BIND( iri(concat("https://w3id.org/d2s/data/pathway/association/", md5(concat(?drugId, ?targetUniprotId, ?pathwayName)) ) ) AS ?pathwayAssociationUri )

      # Cell line: bound here, but only referenced by the commented-out
      # part of the INSERT template
      OPTIONAL {
        ?row d2smodel:Cell_line_id ?cellLineId . # NA (majority), HT1080, SHSYSY.RA, astrocytes, GM2313...
        BIND( if( ?cellLineId="NA",
            iri("") ,
            iri(concat("https://w3id.org/d2s/data/cell_line/", ?cellLineId) )
          ) AS ?cellLineUri )
      }
      OPTIONAL {
        ?row d2smodel:Tissue ?tissueName . # Fetalbrain
        BIND( iri(concat("https://w3id.org/d2s/data/tissue/", md5(?tissueName) ) ) AS ?tissueUri )
      }
      OPTIONAL {
        ?row d2smodel:Pathway_size ?pathwaySize . # 42...
      }
      OPTIONAL {
        ?row d2smodel:TargetSymbol ?targetSymbol . # CHRM3
        FILTER(!contains(?targetSymbol, "c(")) # filter out symbol like c(\"CALM1\", \"CALM2\", \"CALM3\")
        BIND( iri(concat("https://identifiers.org/hgnc.symbol/", ?targetSymbol ) ) AS ?targetSymbolUri )
      }
      OPTIONAL {
        ?row d2smodel:Target_class ?targetClass . # gpcr, enzyme, lgic, vgic, transporter, other_protein, catalytic_receptor, nhr, other_ic
        BIND( iri(concat("https://w3id.org/d2s/data/protein/class/", md5(?targetClass) ) ) AS ?targetClassUri )
      }
      OPTIONAL {
        ?row d2smodel:Dataset ?dataset . # GTEx, U133A, NCI60, HPM_PRT
      }
    }
  }
}
88 changes: 88 additions & 0 deletions archived-datasets/date/mapping/map-date.rml.ttl
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# RML mapping for date.csv: one "interactions" triples map plus a function
# triples map (map:fn_0).
# NOTE(review): this looks like generated output that is out of sync with
# map-date.yarrr.yml (e.g. the rdf:type constant and the part_of predicates
# differ); confirm before regenerating or editing by hand.
@prefix rr: <http://www.w3.org/ns/r2rml#>.
@prefix rml: <http://semweb.mmlab.be/ns/rml#>.
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>.
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>.
@prefix ql: <http://semweb.mmlab.be/ns/ql#>.
@prefix map: <http://mapping.example.com/>.

# --- Triples maps -----------------------------------------------------------
# Function triples map: "executes" toUpperCaseURL with the pathway template
# passed as its str parameter.
map:fn_0 rml:logicalSource map:source_0;
    rr:predicateObjectMap map:pomexec_0, map:pom_7.
map:map_interactions_0 rml:logicalSource map:source_0;
    a rr:TriplesMap;
    rdfs:label "interactions";
    rr:subjectMap map:s_0;
    rr:predicateObjectMap map:pom_0, map:pom_1, map:pom_2, map:pom_3, map:pom_4, map:pom_5, map:pom_6.

# --- Object maps ------------------------------------------------------------
map:om_0 a rr:ObjectMap;
    rr:constant "https://w3id.org/biolink/vocab/PairwiseGeneToGeneInteraction";
    rr:termType rr:IRI.
map:om_1 a rr:ObjectMap;
    rr:template "https://identifiers.org/pubchem.compound/{Drug_ID_Stitch}";
    rr:termType rr:IRI.
map:om_2 a rr:ObjectMap;
    rr:template "https://identifiers.org/uniprot:{Target(uniprot)}";
    rr:termType rr:IRI.
map:om_3 a rr:ObjectMap;
    rr:constant "https://w3id.org/biolink/vocab/interacts_with";
    rr:termType rr:IRI.
map:om_4 a rr:ObjectMap;
    rr:template "https://w3id.org/d2s/dataset/date/{Dataset}";
    rr:termType rr:IRI.
map:om_5 a rr:ObjectMap;
    rml:reference "Pathway";
    rr:termType rr:Literal.
# Object produced by the function triples map map:fn_0
map:om_6 a <http://semweb.mmlab.be/ns/fnml#FunctionTermMap>;
    rr:termType rr:IRI;
    <http://semweb.mmlab.be/ns/fnml#functionValue> map:fn_0.
map:om_7 a rr:ObjectMap;
    rr:template "https://w3id.org/d2s/data/date/pathway/{Pathway}";
    rr:termType rr:Literal.
map:omexec_0 rr:constant "http://example.com/idlab/function/toUpperCaseURL";
    rr:termType rr:IRI.

# --- Predicate maps ---------------------------------------------------------
map:pm_0 a rr:PredicateMap;
    rr:constant rdf:type.
map:pm_1 a rr:PredicateMap;
    rr:constant <https://w3id.org/biolink/vocab/subject>.
map:pm_2 a rr:PredicateMap;
    rr:constant <https://w3id.org/biolink/vocab/object>.
map:pm_3 a rr:PredicateMap;
    rr:constant <https://w3id.org/biolink/vocab/relation>.
map:pm_4 a rr:PredicateMap;
    rr:constant <https://w3id.org/biolink/vocab/provided_by>.
map:pm_5 a rr:PredicateMap;
    rr:constant <https://w3id.org/biolink/vocab/part_of>.
map:pm_6 a rr:PredicateMap;
    rr:constant <https://w3id.org/biolink/vocab/part_of>.
map:pm_7 a rr:PredicateMap;
    rr:constant <http://example.com/idlab/function/str>.
map:pmexec_0 rr:constant <https://w3id.org/function/ontology#executes>.

# --- Predicate-object maps (pom_N pairs pm_N with om_N) ---------------------
map:pom_0 a rr:PredicateObjectMap;
    rr:predicateMap map:pm_0;
    rr:objectMap map:om_0.
map:pom_1 a rr:PredicateObjectMap;
    rr:predicateMap map:pm_1;
    rr:objectMap map:om_1.
map:pom_2 a rr:PredicateObjectMap;
    rr:predicateMap map:pm_2;
    rr:objectMap map:om_2.
map:pom_3 a rr:PredicateObjectMap;
    rr:predicateMap map:pm_3;
    rr:objectMap map:om_3.
map:pom_4 a rr:PredicateObjectMap;
    rr:predicateMap map:pm_4;
    rr:objectMap map:om_4.
map:pom_5 a rr:PredicateObjectMap;
    rr:predicateMap map:pm_5;
    rr:objectMap map:om_5.
map:pom_6 a rr:PredicateObjectMap;
    rr:predicateMap map:pm_6;
    rr:objectMap map:om_6.
map:pom_7 a rr:PredicateObjectMap;
    rr:predicateMap map:pm_7;
    rr:objectMap map:om_7.
map:pomexec_0 rr:predicateMap map:pmexec_0;
    rr:objectMap map:omexec_0.

# --- Subject map and logical source -----------------------------------------
# One interaction resource per (drug, target) pair
map:s_0 a rr:SubjectMap;
    rr:template "https://w3id.org/d2s/data/date/interaction/{Drug_ID_Stitch}_{Target(uniprot)}".
map:source_0 a rml:LogicalSource;
    rml:source "date.csv";
    rml:referenceFormulation ql:CSV.
52 changes: 52 additions & 0 deletions archived-datasets/date/mapping/map-date.yarrr.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# YARRRML mapping for date.csv. Two mappings are defined:
#   interactions - one drug/target association per row
#   drugs        - one drug resource per row, carrying its name
prefixes:
  grel: "http://users.ugent.be/~bjdmeest/function/grel.ttl#"
  idlab: "http://example.com/idlab/function/"
  xsd: "http://www.w3.org/2001/XMLSchema#"
  rdfs: "http://www.w3.org/2000/01/rdf-schema#"
  bl: "https://w3id.org/biolink/vocab/"
  d2s: "https://w3id.org/d2s/"
  pubmed: "https://identifiers.org/pubmed:"

mappings:
  interactions:
    sources:
      - ['date.csv~csv']
    # Dataset Drug_name Drug_ID(Stitch) Tissue Cell_line_ID Target(uniprot) Target(symbol) Target_class Pathway Pathway_size
    # U133A leuprolide acetate CID000003911 Pituitary NA P30968 GNRHR gpcr Eukaryotic Translation Elongation 89
    # U133A leuprolide acetate CID000003911 Pituitary NA P30968 GNRHR gpcr Growth hormone receptor signaling 41
    # NOTE(review): the header above names the column "Drug_ID(Stitch)" while
    # the templates reference "Drug_ID_Stitch"; confirm the header of date.csv.

    # The closing parenthesis of the column name is escaped so that it does
    # not end the $(...) reference.
    s: https://w3id.org/d2s/data/date/interaction/$(Drug_ID_Stitch)_$(Target(uniprot\))
    po:
      - [a, bl:ChemicalToGeneAssociation]
      - p: bl:subject
        o: https://identifiers.org/pubchem.compound/$(Drug_ID_Stitch)~iri
        # TODO: remove CID from the ID for proper URI
      - p: bl:object
        o: https://identifiers.org/uniprot:$(Target(uniprot\))~iri
      - p: bl:relation
        o: bl:interacts_with~iri
      - p: bl:provided_by
        o: d2s:dataset/date/$(Dataset)~iri
      # Pathway and tissue are emitted as plain literals for now
      - p: bl:participates_in
        o: $(Pathway)
      - p: bl:part_of
        o: $(Tissue)
      # - p: bl:part_of
      #   o:
      #     function: idlab:toUpperCaseURL
      #     parameters:
      #       - [idlab:str, "https://w3id.org/d2s/data/date/pathway/$(Pathway)"]
      #     type: iri

    # Also pathway part_of tissue
    # TODO: generate a URI for Pathway (do it through preprocessing?)
    # We could have a python script which iterates over Pathway row to resolve the URI

  drugs:
    sources:
      - ['date.csv~csv']
    # No `~iri` suffix on the subject: that suffix is an object-type marker,
    # and the subject must be the same IRI as the bl:subject object emitted
    # by the interactions mapping.
    s: https://identifiers.org/pubchem.compound/$(Drug_ID_Stitch)
    po:
      - [a, bl:ChemicalSubstance]
      - p: bl:name
        o: $(Drug_name)
Loading

0 comments on commit f440139

Please sign in to comment.