diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000..96bd366
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,14 @@
+[*]
+end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = true
+indent_style = space
+indent_size = 2
+max_line_length = 120
+
+[Makefile]
+indent_style = tab
+
+[*.{diff,md}]
+trim_trailing_whitespace = false
diff --git a/.envrc b/.envrc
new file mode 100644
index 0000000..3550a30
--- /dev/null
+++ b/.envrc
@@ -0,0 +1 @@
+use flake
diff --git a/.gitignore b/.gitignore
index 0cfe183..1c7d08d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,12 @@
+.*
+!.editorconfig
+!.envrc
+!.github
+!.gitignore
+
logs
/download
/temp
/brick
-/raw
\ No newline at end of file
+/raw
+/rdf
diff --git a/dvc.lock b/dvc.lock
index e89d8e8..d8d65c3 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -49,8 +49,8 @@ stages:
nfiles: 337
- path: stages/3_write_brick.R
hash: md5
- md5: c5dbdeded250cf30b3af70cc436cb77f
- size: 942
+ md5: 1377f37bad12cec9d4a0e8afae48140d
+ size: 1159
- path: temp
hash: md5
md5: 01a6bc69d09016a79132483c28820178.dir
@@ -59,13 +59,39 @@ stages:
outs:
- path: brick/tox21.parquet
hash: md5
- md5: 247348fd1254be594b21d202d514d55d
- size: 1231260948
+ md5: 0eb67f9b2fc3b16aff313d8ea9cdc359
+ size: 1237116063
- path: brick/tox21_aggregated.parquet
hash: md5
md5: 4de160eb524d30b610ed369d19776af8
size: 34091575
- path: brick/tox21lib.parquet
hash: md5
- md5: 0f67d0d71773dfff4e9658334b1c8895
+ md5: be3b8da4eebac921c7eca80177280e93
size: 825475
+ rml:
+ cmd: bash stages/4_rml.sh
+ deps:
+ - path: brick/
+ hash: md5
+ md5: 268c1928221d9976b99ecaedc9edf48f.dir
+ size: 1272033113
+ nfiles: 3
+ - path: morph-kgc.ini
+ hash: md5
+ md5: 4db7b2ab3384fb468b716ddc7161a00d
+ size: 132
+ - path: stages/4_rml.sh
+ hash: md5
+ md5: 1a6f8e5e301c61b24fc3023ad970d2e8
+ size: 112
+ - path: tox21.rml.ttl
+ hash: md5
+ md5: 6279d0e422d2cb7a33ae813afe3611d3
+ size: 2121
+ outs:
+ - path: rdf/
+ hash: md5
+ md5: 7e612fd7f1b73c26ddf5a1a2caee495f.dir
+ size: 355139697
+ nfiles: 1
diff --git a/dvc.yaml b/dvc.yaml
index 915523d..4c7adcb 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -8,7 +8,7 @@ stages:
outs:
- download
- temp
- unzip:
+ unzip:
cmd: stages/2_unzip.sh
deps:
- stages/2_unzip.sh
@@ -16,7 +16,7 @@ stages:
- temp
outs:
- raw
- build:
+ build:
cmd: Rscript stages/3_write_brick.R
deps:
- stages/3_write_brick.R
@@ -26,3 +26,12 @@ stages:
- brick/tox21.parquet
- brick/tox21_aggregated.parquet
- brick/tox21lib.parquet
+ rml:
+ cmd: bash stages/4_rml.sh
+ deps:
+ - brick/
+ - stages/4_rml.sh
+ - tox21.rml.ttl
+ - morph-kgc.ini
+ outs:
+ - rdf/
diff --git a/flake.lock b/flake.lock
new file mode 100644
index 0000000..5468abc
--- /dev/null
+++ b/flake.lock
@@ -0,0 +1,245 @@
+{
+ "nodes": {
+ "biobricks-R": {
+ "inputs": {
+ "flake-utils": [
+ "flake-utils"
+ ],
+ "nixpkgs": [
+ "nixpkgs"
+ ],
+ "poetry2nix": "poetry2nix"
+ },
+ "locked": {
+ "lastModified": 1695748200,
+ "narHash": "sha256-mxVCkoeLZBSInR6TgSs3L3BBQCzUx+ZXXt5vbPHFX38=",
+ "owner": "biobricks-ai",
+ "repo": "biobricks-R",
+ "rev": "5aedfac7af4edbddb2922129af5ffe366038f8e0",
+ "type": "github"
+ },
+ "original": {
+ "owner": "biobricks-ai",
+ "repo": "biobricks-R",
+ "type": "github"
+ }
+ },
+ "flake-utils": {
+ "inputs": {
+ "systems": "systems"
+ },
+ "locked": {
+ "lastModified": 1689068808,
+ "narHash": "sha256-6ixXo3wt24N/melDWjq70UuHQLxGV8jZvooRanIHXw0=",
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "rev": "919d646de7be200f3bf08cb76ae1f09402b6f9b4",
+ "type": "github"
+ },
+ "original": {
+ "owner": "numtide",
+ "repo": "flake-utils",
+ "type": "github"
+ }
+ },
+ "flake-utils_2": {
+ "inputs": {
+ "systems": "systems_2"
+ },
+ "locked": {
+ "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=",
+ "rev": "ff7b65b44d01cf9ba6a71320833626af21126384",
+ "revCount": 87,
+ "type": "tarball",
+ "url": "https://api.flakehub.com/f/pinned/numtide/flake-utils/0.1.87+rev-ff7b65b44d01cf9ba6a71320833626af21126384/018a8a74-649b-792e-a959-2f97793b1129/source.tar.gz"
+ },
+ "original": {
+ "type": "tarball",
+ "url": "https://flakehub.com/f/numtide/flake-utils/*.tar.gz"
+ }
+ },
+ "hdt-cpp": {
+ "inputs": {
+ "flake-utils": [
+ "flake-utils"
+ ],
+ "nixpkgs": [
+ "nixpkgs"
+ ]
+ },
+ "locked": {
+ "lastModified": 1695663107,
+ "narHash": "sha256-uYHG4+WvYykdWWU2LjmirgenMKpZ6+E1otmztIdQBPc=",
+ "owner": "insilica",
+ "repo": "nix-hdt",
+ "rev": "5451dd285423e5a2c386dc05b1bf10d93654df56",
+ "type": "github"
+ },
+ "original": {
+ "owner": "insilica",
+ "repo": "nix-hdt",
+ "type": "github"
+ }
+ },
+ "morph-kgc": {
+ "inputs": {
+ "flake-utils": [
+ "flake-utils"
+ ],
+ "nixpkgs": [
+ "nixpkgs"
+ ],
+ "poetry2nix": "poetry2nix_2"
+ },
+ "locked": {
+ "lastModified": 1695678755,
+ "narHash": "sha256-x8Vw79qjOVtK5FPUe3q0vBUpbuPBPVZ/FEupiMN+6R4=",
+ "owner": "insilica",
+ "repo": "nix-morph-kgc",
+ "rev": "884a9570682288dbd770523f031fa38bdd38e4f9",
+ "type": "github"
+ },
+ "original": {
+ "owner": "insilica",
+ "repo": "nix-morph-kgc",
+ "type": "github"
+ }
+ },
+ "nix-github-actions": {
+ "inputs": {
+ "nixpkgs": [
+ "biobricks-R",
+ "poetry2nix",
+ "nixpkgs"
+ ]
+ },
+ "locked": {
+ "lastModified": 1688870561,
+ "narHash": "sha256-4UYkifnPEw1nAzqqPOTL2MvWtm3sNGw1UTYTalkTcGY=",
+ "owner": "nix-community",
+ "repo": "nix-github-actions",
+ "rev": "165b1650b753316aa7f1787f3005a8d2da0f5301",
+ "type": "github"
+ },
+ "original": {
+ "owner": "nix-community",
+ "repo": "nix-github-actions",
+ "type": "github"
+ }
+ },
+ "nixpkgs": {
+ "locked": {
+ "lastModified": 1689192006,
+ "narHash": "sha256-QM0f0d8oPphOTYJebsHioR9+FzJcy1QNIzREyubB91U=",
+ "owner": "NixOS",
+ "repo": "nixpkgs",
+ "rev": "2de8efefb6ce7f5e4e75bdf57376a96555986841",
+ "type": "github"
+ },
+ "original": {
+ "owner": "NixOS",
+ "ref": "nixos-unstable",
+ "repo": "nixpkgs",
+ "type": "github"
+ }
+ },
+ "nixpkgs_2": {
+ "locked": {
+ "narHash": "sha256-kXZ1pUoImD9OEbPCwpTz4tHsNTr4CIyIfXb3ocuR8sI=",
+ "rev": "261abe8a44a7e8392598d038d2e01f7b33cf26d0",
+ "revCount": 491123,
+ "type": "tarball",
+ "url": "https://api.flakehub.com/f/pinned/NixOS/nixpkgs/0.2305.491123+rev-261abe8a44a7e8392598d038d2e01f7b33cf26d0/018acd67-263f-7f69-b7a8-1dc50d54a19a/source.tar.gz"
+ },
+ "original": {
+ "type": "tarball",
+ "url": "https://flakehub.com/f/NixOS/nixpkgs/*.tar.gz"
+ }
+ },
+ "poetry2nix": {
+ "inputs": {
+ "flake-utils": "flake-utils",
+ "nix-github-actions": "nix-github-actions",
+ "nixpkgs": "nixpkgs"
+ },
+ "locked": {
+ "lastModified": 1695386222,
+ "narHash": "sha256-5lgnhCCGW0NH5+m5iTED8u6NSSM/dbH9LBPvX0x0XXg=",
+ "owner": "nix-community",
+ "repo": "poetry2nix",
+ "rev": "093383b3d7fdd36846a7d84e128ca11865800538",
+ "type": "github"
+ },
+ "original": {
+ "id": "poetry2nix",
+ "type": "indirect"
+ }
+ },
+ "poetry2nix_2": {
+ "inputs": {
+ "flake-utils": [
+ "morph-kgc",
+ "flake-utils"
+ ],
+ "nixpkgs": [
+ "morph-kgc",
+ "nixpkgs"
+ ]
+ },
+ "locked": {
+ "lastModified": 1674537260,
+ "narHash": "sha256-DTki81bWzHHRka0ZLayYS5La3t+npRVZvfDH8mx/Las=",
+ "owner": "nix-community",
+ "repo": "poetry2nix",
+ "rev": "a20e27e0555621d35de171270cd041631fc7cb23",
+ "type": "github"
+ },
+ "original": {
+ "owner": "nix-community",
+ "repo": "poetry2nix",
+ "type": "github"
+ }
+ },
+ "root": {
+ "inputs": {
+ "biobricks-R": "biobricks-R",
+ "flake-utils": "flake-utils_2",
+ "hdt-cpp": "hdt-cpp",
+ "morph-kgc": "morph-kgc",
+ "nixpkgs": "nixpkgs_2"
+ }
+ },
+ "systems": {
+ "locked": {
+ "lastModified": 1681028828,
+ "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+ "owner": "nix-systems",
+ "repo": "default",
+ "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+ "type": "github"
+ },
+ "original": {
+ "owner": "nix-systems",
+ "repo": "default",
+ "type": "github"
+ }
+ },
+ "systems_2": {
+ "locked": {
+ "lastModified": 1681028828,
+ "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
+ "owner": "nix-systems",
+ "repo": "default",
+ "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
+ "type": "github"
+ },
+ "original": {
+ "owner": "nix-systems",
+ "repo": "default",
+ "type": "github"
+ }
+ }
+ },
+ "root": "root",
+ "version": 7
+}
diff --git a/flake.nix b/flake.nix
new file mode 100644
index 0000000..8dfd8cc
--- /dev/null
+++ b/flake.nix
@@ -0,0 +1,42 @@
+# Development environment for the Tox21 biobrick, exposed as a Nix flake.
+{
+  description = "Tox21 biobrick";
+
+  inputs = {
+    nixpkgs.url = "https://flakehub.com/f/NixOS/nixpkgs/*.tar.gz";
+    flake-utils.url = "https://flakehub.com/f/numtide/flake-utils/*.tar.gz";
+
+    # The three tool flakes below follow this flake's nixpkgs and flake-utils
+    # inputs rather than pinning their own.
+    biobricks-R = {
+      url = "github:biobricks-ai/biobricks-R";
+      inputs.nixpkgs.follows = "nixpkgs";
+      inputs.flake-utils.follows = "flake-utils";
+    };
+    hdt-cpp = {
+      url = "github:insilica/nix-hdt";
+      inputs.nixpkgs.follows = "nixpkgs";
+      inputs.flake-utils.follows = "flake-utils";
+    };
+    morph-kgc = {
+      url = "github:insilica/nix-morph-kgc";
+      inputs.nixpkgs.follows = "nixpkgs";
+      inputs.flake-utils.follows = "flake-utils";
+    };
+  };
+
+  outputs = { self, nixpkgs, flake-utils, biobricks-R, hdt-cpp, morph-kgc }:
+    flake-utils.lib.eachDefaultSystem (system:
+      let
+        pkgs = import nixpkgs { inherit system; };
+        # Packages the tool flakes export for the current system.
+        shellTools = [
+          biobricks-R.packages.${system}.rEnv
+          hdt-cpp.packages.${system}.default
+          morph-kgc.packages.${system}.default
+        ];
+      in {
+        # Shell entered by `nix develop`; it carries the tools listed above.
+        devShells.default = pkgs.mkShell { buildInputs = shellTools; };
+      });
+}
diff --git a/morph-kgc.ini b/morph-kgc.ini
new file mode 100644
index 0000000..d0b7a76
--- /dev/null
+++ b/morph-kgc.ini
@@ -0,0 +1,7 @@
+# Configuration for Morph-KGC
+[CONFIGURATION]
+output_file: rdf/tox21.nt
+na_values: None,na
+
+[DataSource1]
+mappings: ./tox21.rml.ttl
diff --git a/stages/3_write_brick.R b/stages/3_write_brick.R
index fe7ab92..9b26969 100644
--- a/stages/3_write_brick.R
+++ b/stages/3_write_brick.R
@@ -20,4 +20,8 @@ rawfiles <- discard(rawfiles,~grepl("description",.x))
rawtable <- map(rawfiles,~readr::read_tsv(.x))
rawtable <- keep(rawtable,~nrow(.x)>0)
rawmerge <- bind_rows(rawtable)
+rawmerge <- rawmerge %>%
+ mutate(PUBCHEM_CID = as.character(as.integer(PUBCHEM_CID)),
+ PUBCHEM_SID = as.character(as.integer(PUBCHEM_SID)),
+ SAMPLE_DATA_ID = as.character(as.integer(SAMPLE_DATA_ID)))
arrow::write_parquet(rawmerge,"brick/tox21.parquet")
diff --git a/stages/4_rml.sh b/stages/4_rml.sh
new file mode 100755
index 0000000..1a4f1f2
--- /dev/null
+++ b/stages/4_rml.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+
+morph-kgc morph-kgc.ini
+mkdir -p rdf
+rdf2hdt rdf/tox21.nt rdf/tox21.hdt
+rm -f rdf/tox21.nt
diff --git a/tox21.rml.ttl b/tox21.rml.ttl
new file mode 100644
index 0000000..22b5fc2
--- /dev/null
+++ b/tox21.rml.ttl
@@ -0,0 +1,94 @@
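+@base <http://example.com/base/> .  # NOTE(review): IRI lost in extraction, reconstructed - confirm
+@prefix : <http://example.com/base/> .  # NOTE(review): IRI lost in extraction, reconstructed - confirm
+@prefix ex: <http://example.com/> .  # NOTE(review): IRI lost in extraction, reconstructed - confirm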
+
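+@prefix rml: <http://semweb.mmlab.be/ns/rml#> .
+@prefix rr: <http://www.w3.org/ns/r2rml#> .
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .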
+
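+@prefix bao: <http://www.bioassayontology.org/bao#> .
+@prefix dbo: <http://dbpedia.org/ontology/> .
+@prefix edam: <http://edamontology.org/> .
+@prefix enano: <http://purl.enanomapper.org/onto/> .
+@prefix idot: <http://identifiers.org/idot/> .  # NOTE(review): IRI lost in extraction, reconstructed - confirm
+@prefix obo: <http://purl.obolibrary.org/obo/> .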
+
+:TriplesMap a rr:TriplesMap;
+ rml:logicalSource [
+ rml:source "brick/tox21.parquet";
+ ];
+
+ rr:subjectMap [
+ rr:template "http://example.com/tox21/record/{SAMPLE_DATA_ID}";
+ rr:class ex:record;
+ ];
+
+ rr:predicateObjectMap [
+ rr:predicate rdf:type;
+ rr:objectMap [ rml:reference "Record" ];
+ ];
+
+ rr:predicateObjectMap [
+ rr:predicate bao:BAO_0000186;
+ rr:objectMap [
+ rml:reference "AC50"
+ ]
+ ];
+
+ rr:predicateObjectMap [
+ rr:predicate edam:data_1002;
+ rr:objectMap [
+ rml:reference "CAS"
+ ]
+ ];
+
+ rr:predicateObjectMap [
+ rr:predicate bao:BAO_0000656;
+ rr:objectMap [
+ rml:reference "EFFICACY"
+ ]
+ ];
+
+ rr:predicateObjectMap [
+ rr:predicate bao:BAO_0000523;
+ rr:objectMap [
+ rml:reference "PROTOCOL_NAME"
+ ]
+ ];
+
+ rr:predicateObjectMap [
+ rr:predicate enano:ENM_9000071;
+ rr:objectMap [
+ rml:reference "SAMPLE_ID"
+ ]
+ ];
+
+ rr:predicateObjectMap [
+ rr:predicate obo:MS_1000002;
+ rr:objectMap [
+ rml:reference "SAMPLE_NAME"
+ ]
+ ];
+
+ rr:predicateObjectMap [
+ rr:predicate edam:format_1196;
+ rr:objectMap [
+ rml:reference "SMILES"
+ ]
+ ];
+
+ rr:predicateObjectMap [
+ rr:predicate idot:pubchem.compound;
+ rr:objectMap [
+ rr:template "http://identifiers.org/pubchem.compound/{PUBCHEM_CID}";
+ rr:termType rr:IRI;
+ ]
+ ];
+
+ rr:predicateObjectMap [
+ rr:predicate idot:pubchem.substance;
+ rr:objectMap [
+ rr:template "http://identifiers.org/pubchem.substance/{PUBCHEM_SID}";
+ rr:termType rr:IRI;
+ ]
+ ].