diff --git a/.editorconfig b/.editorconfig new file mode 100644 index 0000000..96bd366 --- /dev/null +++ b/.editorconfig @@ -0,0 +1,14 @@ +[*] +end_of_line = lf +charset = utf-8 +trim_trailing_whitespace = true +insert_final_newline = true +indent_style = space +indent_size = 2 +max_line_length = 120 + +[Makefile] +indent_style = tab + +[*.{diff,md}] +trim_trailing_whitespace = false diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..3550a30 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +use flake diff --git a/.gitignore b/.gitignore index 0cfe183..1c7d08d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,12 @@ +.* +!.editorconfig +!.envrc +!.github +!.gitignore + logs /download /temp /brick -/raw \ No newline at end of file +/raw +/rdf diff --git a/dvc.lock b/dvc.lock index e89d8e8..d8d65c3 100644 --- a/dvc.lock +++ b/dvc.lock @@ -49,8 +49,8 @@ stages: nfiles: 337 - path: stages/3_write_brick.R hash: md5 - md5: c5dbdeded250cf30b3af70cc436cb77f - size: 942 + md5: 1377f37bad12cec9d4a0e8afae48140d + size: 1159 - path: temp hash: md5 md5: 01a6bc69d09016a79132483c28820178.dir @@ -59,13 +59,39 @@ stages: outs: - path: brick/tox21.parquet hash: md5 - md5: 247348fd1254be594b21d202d514d55d - size: 1231260948 + md5: 0eb67f9b2fc3b16aff313d8ea9cdc359 + size: 1237116063 - path: brick/tox21_aggregated.parquet hash: md5 md5: 4de160eb524d30b610ed369d19776af8 size: 34091575 - path: brick/tox21lib.parquet hash: md5 - md5: 0f67d0d71773dfff4e9658334b1c8895 + md5: be3b8da4eebac921c7eca80177280e93 size: 825475 + rml: + cmd: bash stages/4_rml.sh + deps: + - path: brick/ + hash: md5 + md5: 268c1928221d9976b99ecaedc9edf48f.dir + size: 1272033113 + nfiles: 3 + - path: morph-kgc.ini + hash: md5 + md5: 4db7b2ab3384fb468b716ddc7161a00d + size: 132 + - path: stages/4_rml.sh + hash: md5 + md5: 1a6f8e5e301c61b24fc3023ad970d2e8 + size: 112 + - path: tox21.rml.ttl + hash: md5 + md5: 6279d0e422d2cb7a33ae813afe3611d3 + size: 2121 + outs: + - path: rdf/ + hash: md5 + md5: 7e612fd7f1b73c26ddf5a1a2caee495f.dir + size: 355139697 + nfiles: 1 diff --git a/dvc.yaml b/dvc.yaml index 915523d..4c7adcb 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -8,7 +8,7 @@ stages: outs: - download - temp - unzip: + unzip: cmd: stages/2_unzip.sh deps: - stages/2_unzip.sh @@ -16,7 +16,7 @@ stages: - temp outs: - raw - build: + build: cmd: Rscript stages/3_write_brick.R deps: - stages/3_write_brick.R @@ -26,3 +26,12 @@ stages: - brick/tox21.parquet - brick/tox21_aggregated.parquet - brick/tox21lib.parquet + rml: + cmd: bash stages/4_rml.sh + deps: + - brick/ + - stages/4_rml.sh + - tox21.rml.ttl + - morph-kgc.ini + outs: + - rdf/ diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..5468abc --- /dev/null +++ b/flake.lock @@ -0,0 +1,245 @@ +{ + "nodes": { + "biobricks-R": { + "inputs": { + "flake-utils": [ + "flake-utils" + ], + "nixpkgs": [ + "nixpkgs" + ], + "poetry2nix": "poetry2nix" + }, + "locked": { + "lastModified": 1695748200, + "narHash": "sha256-mxVCkoeLZBSInR6TgSs3L3BBQCzUx+ZXXt5vbPHFX38=", + "owner": "biobricks-ai", + "repo": "biobricks-R", + "rev": "5aedfac7af4edbddb2922129af5ffe366038f8e0", + "type": "github" + }, + "original": { + "owner": "biobricks-ai", + "repo": "biobricks-R", + "type": "github" + } + }, + "flake-utils": { + "inputs": { + "systems": "systems" + }, + "locked": { + "lastModified": 1689068808, + "narHash": "sha256-6ixXo3wt24N/melDWjq70UuHQLxGV8jZvooRanIHXw0=", + "owner": "numtide", + "repo": "flake-utils", + "rev": "919d646de7be200f3bf08cb76ae1f09402b6f9b4", + "type": "github" + }, + "original": { + "owner": "numtide", + "repo": "flake-utils", + "type": "github" + } + }, + "flake-utils_2": { + "inputs": { + "systems": "systems_2" + }, + "locked": { + "narHash": "sha256-zsNZZGTGnMOf9YpHKJqMSsa0dXbfmxeoJ7xHlrt+xmY=", + "rev": "ff7b65b44d01cf9ba6a71320833626af21126384", + "revCount": 87, + "type": "tarball", + "url": "https://api.flakehub.com/f/pinned/numtide/flake-utils/0.1.87+rev-ff7b65b44d01cf9ba6a71320833626af21126384/018a8a74-649b-792e-a959-2f97793b1129/source.tar.gz" + }, + "original": { + "type": "tarball", + "url": "https://flakehub.com/f/numtide/flake-utils/*.tar.gz" + } + }, + "hdt-cpp": { + "inputs": { + "flake-utils": [ + "flake-utils" + ], + "nixpkgs": [ + "nixpkgs" + ] + }, + "locked": { + "lastModified": 1695663107, + "narHash": "sha256-uYHG4+WvYykdWWU2LjmirgenMKpZ6+E1otmztIdQBPc=", + "owner": "insilica", + "repo": "nix-hdt", + "rev": "5451dd285423e5a2c386dc05b1bf10d93654df56", + "type": "github" + }, + "original": { + "owner": "insilica", + "repo": "nix-hdt", + "type": "github" + } + }, + "morph-kgc": { + "inputs": { + "flake-utils": [ + "flake-utils" + ], + "nixpkgs": [ + "nixpkgs" + ], + "poetry2nix": "poetry2nix_2" + }, + "locked": { + "lastModified": 1695678755, + "narHash": "sha256-x8Vw79qjOVtK5FPUe3q0vBUpbuPBPVZ/FEupiMN+6R4=", + "owner": "insilica", + "repo": "nix-morph-kgc", + "rev": "884a9570682288dbd770523f031fa38bdd38e4f9", + "type": "github" + }, + "original": { + "owner": "insilica", + "repo": "nix-morph-kgc", + "type": "github" + } + }, + "nix-github-actions": { + "inputs": { + "nixpkgs": [ + "biobricks-R", + "poetry2nix", + "nixpkgs" + ] + }, + "locked": { + "lastModified": 1688870561, + "narHash": "sha256-4UYkifnPEw1nAzqqPOTL2MvWtm3sNGw1UTYTalkTcGY=", + "owner": "nix-community", + "repo": "nix-github-actions", + "rev": "165b1650b753316aa7f1787f3005a8d2da0f5301", + "type": "github" + }, + "original": { + "owner": "nix-community", + "repo": "nix-github-actions", + "type": "github" + } + }, + "nixpkgs": { + "locked": { + "lastModified": 1689192006, + "narHash": "sha256-QM0f0d8oPphOTYJebsHioR9+FzJcy1QNIzREyubB91U=", + "owner": "NixOS", + "repo": "nixpkgs", + "rev": "2de8efefb6ce7f5e4e75bdf57376a96555986841", + "type": "github" + }, + "original": { + "owner": "NixOS", + "ref": "nixos-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "nixpkgs_2": { + "locked": { + "narHash": "sha256-kXZ1pUoImD9OEbPCwpTz4tHsNTr4CIyIfXb3ocuR8sI=", + "rev": "261abe8a44a7e8392598d038d2e01f7b33cf26d0", + "revCount": 491123, + "type": "tarball", + "url": "https://api.flakehub.com/f/pinned/NixOS/nixpkgs/0.2305.491123+rev-261abe8a44a7e8392598d038d2e01f7b33cf26d0/018acd67-263f-7f69-b7a8-1dc50d54a19a/source.tar.gz" + }, + "original": { + "type": "tarball", + "url": "https://flakehub.com/f/NixOS/nixpkgs/*.tar.gz" + } + }, + "poetry2nix": { + "inputs": { + "flake-utils": "flake-utils", + "nix-github-actions": "nix-github-actions", + "nixpkgs": "nixpkgs" + }, + "locked": { + "lastModified": 1695386222, + "narHash": "sha256-5lgnhCCGW0NH5+m5iTED8u6NSSM/dbH9LBPvX0x0XXg=", + "owner": "nix-community", + "repo": "poetry2nix", + "rev": "093383b3d7fdd36846a7d84e128ca11865800538", + "type": "github" + }, + "original": { + "id": "poetry2nix", + "type": "indirect" + } + }, + "poetry2nix_2": { + "inputs": { + "flake-utils": [ + "morph-kgc", + "flake-utils" + ], + "nixpkgs": [ + "morph-kgc", + "nixpkgs" + ] + }, + "locked": { + "lastModified": 1674537260, + "narHash": "sha256-DTki81bWzHHRka0ZLayYS5La3t+npRVZvfDH8mx/Las=", + "owner": "nix-community", + "repo": "poetry2nix", + "rev": "a20e27e0555621d35de171270cd041631fc7cb23", + "type": "github" + }, + "original": { + "owner": "nix-community", + "repo": "poetry2nix", + "type": "github" + } + }, + "root": { + "inputs": { + "biobricks-R": "biobricks-R", + "flake-utils": "flake-utils_2", + "hdt-cpp": "hdt-cpp", + "morph-kgc": "morph-kgc", + "nixpkgs": "nixpkgs_2" + } + }, + "systems": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + }, + "systems_2": { + "locked": { + "lastModified": 1681028828, + "narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=", + "owner": "nix-systems", + "repo": "default", + "rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e", + "type": "github" + }, + "original": { + "owner": "nix-systems", + "repo": "default", + "type": "github" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..8dfd8cc --- /dev/null +++ b/flake.nix @@ -0,0 +1,35 @@ +{ + description = "Tox21 biobrick"; + + inputs = { + nixpkgs.url = "https://flakehub.com/f/NixOS/nixpkgs/*.tar.gz"; + flake-utils.url = "https://flakehub.com/f/numtide/flake-utils/*.tar.gz"; + biobricks-R = { + url = "github:biobricks-ai/biobricks-R"; + inputs.flake-utils.follows = "flake-utils"; + inputs.nixpkgs.follows = "nixpkgs"; + }; + hdt-cpp = { + url = "github:insilica/nix-hdt"; + inputs.flake-utils.follows = "flake-utils"; + inputs.nixpkgs.follows = "nixpkgs"; + }; + morph-kgc = { + url = "github:insilica/nix-morph-kgc"; + inputs.flake-utils.follows = "flake-utils"; + inputs.nixpkgs.follows = "nixpkgs"; + }; + }; + + outputs = { self, nixpkgs, flake-utils, biobricks-R, hdt-cpp, morph-kgc }: + flake-utils.lib.eachDefaultSystem (system: + with import nixpkgs { inherit system; }; { + devShells.default = mkShell { + buildInputs = [ + biobricks-R.packages.${system}.rEnv + hdt-cpp.packages.${system}.default + morph-kgc.packages.${system}.default + ]; + }; + }); +} diff --git a/morph-kgc.ini b/morph-kgc.ini new file mode 100644 index 0000000..d0b7a76 --- /dev/null +++ b/morph-kgc.ini @@ -0,0 +1,7 @@ +# Configuration for Morph-KGC +[CONFIGURATION] +output_file: rdf/tox21.nt +na_values: None,na + +[DataSource1] +mappings: ./tox21.rml.ttl diff --git a/stages/3_write_brick.R b/stages/3_write_brick.R index fe7ab92..9b26969 100644 --- a/stages/3_write_brick.R +++ b/stages/3_write_brick.R @@ -20,4 +20,8 @@ rawfiles <- discard(rawfiles,~grepl("description",.x)) rawtable <- map(rawfiles,~readr::read_tsv(.x)) rawtable <- keep(rawtable,~nrow(.x)>0) rawmerge <- bind_rows(rawtable) +rawmerge <- rawmerge %>% + mutate(PUBCHEM_CID = as.character(as.integer(PUBCHEM_CID)), + PUBCHEM_SID = as.character(as.integer(PUBCHEM_SID)), + SAMPLE_DATA_ID = as.character(as.integer(SAMPLE_DATA_ID))) arrow::write_parquet(rawmerge,"brick/tox21.parquet") diff --git a/stages/4_rml.sh b/stages/4_rml.sh new file mode 100755 index 0000000..1a4f1f2 --- /dev/null +++ b/stages/4_rml.sh @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +morph-kgc morph-kgc.ini +mkdir -p rdf +rdf2hdt rdf/tox21.nt rdf/tox21.hdt +rm -f rdf/tox21.nt diff --git a/tox21.rml.ttl b/tox21.rml.ttl new file mode 100644 index 0000000..22b5fc2 --- /dev/null +++ b/tox21.rml.ttl @@ -0,0 +1,94 @@ +@base . +@prefix : . +@prefix ex: . + +@prefix rml: . +@prefix rr: . +@prefix rdf: . + +@prefix bao: . +@prefix dbo: . +@prefix edam: . +@prefix enano: . +@prefix idot: . +@prefix obo: . + +:TriplesMap a rr:TriplesMap; + rml:logicalSource [ + rml:source "brick/tox21.parquet"; + ]; + + rr:subjectMap [ + rr:template "http://example.com/tox21/record/{SAMPLE_DATA_ID}"; + rr:class ex:record; + ]; + + rr:predicateObjectMap [ + rr:predicate rdf:type; + rr:objectMap [ rml:reference "Record" ]; + ]; + + rr:predicateObjectMap [ + rr:predicate bao:BAO_0000186; + rr:objectMap [ + rml:reference "AC50" + ] + ]; + + rr:predicateObjectMap [ + rr:predicate edam:data_1002; + rr:objectMap [ + rml:reference "CAS" + ] + ]; + + rr:predicateObjectMap [ + rr:predicate bao:BAO_0000656; + rr:objectMap [ + rml:reference "EFFICACY" + ] + ]; + + rr:predicateObjectMap [ + rr:predicate bao:BAO_0000523; + rr:objectMap [ + rml:reference "PROTOCOL_NAME" + ] + ]; + + rr:predicateObjectMap [ + rr:predicate enano:ENM_9000071; + rr:objectMap [ + rml:reference "SAMPLE_ID" + ] + ]; + + rr:predicateObjectMap [ + rr:predicate obo:MS_1000002; + rr:objectMap [ + rml:reference "SAMPLE_NAME" + ] + ]; + + rr:predicateObjectMap [ + rr:predicate edam:format_1196; + rr:objectMap [ + rml:reference "SMILES" + ] + ]; + + rr:predicateObjectMap [ + rr:predicate idot:pubchem.compound; + rr:objectMap [ + rr:template "http://identifiers.org/pubchem.compound/{PUBCHEM_CID}"; + rr:termType rr:IRI; + ] + ]; + + rr:predicateObjectMap [ + rr:predicate idot:pubchem.substance; + rr:objectMap [ + rr:template "http://identifiers.org/pubchem.substance/{PUBCHEM_SID}"; + rr:termType rr:IRI; + ] + ].