Skip to content

Commit

Permalink
Use clj-ontology to map remaining columns
Browse files Browse the repository at this point in the history
  • Loading branch information
john-shaffer committed Sep 28, 2023
1 parent 03f673f commit b242973
Show file tree
Hide file tree
Showing 9 changed files with 245 additions and 165 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
!.github
!.gitignore

classes
logs
/download
/temp
Expand Down
8 changes: 8 additions & 0 deletions deps.edn
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
;; Clojure CLI project file for the RDF generation stage.
;; :paths puts the stages/ directory (which holds rml.clj) on the classpath.
{:paths ["stages"]
:deps
;; NOTE(review): stages/rml.clj requires tech.v3.dataset and tmducken.duckdb
;; directly, but neither library is declared here. Presumably they arrive
;; transitively through co.insilica/ontology -- confirm, and consider
;; declaring them explicitly so an upstream change cannot break this stage.
{co.insilica/ontology {:git/url "https://github.com/insilica/clj-ontology.git"
:git/sha "37aa3788e7ca579b5fd9be0522a06da2faaecb9a"}}
:aliases
;; The :rdf alias runs the -main function of the rml namespace; it is invoked
;; as `clojure -M:rdf rdf/tox21.nt` from stages/4_rml.sh.
{:rdf
{:main-opts ["-m" "rml"]
;; Initial heap 600 MB, maximum heap 9600 MB.
:jvm-opts ["-Xms600m" "-Xmx9600m"]}}}
20 changes: 10 additions & 10 deletions dvc.lock
Original file line number Diff line number Diff line change
Expand Up @@ -77,21 +77,21 @@ stages:
md5: 268c1928221d9976b99ecaedc9edf48f.dir
size: 1272033113
nfiles: 3
- path: morph-kgc.ini
- path: deps.edn
hash: md5
md5: 4db7b2ab3384fb468b716ddc7161a00d
size: 132
md5: f6373a4d8bf398b6b30fd237eba1a82a
size: 269
- path: stages/4_rml.sh
hash: md5
md5: 1a6f8e5e301c61b24fc3023ad970d2e8
size: 112
- path: tox21.rml.ttl
md5: eeb4ad8a5e8632f32197111263ad9c8e
size: 116
- path: stages/rml.clj
hash: md5
md5: 6279d0e422d2cb7a33ae813afe3611d3
size: 2121
md5: 4be9817074d4890258bd37d925be03a9
size: 6080
outs:
- path: rdf/
hash: md5
md5: 7e612fd7f1b73c26ddf5a1a2caee495f.dir
size: 355139697
md5: 27ae0fa7d2afc7802d92e73f3ddb670a.dir
size: 1863223028
nfiles: 1
4 changes: 2 additions & 2 deletions dvc.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ stages:
cmd: bash stages/4_rml.sh
deps:
- brick/
- deps.edn
- stages/4_rml.sh
- tox21.rml.ttl
- morph-kgc.ini
- stages/rml.clj
outs:
- rdf/
50 changes: 0 additions & 50 deletions flake.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

27 changes: 19 additions & 8 deletions flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,33 @@
inputs.flake-utils.follows = "flake-utils";
inputs.nixpkgs.follows = "nixpkgs";
};
morph-kgc = {
url = "github:insilica/nix-morph-kgc";
inputs.flake-utils.follows = "flake-utils";
inputs.nixpkgs.follows = "nixpkgs";
};
};

outputs = { self, nixpkgs, flake-utils, biobricks-R, hdt-cpp, morph-kgc }:
outputs = { self, nixpkgs, flake-utils, biobricks-R, hdt-cpp }:
flake-utils.lib.eachDefaultSystem (system:
with import nixpkgs { inherit system; }; {
with import nixpkgs { inherit system; };
let
# tmducken requires duckdb 0.8.1 or later, and nixos-23.05 only had 0.7
duckdb-version = "0.8.1";
duckdb = (pkgs.duckdb.overrideAttrs (oldAttrs: rec {
version = duckdb-version;
src = fetchFromGitHub {
owner = "duckdb";
repo = "duckdb";
rev = "v${duckdb-version}";
sha256 = "sha256-LEv9yURkYvONObTbIA4CS+umwCRMH8gRQaDtzbCzID4=";
};
}));
in {
devShells.default = mkShell {
buildInputs = [
biobricks-R.packages.${system}.rEnv
clojure
hdt-cpp.packages.${system}.default
morph-kgc.packages.${system}.default
duckdb
jdk
];
env = { DUCKDB_HOME = "${duckdb}/lib"; };
};
});
}
2 changes: 1 addition & 1 deletion stages/4_rml.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env bash

morph-kgc morph-kgc.ini
mkdir -p rdf
clojure -M:rdf rdf/tox21.nt
rdf2hdt rdf/tox21.nt rdf/tox21.hdt
rm -f rdf/tox21.nt
204 changes: 204 additions & 0 deletions stages/rml.clj
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
(ns rml
(:require [clojure.string :as str]
[insilica.ontology.biochem :as bc]
[insilica.ontology.core :as ont :refer [clazz def-classes]]
[tech.v3.dataset :as ds]
[tmducken.duckdb :as duckdb]))

;; Ontology classes for the Tox21 brick. Every entry is built with `clazz`
;; from a local name, a nil second argument and an IRI, plus exactly one
;; option:
;;   :resource-prefix - given for the two record classes
;;                      (Tox21AggregatedRecord and Tox21Record)
;;   :property-type   - given for every other class, either ont/XMLString
;;                      or ont/XMLDouble
;; NOTE(review): all IRIs live under http://example.com/, a placeholder
;; domain -- confirm whether a permanent namespace should replace it.
(def classes
[(clazz
"Tox21AggregatedRecord"
nil
"http://example.com/Tox21/AggregatedRecord"
:resource-prefix "http://example.com/Tox21/AggregatedRecord/")
(clazz
"Tox21AssayOutcome"
nil
"http://example.com/Tox21/AssayOutcome"
:property-type ont/XMLString)
(clazz
"Tox21ChannelOutcome"
nil
"http://example.com/Tox21/ChannelOutcome"
:property-type ont/XMLString)
(clazz
"Tox21ConcArray"
nil
"http://example.com/Tox21/ConcArray"
:property-type ont/XMLString)
(clazz
"Tox21CurveClass2"
nil
"http://example.com/Tox21/CurveClass2"
:property-type ont/XMLDouble)
(clazz
"Tox21CurveRank"
nil
"http://example.com/Tox21/CurveRank"
:property-type ont/XMLDouble)
(clazz
"Tox21DataArray"
nil
"http://example.com/Tox21/DataArray"
:property-type ont/XMLString)
(clazz
"Tox21ID"
nil
"http://example.com/Tox21/ID"
:property-type ont/XMLString)
(clazz
"Tox21HillCoeff"
nil
"http://example.com/Tox21/HillCoeff"
:property-type ont/XMLDouble)
(clazz
"Tox21InfActivity"
nil
"http://example.com/Tox21/InfActivity"
:property-type ont/XMLDouble)
(clazz
"Tox21PHill"
nil
"http://example.com/Tox21/PHill"
:property-type ont/XMLDouble)
(clazz
"Tox21PurityRating"
nil
"http://example.com/Tox21/PurityRating"
:property-type ont/XMLString)
(clazz
"Tox21R2"
nil
"http://example.com/Tox21/R2"
:property-type ont/XMLDouble)
(clazz
"Tox21Record"
nil
"http://example.com/Tox21/Record"
:resource-prefix "http://example.com/Tox21/Record/")
(clazz
"Tox21Reproducibility"
nil
"http://example.com/Tox21/Reproducibility"
:property-type ont/XMLString)
(clazz
"Tox21SampleDataType"
nil
"http://example.com/Tox21/SampleDataType"
:property-type ont/XMLString)
(clazz
"Tox21ZeroActivity"
nil
"http://example.com/Tox21/ZeroActivity"
:property-type ont/XMLDouble)])

;; NOTE(review): def-classes presumably defines the vars used later in this
;; file (Tox21Record, Tox21AggregatedRecord and the hasTox21* properties),
;; since nothing else in this namespace defines them -- confirm against
;; insilica.ontology.core.
(def-classes classes)

(defn get-tox21
  "Runs a `select *` over brick/tox21.parquet on the DuckDB connection `conn`
  and returns the result of `duckdb/sql->dataset`."
  [conn]
  (let [query "select * from 'brick/tox21.parquet'"]
    (duckdb/sql->dataset conn query)))

(defn get-tox21-aggregated
  "Runs a `select *` over brick/tox21_aggregated.parquet on the DuckDB
  connection `conn` and returns the result of `duckdb/sql->dataset`."
  [conn]
  (let [query "select * from 'brick/tox21_aggregated.parquet'"]
    (duckdb/sql->dataset conn query)))

(defn get-tox21-lib
  "Runs a `select *` over brick/tox21lib.parquet on the DuckDB connection
  `conn` and returns the result of `duckdb/sql->dataset`."
  [conn]
  (let [query "select * from 'brick/tox21lib.parquet'"]
    (duckdb/sql->dataset conn query)))

(def re-conc
  "Matches `CONC` followed by one or more digits; group 1 captures the digits.
  The pattern is not anchored, so it also matches inside a longer name."
  #"CONC(\d+)")

(def re-data
  "Matches `DATA` followed by one or more digits; group 1 captures the digits.
  The pattern is not anchored, so it also matches inside a longer name."
  #"DATA(\d+)")

(defn- long-column
  "Returns a function of a row that looks up `column`, converts the value to a
  string and parses it with `parse-long` (nil when it is not a valid long)."
  [column]
  (fn [row]
    (parse-long (str (get row column)))))

(defn- indexed-columns
  "Joins the values of the numbered columns of `row` into one space-separated
  string, ordered by column number.

  The numbers are taken from every key of `row` that `pattern` matches, and
  each value is then looked up under `prefix` followed by the number. Returns
  nil when `row` is nil or no key matches.

  NOTE(review): the lookup key is rebuilt from the parsed number, so a key
  with leading zeros (e.g. CONC01) would be looked up as CONC1 -- confirm the
  columns are never zero-padded."
  [pattern prefix row]
  (when-let [indices (some->> row
                              keys
                              (keep (fn [k]
                                      (some-> (re-find pattern k)
                                              second
                                              parse-long)))
                              seq)]
    (->> (sort indices)
         (map (fn [i] (get row (str prefix i))))
         (str/join " "))))

(def tox21-mapping
  "Property -> value-source table handed to `ont/subj-mappings` for rows of
  brick/tox21.parquet. Each value is either a column name or a function of
  the whole row (presumably subj-mappings looks the former up in the row and
  calls the latter -- confirm against insilica.ontology.core)."
  {bc/hasAC50 "AC50"
   bc/hasCASNumber "CAS"
   bc/hasEfficacy "EFFICACY"
   bc/hasProtocolName "PROTOCOL_NAME"
   bc/hasPubchemCID (long-column "PUBCHEM_CID")
   bc/hasPubchemSID (long-column "PUBCHEM_SID")
   bc/hasSampleDataID "SAMPLE_DATA_ID"
   bc/hasSampleID "SAMPLE_ID"
   bc/hasSampleName "SAMPLE_NAME"
   bc/hasSMILES "SMILES"
   ;; CONC<n> columns collapsed into one string, ordered by n.
   hasTox21ConcArray (partial indexed-columns re-conc "CONC")
   hasTox21CurveClass2 "CURVE_CLASS2"
   ;; DATA<n> columns collapsed into one string, ordered by n.
   hasTox21DataArray (partial indexed-columns re-data "DATA")
   hasTox21HillCoeff "HILL_COEF"
   hasTox21ID "TOX21_ID"
   hasTox21InfActivity "INF_ACTIVITY"
   hasTox21PHill "P_HILL"
   hasTox21PurityRating "PURITY_RATING"
   hasTox21R2 "R2"
   hasTox21SampleDataType "SAMPLE_DATA_TYPE"
   hasTox21ZeroActivity "ZERO_ACTIVITY"})

(defn tox21-row->triples
  "Builds the triples for one row of the tox21 dataset.

  The subject IRI is derived from the row's \"SAMPLE_DATA_ID\" value via
  `ont/resource-iri` on Tox21Record. The result is a seq whose first element
  is the `ont/isA` triple for that subject, followed by whatever
  `ont/subj-mappings` produces for the row under `tox21-mapping`."
  [row]
  (let [record-id (get row "SAMPLE_DATA_ID")
        subj (ont/resource-iri Tox21Record record-id)
        type-triple (ont/isA subj Tox21Record)]
    (cons type-triple
          (ont/subj-mappings subj row tox21-mapping))))

(def tox21-aggregated-mapping
  "Property -> value-source table handed to `ont/subj-mappings` for rows of
  brick/tox21_aggregated.parquet. Each value is either a column name or a
  function of the whole row (presumably subj-mappings looks the former up in
  the row and calls the latter -- confirm against insilica.ontology.core)."
  {bc/hasAC50 "AC50"
   bc/hasCASNumber "CAS"
   bc/hasEfficacy "EFFICACY"
   bc/hasProtocolName "PROTOCOL_NAME"
   ;; The PubChem identifiers are stringified and parsed with parse-long,
   ;; which yields nil when the text is not a valid long.
   bc/hasPubchemCID (fn [row] (parse-long (str (get row "PUBCHEM_CID"))))
   bc/hasPubchemSID (fn [row] (parse-long (str (get row "PUBCHEM_SID"))))
   bc/hasSampleDataID "SAMPLE_DATA_ID"
   bc/hasSampleID "SAMPLE_ID"
   bc/hasSampleName "SAMPLE_NAME"
   bc/hasSMILES "SMILES"
   hasTox21AssayOutcome "ASSAY_OUTCOME"
   hasTox21ChannelOutcome "CHANNEL_OUTCOME"
   hasTox21CurveRank "CURVE_RANK"
   hasTox21ID "TOX21_ID"
   hasTox21PurityRating "PURITY_RATING"
   hasTox21Reproducibility "REPRODUCIBILITY"
   hasTox21SampleDataType "SAMPLE_DATA_TYPE"})

(defn tox21-aggregated-row->triples
  "Builds the triples for one row of the tox21_aggregated dataset.

  The subject IRI is derived from the row's \"SAMPLE_ID\" value via
  `ont/resource-iri` on Tox21AggregatedRecord. The result is a seq whose
  first element is the `ont/isA` triple for that subject, followed by
  whatever `ont/subj-mappings` produces for the row under
  `tox21-aggregated-mapping`.

  NOTE(review): the subject is keyed on SAMPLE_ID, whereas tox21-row->triples
  keys on SAMPLE_DATA_ID. If one sample can appear in several aggregated rows
  (e.g. one per PROTOCOL_NAME), those rows will share a subject -- confirm
  this is intended."
  [row]
  (let [sample-id (get row "SAMPLE_ID")
        subj (ont/resource-iri Tox21AggregatedRecord sample-id)
        type-triple (ont/isA subj Tox21AggregatedRecord)]
    (cons type-triple
          (ont/subj-mappings subj row tox21-aggregated-mapping))))

(defn -main
  "Entry point for the :rdf alias: writes the Tox21 triples to `filename`.

  Loads the tox21 and tox21_aggregated datasets through DuckDB, converts
  every row to triples and hands the combined seq to `ont/write-nt`. Because
  of how `->>` threads into `concat`, the aggregated triples precede the
  per-record triples in that seq.

  The connection is disconnected and the database closed even when reading
  or writing throws; the database is also closed when `duckdb/connect`
  itself fails."
  [filename]
  (duckdb/initialize!)
  (let [db (duckdb/open-db)]
    (try
      ;; Connect inside the outer try so a failed connect still closes db.
      (let [conn (duckdb/connect db)]
        (try
          (->> (mapcat tox21-row->triples (ds/rows (get-tox21 conn)))
               (concat (mapcat tox21-aggregated-row->triples
                               (ds/rows (get-tox21-aggregated conn))))
               (ont/write-nt filename))
          (finally
            ;; Release the connection before its owning database is closed.
            (duckdb/disconnect conn))))
      (finally
        (duckdb/close-db db)))))

;; REPL scratchpad: nothing inside this `comment` form is evaluated when the
;; namespace loads. Evaluate the forms one at a time, top to bottom -- the
;; later ones rely on the vars defined by the first `do`.
(comment
;; Set-up: initialise DuckDB, open a database and connection, and load the
;; three parquet-backed datasets into vars.
(do
(def filename "rdf/tox21.nt")
(duckdb/initialize!)
(def db (duckdb/open-db))
(def conn (duckdb/connect db))
(def tox21 (get-tox21 conn))
(def tox21-aggregated (get-tox21-aggregated conn))
(def tox21-lib (get-tox21-lib conn)))

;; Inspect the first tox21 row: its sorted column names and its triples.
(def row1 (-> tox21 ds/rows first))
(clojure.string/join " " (sort (keys row1)))
(->> (tox21-row->triples row1) ont/nt-seq)
;; Note: this form re-runs the tox21 query rather than reusing the `tox21` var.
(first (mapcat tox21-row->triples (ds/rows (get-tox21 conn))))

;; Same inspection for the first aggregated row.
(-> (-> tox21-aggregated ds/rows first)
tox21-aggregated-row->triples ont/nt-seq)
;; tox21-lib is only inspected here; -main does not load it.
(-> tox21-lib ds/rows first)
)
Loading

0 comments on commit b242973

Please sign in to comment.