From 64fe298b5eadd59b2b5a190a5fb68e5c49d8a9fd Mon Sep 17 00:00:00 2001 From: Lukasz Wiklendt Date: Wed, 9 Oct 2024 12:46:27 +1030 Subject: [PATCH 1/2] init build MIMIC-IV-ED duckdb --- mimic-iv-ed/buildmimic/duckdb/README.md | 113 +++++++++++++++++ .../buildmimic/duckdb/import_duckdb.sh | 118 ++++++++++++++++++ 2 files changed, 231 insertions(+) create mode 100644 mimic-iv-ed/buildmimic/duckdb/README.md create mode 100644 mimic-iv-ed/buildmimic/duckdb/import_duckdb.sh diff --git a/mimic-iv-ed/buildmimic/duckdb/README.md b/mimic-iv-ed/buildmimic/duckdb/README.md new file mode 100644 index 000000000..951b4e987 --- /dev/null +++ b/mimic-iv-ed/buildmimic/duckdb/README.md @@ -0,0 +1,113 @@ +# DuckDB + +The script in this folder creates the schema for MIMIC-IV-ED and +loads the data into the appropriate tables for +[DuckDB](https://duckdb.org/). +DuckDB, like SQLite, is serverless and +stores all information in a single file. +Unlike SQLite, an OLTP database, +DuckDB is an OLAP database, and therefore optimized for analytical queries. +This will result in faster queries for researchers using MIMIC-IV-ED +with DuckDB compared to SQLite. +To learn more, please read their ["why duckdb"](https://duckdb.org/docs/why_duckdb) +page. + +The instructions to load MIMIC-IV-ED into a DuckDB +only require: +1. DuckDB to be installed and +2. Your computer to have a POSIX-compliant terminal shell, + which is already found by default on any Mac OSX, Linux, or BSD installation. + +To use these instructions on Windows, +you need a Unix command line environment, +which you can obtain by either installing +[Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/install-win10) +or [Cygwin](https://www.cygwin.com/). + +## Set-up + +### Quick overview + +1. [Install](https://duckdb.org/docs/installation/) the CLI version of DuckDB +2. [Download](https://physionet.org/content/mimic-iv-ed/2.2/) the MIMIC-IV-ED files +3. 
Create DuckDB database and load data + +### Install DuckDB + +Follow instructions on their website to +[install](https://duckdb.org/docs/installation/) +the CLI version of DuckDB. + +You will need to place the `duckdb` binary in a folder on your environment path, +e.g. `/usr/local/bin`. + +### Download MIMIC-IV-ED files + +Download the CSV files for [MIMIC-IV-ED](https://physionet.org/content/mimic-iv-ed/2.2/) +by any method you wish. +These instructions were tested with MIMIC-IV-ED v2.2. + +The CSV files should be in a folder structure as follows: + +``` +mimic_data_dir + ed + diagnosis.csv.gz + ... + vitalsign.csv.gz +``` + +The CSV files can be uncompressed (end in `.csv`) or compressed (end in `.csv.gz`). + +The easiest way to download them is to open a terminal then run: + +``` +wget -r -N -c -np --user YOURUSERNAME --ask-password https://physionet.org/files/mimic-iv-ed/2.2/ +``` + +Replace `YOURUSERNAME` with your PhysioNet username. + +This will make your `mimic_data_dir` be `physionet.org/files/mimic-iv-ed/2.2`. + +# Create DuckDB database and load data + +The last step requires creating a DuckDB database and +loading the data into it. + +You can do all of this with one shell script, `import_duckdb.sh`, +located in this repository. + +See the help for it below: + +```sh +$ ./import_duckdb.sh -h +./import_duckdb.sh: +USAGE: ./import_duckdb.sh mimic_data_dir [output_db] +WHERE: + mimic_data_dir directory that contains csv.gz or csv files + output_db: optional filename for duckdb file (default: mimic4_ed.db) +$ +``` + +Here's an example invocation that will make the database in the default "mimic4_ed.db": + +```sh +$ ./import_duckdb.sh physionet.org/files/mimic-iv-ed/2.2 + + <... output of script snipped ...> +Successfully finished loading data into mimic4_ed.db. + +$ ls -lh mimic4_ed.db +-rw-rw-r--. 1 myuser mygroup 93G May 26 16:11 mimic4_ed.db +``` + +The script will print out progress as it goes. 
+Be patient, this can take minutes to hours to load +depending on your computer's configuration. + +* It took 16m25s on a Fedora 34 workstation with duckdb v 0.2.6, a btrfs filesystem with ztsd level 1 compression, AMD Ryzen 3900X, 32 GB RAM, Samsung 970 Evo NVMe SSD. +* It took ~10m on a Mac M1 Max 2021, 32 GB RAM. + +# Help + +Please see the [issues page](https://github.com/MIT-LCP/mimic-code/issues) to discuss other issues you may be having. diff --git a/mimic-iv-ed/buildmimic/duckdb/import_duckdb.sh b/mimic-iv-ed/buildmimic/duckdb/import_duckdb.sh new file mode 100644 index 000000000..f134b9f7d --- /dev/null +++ b/mimic-iv-ed/buildmimic/duckdb/import_duckdb.sh @@ -0,0 +1,118 @@ +#!/bin/sh + +# Copyright (c) 2023 MIT Laboratory for Computational Physiology +# Copyright (c) 2021 Thomas Ward +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +yell () { echo "$0: $*" >&2; } +die () { yell "$*"; exit 111; } +try () { "$@" || die "Exiting. 
Failed to run: \"$*\""; } + +usage () { + die " +USAGE: ./import_duckdb.sh mimic_data_dir [output_db] +WHERE: + mimic_data_dir directory that contains csv.tar.gz or csv files + output_db: optional filename for duckdb file (default: mimic4_ed.db)\ +" +} + +# Print help if requested +echo "$0 $* " | grep -Eq " -h | --help " && usage + +# rename CLI positional args to more friendly variable names +MIMIC_DIR=$1 +# allow optional specification of duckdb name, otherwise default to mimic4_ed.db +OUTFILE=mimic4_ed.db +if [ -n "$2" ]; then + OUTFILE=$2 +fi + + +# basic error checking before running +if [ -z "$MIMIC_DIR" ]; then + yell "Please specify a mimic data directory" + die "Usage: ./import_duckdb.sh mimic_data_dir [output_db]" +elif [ ! -d "$MIMIC_DIR" ]; then + yell "Specified directory \"$MIMIC_DIR\" does not exist." + die "Usage: ./import_duckdb.sh mimic_data_dir [output_db]" +elif [ -n "$3" ]; then + yell "import.sh takes a maximum of two arguments." + die "Usage: ./import_duckdb.sh mimic_data_dir [output_db]" +elif [ -s "$OUTFILE" ]; then + yell "File \"$OUTFILE\" already exists." + read -p "Continue? (y/d/n) 'y' continues, 'd' deletes original file, 'n' stops: " yn + case $yn in + [Yy]* ) ;; # OK + [Nn]* ) exit;; + [Dd]* ) rm "$OUTFILE";; + * ) die "Unrecognized input.";; + esac +fi + +# we will copy the postgresql create.sql file, and apply regex +# to fix the following issues: +# 1. Remove optional precision value from TIMESTAMP(NN) -> TIMESTAMP +# duckdb does not support this. +export REGEX_TIMESTAMP='s/TIMESTAMP\([0-9]+\)/TIMESTAMP/g' +# 2. Remove NOT NULL constraint from mimiciv_hosp.microbiologyevents.spec_type_desc +# as there is one (!) zero-length string which is treated as a NULL by the import. +export REGEX_SPEC_TYPE='s/spec_type_desc(.+)NOT NULL/spec_type_desc\1/g' +# 3. Remove NOT NULL constraint from mimiciv_hosp.prescriptions.drug +# as there are zero-length strings which are treated as NULLs by the import. 
export REGEX_DRUG='s/drug +(VARCHAR.+)NOT NULL/drug \1/g'

# ---------------------------------------------------------------------------
# Create the tables inside the DuckDB file.
#
# The PostgreSQL create.sql is streamed through sed, which applies the
# REGEX_* rewrites defined above, and the result is fed to duckdb on stdin.
# ---------------------------------------------------------------------------

# Prefer the schema that sits next to this script so the import also works
# when it is started from another working directory; fall back to the
# historical path relative to the current directory.
CREATE_SQL="$(dirname "$0")/../postgres/create.sql"
[ -r "$CREATE_SQL" ] || CREATE_SQL=../postgres/create.sql
[ -r "$CREATE_SQL" ] || die "Exiting. Cannot read schema file \"$CREATE_SQL\"."

# One sed invocation with three -e expressions applies the substitutions to
# each line in the same order as three chained sed processes would.
# -E is the POSIX spelling of GNU's -r for extended regular expressions.
# The pipeline status is that of duckdb. Problems are reported but not fatal
# here: if the tables really are missing, the first COPY below aborts the
# import via try.
sed -E \
  -e "${REGEX_TIMESTAMP}" \
  -e "${REGEX_SPEC_TYPE}" \
  -e "${REGEX_DRUG}" \
  "$CREATE_SQL" \
  | duckdb "$OUTFILE" \
  || yell "Warning: duckdb reported errors while creating tables in \"$OUTFILE\"."

# Derive the database table name from the path of a CSV file.
# Arguments:
#   $1 - path as printed by find, e.g. ./2.2/ed/triage.csv.gz
# Sets (globals, read by the load loop below):
#   DIRNAME    - name of the directory that holds the file, e.g. ed
#   TABLE_NAME - schema-qualified table name, e.g. mimiciv_ed.triage
make_table_name () {
  # strip leading directories (e.g., ./ed/triage.csv.gz -> triage.csv.gz)
  BASENAME=${1##*/}
  # strip everything from the first dot (triage.csv.gz, triage.csv -> triage)
  TABLE_NAME=${BASENAME%%.*}
  # strip the file name (e.g., ./ed/triage.csv.gz -> ./ed)
  PATHNAME=${1%/*}
  # strip leading directories from PATHNAME (e.g., ./ed -> ed)
  DIRNAME=${PATHNAME##*/}
  TABLE_NAME="mimiciv_$DIRNAME.$TABLE_NAME"
}

# Print, one per line and sorted, every regular file below a directory whose
# name ends in .csv or .csv.gz.
# Arguments:
#   $1 - directory to search
# Note: if a table exists both as x.csv and x.csv.gz, both files are listed
# and both would be loaded.
find_csv_files () {
  find "$1" -type f \( -name '*.csv' -o -name '*.csv.gz' \) | sort
}

# Load the data into the database, one COPY per file.
# The loop body runs in a pipeline subshell: when try fails, die ends that
# subshell with a non-zero status, which suppresses the success message.
find_csv_files "$MIMIC_DIR" | while IFS= read -r FILE; do
  make_table_name "$FILE"

  # Skip directories which we do not expect in mimic-iv-ed; this avoids SQL
  # errors when other datasets (e.g. mimic-iv) live under the same directory.
  case $DIRNAME in
    (ed) ;; # OK
    (*) continue ;;
  esac

  # printf instead of echo "...\c": echo's handling of \c differs between
  # shells, printf without a trailing newline behaves the same everywhere.
  printf 'Loading %s .. ' "$FILE"

  # Double any single quote so the path stays a valid SQL string literal.
  SQL_PATH=$(printf '%s\n' "$FILE" | sed "s/'/''/g")

  try duckdb "$OUTFILE" <<EOSQL
COPY $TABLE_NAME FROM '$SQL_PATH' (HEADER);
EOSQL
  echo "done!"
done && echo "Successfully finished loading data into $OUTFILE."
From dfc78cbc7f64c3cc2a0c2605140b88b05d4deb50 Mon Sep 17 00:00:00 2001 From: Lukasz Wiklendt Date: Wed, 9 Oct 2024 14:47:33 +1030 Subject: [PATCH 2/2] update readme --- mimic-iv-ed/buildmimic/duckdb/README.md | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/mimic-iv-ed/buildmimic/duckdb/README.md b/mimic-iv-ed/buildmimic/duckdb/README.md index 951b4e987..31d6bcc8c 100644 --- a/mimic-iv-ed/buildmimic/duckdb/README.md +++ b/mimic-iv-ed/buildmimic/duckdb/README.md @@ -89,24 +89,7 @@ WHERE: $ ``` -Here's an example invocation that will make the database in the default "mimic4_ed.db": - -```sh -$ ./import_duckdb.sh physionet.org/files/mimic-iv-ed/2.2 - - <... output of script snipped ...> -Successfully finished loading data into mimic4_ed.db. - -$ ls -lh mimic4_ed.db --rw-rw-r--. 1 myuser mygroup 93G May 26 16:11 mimic4_ed.db -``` - -The script will print out progress as it goes. -Be patient, this can take minutes to hours to load -depending on your computer's configuration. - -* It took 16m25s on a Fedora 34 workstation with duckdb v 0.2.6, a btrfs filesystem with ztsd level 1 compression, AMD Ryzen 3900X, 32 GB RAM, Samsung 970 Evo NVMe SSD. -* It took ~10m on a Mac M1 Max 2021, 32 GB RAM. +The script will print out progress as it goes. It should only take a few seconds to load. # Help