diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..cb20499 Binary files /dev/null and b/.DS_Store differ diff --git a/.Rbuildignore b/.Rbuildignore index 8c1c453..46bbf0c 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -9,3 +9,5 @@ docs _pkgdown.yml cran-comments.md ^CRAN-RELEASE$ +work/* +^CRAN-SUBMISSION$ diff --git a/.gitignore b/.gitignore index 248e29f..803d7af 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,5 @@ R/TC statement_*.sql errorReport.txt +work/* diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION new file mode 100644 index 0000000..311ff14 --- /dev/null +++ b/CRAN-SUBMISSION @@ -0,0 +1,3 @@ +Version: 2.0.0 +Date: 2024-04-16 20:03:15 UTC +SHA: 53f57cba7a55b79e60ec84baea9399f4ae19743c diff --git a/DESCRIPTION b/DESCRIPTION index 6f730fa..2a21e10 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,26 +1,34 @@ Package: Eunomia Type: Package -Title: A Standard Dataset in the OMOP Common Data Model -Version: 1.0.3 -Date: 2024-03-27 +Title: Standard Dataset Manager for Observational Medical Outcomes Partnership Common Data Model Sample Datasets +Version: 2.0.0 +Date: 2024-04-22 Authors@R: c( person("Frank", "DeFalco", , "fdefalco@ohdsi.org", role = c("aut", "cre")), person("Martijn", "Schuemie", , "schuemie@ohdsi.org", role = c("aut")), + person("Anthony", "Sena", , "sena@ohdsi.org", role=c("aut")), + person("Natthawut", "Adulyanukosol", , "na339@unc.edu", role=c("aut")), + person("Star", "Liu", , "sliu197@jhmi.edu", role=c("aut")), + person("Adam", "Black", , "black@ohdsi.org", role = c("aut")), person("Observational Health Data Science and Informatics", role = c("cph")) ) Maintainer: Frank DeFalco -Description: A sample dataset in the OMOP (Observational Medical Outcomes Partnership) Common Data Model (CDM) format. The CDM enables uniform storage of observational health care data, and is widely used for health care analytics. 
'Eunomia' contains simulated data as well as a subset of the OMOP Vocabulary, and enables testing of additional packages and is used for educational and demonstration purposes. +Description: Facilitates access to sample datasets from the 'EunomiaDatasets' repository (). License: Apache License 2.0 URL: https://github.com/OHDSI/Eunomia BugReports: https://github.com/OHDSI/Eunomia/issues -Depends: - DatabaseConnector (>= 2.2.0) Imports: - SqlRender, - RSQLite (> 2.1.1), - readr + readr, + rlang, + RSQLite, + DBI, + arrow, + CommonDataModel Suggests: - testthat + testthat, + withr, + duckdb, + DatabaseConnector Encoding: UTF-8 LazyData: true RoxygenNote: 7.3.1 diff --git a/NAMESPACE b/NAMESPACE index c5f9042..3246eb0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,11 +1,16 @@ # Generated by roxygen2: do not edit by hand export(createCohorts) -export(exportToCsv) +export(downloadEunomiaData) +export(exportDataFiles) +export(extractLoadData) +export(getDatabaseFile) export(getEunomiaConnectionDetails) -import(DatabaseConnector) +export(loadDataFiles) import(RSQLite) -importFrom(readr,write_csv) +importFrom(readr,read_csv) +importFrom(tools,file_ext) +importFrom(utils,download.file) importFrom(utils,read.csv) importFrom(utils,untar) importFrom(utils,unzip) diff --git a/NEWS.md b/NEWS.md index 2725610..db0150b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,13 @@ +Eunomia 2.0 +============= +Changes +- Updated package to no longer contain a dataset rather facilitate access to sample datasets + stored in the https://github.com/OHDSI/EunomiaDatasets repository +- Backward compatibility maintained with getEunomiaConnectionDetails function +- New function added for getDatabaseFile +- Embedded sample dataset removed +- Remove dependency on DatabaseConnector and Java + Eunomia 1.0.3 ============= diff --git a/R/Cohorts.R b/R/Cohorts.R index 271d552..f1ed411 100644 --- a/R/Cohorts.R +++ b/R/Cohorts.R @@ -1,4 +1,4 @@ -# Copyright 2020 Observational Health Data Sciences and Informatics 
# Copyright 2023 Observational Health Data Sciences and Informatics
#
# This file is part of Eunomia
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#' Construct cohorts
#'
#' @description
#' Creates a set of predefined cohorts in a cohort table. WARNING: this will delete all existing
#' cohorts in the table!
#'
#' @details
#' Only SQLite connection details are accepted. The function stops when either schema argument is
#' not "main", and warns when \code{cohortTable} is not "cohort"; the statements in
#' \code{sql/CreateCohortTable.sql} are always executed as shipped with the package.
#'
#' @param connectionDetails    The connection details to connect to the (Eunomia) database.
#' @param cdmDatabaseSchema    Deprecated. The cdm must be created in the main schema.
#' @param cohortDatabaseSchema Deprecated. The cohort table will be created in the main schema.
#' @param cohortTable          Deprecated. Cohort table will be named "cohort".
#'
#' @return
#' A data frame listing all created cohorts.
#'
#' @export
createCohorts <- function(connectionDetails,
                          cdmDatabaseSchema = "main",
                          cohortDatabaseSchema = "main",
                          cohortTable = "cohort") {
  if (!inherits(connectionDetails, "ConnectionDetails")) {
    stop("connectionDetails is not valid.")
  }

  if (connectionDetails$dbms != "sqlite") {
    stop("createCohorts only supports sqlite")
  }

  if (cdmDatabaseSchema != "main" || cohortDatabaseSchema != "main") {
    stop("sqlite only supports the main schema")
  }

  if (cohortTable != "cohort") {
    warning("The cohortTable argument to createCohorts was deprecated in Eunomia v2.1.0")
  }

  connection <- DBI::dbConnect(RSQLite::SQLite(), connectionDetails$server())
  on.exit(DBI::dbDisconnect(connection), add = TRUE)

  # Create example cohort table
  pathToSql <- system.file("sql", "CreateCohortTable.sql", package = "Eunomia", mustWork = TRUE)
  sql <- readChar(pathToSql, file.info(pathToSql)$size)
  # Strip line comments up to the end of the line. This must happen before the
  # newlines are flattened, otherwise a comment would swallow the code after it.
  sql <- gsub("--[^\r\n]*", "", sql)
  sql <- strsplit(gsub("[\r\n]+", " ", sql), ";", fixed = TRUE)[[1]]
  sql <- trimws(sql)
  # Logical subsetting: `sql[-which(sql == "")]` would return character(0), and
  # silently skip every statement, whenever there is no empty element.
  sql <- sql[nzchar(sql)]

  for (statement in sql) {
    DBI::dbExecute(connection, statement)
  }

  # Fetch cohort counts:
  countSql <- "SELECT cohort_definition_id, COUNT(*) AS count
    FROM main.cohort
    GROUP BY cohort_definition_id"
  counts <- DBI::dbGetQuery(connection, countSql)

  pathToCsv <- system.file("settings", "CohortsToCreate.csv", package = "Eunomia", mustWork = TRUE)
  cohortsToCreate <- read.csv(pathToCsv)
  counts <- merge(cohortsToCreate, counts, by.x = "cohortId", by.y = "cohort_definition_id")
  writeLines("Cohorts created in table main.cohort")
  return(counts)
}
#' Get Default Eunomia Connection Details
#'
#' @description
#' Creates a copy of the default (GiBleed) Eunomia database, and provides details for connecting to
#' that copy. Function provides backwards compatibility to prior releases of Eunomia default (GiBleed)
#' dataset.
#'
#' @details
#' The DatabaseConnector package is needed to build the return value. When it is missing, an
#' interactive session offers to install it (returning \code{NULL} invisibly if the user declines);
#' a non-interactive session stops with an error before any data is downloaded.
#'
#' @param databaseFile The path where the database file will be copied to. By default, the database will
#'                     be copied to a temporary folder, and will be deleted at the end of the R session.
#' @param dbms         The target dialect, by default "sqlite".
#'
#' @return
#' A ConnectionDetails object, to be used with the \code{DatabaseConnector} package.
#'
#' @export
getEunomiaConnectionDetails <- function(databaseFile = tempfile(fileext = ".sqlite"),
                                        dbms = "sqlite") {
  # requireNamespace() is the cheap, supported way to probe for an optional
  # dependency; scanning utils::installed.packages() is slow on large libraries.
  if (!requireNamespace("DatabaseConnector", quietly = TRUE)) {
    if (!interactive()) {
      # Fail before downloading anything instead of erroring later on the
      # DatabaseConnector:: call below.
      stop("The DatabaseConnector package is required but not installed.", call. = FALSE)
    }
    message("The DatabaseConnector package is required but not installed.")
    if (!isTRUE(utils::askYesNo("Would you like to install DatabaseConnector?"))) {
      return(invisible(NULL))
    }
    utils::install.packages("DatabaseConnector")
  }

  datasetLocation <- getDatabaseFile(
    datasetName = "GiBleed",
    dbms = dbms,
    databaseFile = databaseFile
  )
  DatabaseConnector::createConnectionDetails(dbms = dbms, server = datasetLocation)
}
#' Create a copy of a Eunomia dataset
#'
#' @description
#' Creates a copy of a Eunomia database, and returns the path to the new database file.
#' If the dataset does not yet exist on the user's computer it will attempt to download the source data
#' to the path defined by the EUNOMIA_DATA_FOLDER environment variable.
#'
#' @param datasetName  The data set name as found on https://github.com/OHDSI/EunomiaDatasets. The
#'                     data set name corresponds to the folder with the data set ZIP files
#' @param cdmVersion   The OMOP CDM version ("5.3" or "5.4"). This version appears in the suffix of
#'                     the data file, for example: GiBleed_5.3.zip. Default: '5.3'
#' @param pathToData   The path where the Eunomia data is stored on the file system. By default the
#'                     value of the environment variable "EUNOMIA_DATA_FOLDER" is used; when that is
#'                     empty the session temporary directory is used.
#' @param dbms         The database system to use. "sqlite" (default) or "duckdb"
#' @param databaseFile The path where the database file will be copied to. By default, the database
#'                     will be copied to a temporary folder, and will be deleted at the end of the R
#'                     session.
#' @param inputFormat  The format of the files expected in the archive. (csv or parquet)
#' @param verbose      Provide additional logging details during execution
#' @param overwrite    Remove and replace an existing data set. When TRUE (the default) the cached
#'                     database file and the cached zip archive in \code{pathToData} are deleted and
#'                     rebuilt on every call.
#'
#' @return The file path to the new Eunomia dataset copy (returned invisibly)
#' @export
#'
#' @examples
#' \dontrun{
#'  conn <- DBI::dbConnect(RSQLite::SQLite(), getDatabaseFile("GiBleed"))
#'  DBI::dbDisconnect(conn)
#'
#'  conn <- DBI::dbConnect(duckdb::duckdb(), getDatabaseFile("GiBleed", dbms = "duckdb"))
#'  DBI::dbDisconnect(conn, shutdown = TRUE)
#'
#'  conn <- DatabaseConnector::connect(dbms = "sqlite", server = getDatabaseFile("GiBleed"))
#'  DatabaseConnector::disconnect(conn)
#' }
#'
getDatabaseFile <- function(datasetName,
                            cdmVersion = "5.3",
                            pathToData = Sys.getenv("EUNOMIA_DATA_FOLDER"),
                            dbms = "sqlite",
                            databaseFile = tempfile(fileext = paste0(".", dbms)),
                            inputFormat = "csv",
                            verbose = FALSE,
                            overwrite = TRUE) {
  # Validate the name up front: it is pasted into file names and the download
  # URL, so an empty value would otherwise only fail deep inside the download.
  stopifnot(is.character(datasetName), length(datasetName) == 1)
  if (is.na(datasetName) || datasetName == "") {
    stop("The datasetName argument must be specified.")
  }

  if (is.null(pathToData) || is.na(pathToData) || pathToData == "") {
    pathToData <- tempdir()
  }

  stopifnot(is.character(dbms), length(dbms) == 1, dbms %in% c("sqlite", "duckdb"))
  stopifnot(is.character(cdmVersion), length(cdmVersion) == 1, cdmVersion %in% c("5.3", "5.4"))

  if (dbms == "duckdb") {
    rlang::check_installed("duckdb")
    # duckdb database are tied to a specific version of duckdb until it reaches v1.0.
    # Build "major.minor" from the version components; taking the first three
    # characters of the version string would turn 0.10.x into "0.1".
    versionParts <- unlist(utils::packageVersion("duckdb"))
    duckdbVersion <- paste(versionParts[1:2], collapse = ".")
    datasetFileName <- paste0(datasetName, "_", cdmVersion, "_", duckdbVersion, ".", dbms)
  } else {
    datasetFileName <- paste0(datasetName, "_", cdmVersion, ".", dbms)
  }

  # cached sqlite or duckdb file to be copied
  datasetLocation <- file.path(pathToData, datasetFileName)
  datasetAvailable <- file.exists(datasetLocation)
  if (datasetAvailable && overwrite) {
    if (verbose) {
      message("overwrite specified, deleting existing dataset: ", datasetLocation, appendLF = TRUE)
    }
    unlink(datasetLocation)
    datasetAvailable <- FALSE
  }

  if (verbose) {
    message("dataset: ", datasetLocation, " available: ", datasetAvailable, appendLF = TRUE)
  }

  # zip archive of csv source files
  archiveName <- paste0(datasetName, "_", cdmVersion, ".zip")
  archiveLocation <- file.path(pathToData, archiveName)
  archiveAvailable <- file.exists(archiveLocation)

  if (archiveAvailable && overwrite) {
    if (verbose) {
      message("overwrite specified, deleting existing archive: ", archiveLocation, appendLF = TRUE)
    }
    unlink(archiveLocation)
    archiveAvailable <- FALSE
  }

  if (verbose) {
    message("archive: ", archiveLocation, " available:", archiveAvailable, appendLF = TRUE)
  }

  if (!datasetAvailable && !archiveAvailable) {
    message(paste("attempting to download", datasetName))
    downloadedData <- downloadEunomiaData(
      datasetName = datasetName,
      cdmVersion = cdmVersion,
      pathToData = pathToData,
      verbose = verbose
    )
    if (verbose) {
      message("downloaded: ", downloadedData, appendLF = TRUE)
    }
    # Trust the file system rather than assuming the download succeeded.
    archiveAvailable <- file.exists(archiveLocation)
    if (!archiveAvailable) {
      stop(paste0("Download of '", datasetName, "' did not produce ", archiveLocation))
    }
  }

  if (!datasetAvailable && archiveAvailable) {
    message("attempting to extract and load: ", archiveLocation, " to: ", datasetLocation, appendLF = TRUE)
    extractLoadData(
      from = archiveLocation,
      to = datasetLocation,
      dbms = dbms,
      cdmVersion = cdmVersion,
      inputFormat = inputFormat,
      verbose = verbose
    )
    datasetAvailable <- TRUE
  }

  if (verbose) {
    message("copying: ", datasetLocation, " to: ", databaseFile, appendLF = TRUE)
  }

  # Hand the caller a private copy so the cached database is never modified.
  copySuccess <- file.copy(from = datasetLocation, to = databaseFile, overwrite = overwrite)
  if (isFALSE(copySuccess)) {
    stop(paste("File copy from", datasetLocation, "to", databaseFile, "failed!"))
  }
  invisible(databaseFile)
}
#' Download Eunomia data files
#'
#' Download the Eunomia data files from https://github.com/OHDSI/EunomiaDatasets
#'
#' @details
#' The base URL can be overridden with the EUNOMIA_DATASETS_URL environment variable. The archive
#' is requested from \code{baseUrl/datasetName/datasetName_cdmVersion.zip}.
#'
#' @param datasetName The data set name as found on https://github.com/OHDSI/EunomiaDatasets. The
#'                    data set name corresponds to the folder with the data set ZIP files
#' @param cdmVersion  The OMOP CDM version. This version will appear in the suffix of the data file,
#'                    for example: GiBleed_5.3.zip. Default: '5.3'
#' @param pathToData  The path where the Eunomia data is stored on the file system. By default the
#'                    value of the environment variable "EUNOMIA_DATA_FOLDER" is used. When empty,
#'                    the session temporary directory is used and a warning is issued once.
#' @param overwrite   Control whether the existing archive file will be overwritten should it already
#'                    exist.
#' @param verbose     Provide additional logging details during execution.
#' @return
#' Invisibly returns the path of the zip archive, both when it was downloaded and when an existing
#' archive was kept. An error is raised when the download fails.
#' @examples
#' \dontrun{
#' downloadEunomiaData("GiBleed")
#' }
#' @export
downloadEunomiaData <- function(datasetName,
                                cdmVersion = "5.3",
                                pathToData = Sys.getenv("EUNOMIA_DATA_FOLDER"),
                                overwrite = FALSE,
                                verbose = FALSE) {
  if (is.null(pathToData) || is.na(pathToData) || pathToData == "") {
    pathToData <- tempdir()
    warningContent <- paste("The pathToData argument was not specified and the EUNOMIA_DATA_FOLDER environment variable was not set. Using", pathToData)
    rlang::warn(warningContent, .frequency = c("once"), .frequency_id = "data_folder")
  }

  if (is.null(datasetName) || is.na(datasetName) || datasetName == "") {
    stop("The datasetName argument must be specified.")
  }

  if (!dir.exists(pathToData)) {
    dir.create(pathToData, recursive = TRUE)
  }

  zipName <- paste0(datasetName, "_", cdmVersion, ".zip")
  destination <- file.path(pathToData, zipName)

  if (file.exists(destination) && !overwrite) {
    message("Dataset already exists (", destination, "). Specify overwrite=T to overwrite existing zip archive.", appendLF = TRUE)
    # Return the path here too; falling off the end of this branch would hand
    # the caller the NULL produced by message().
    return(invisible(destination))
  }

  # downloads the file from github or user specified location
  baseUrl <- Sys.getenv("EUNOMIA_DATASETS_URL")
  if (baseUrl == "") {
    baseUrl <- "https://raw.githubusercontent.com/OHDSI/EunomiaDatasets/main/datasets"
  }
  url <- paste(baseUrl, datasetName, zipName, sep = "/")

  status <- tryCatch(
    # mode = "wb": the archive is binary and must not be written in text mode.
    utils::download.file(url = url, destfile = destination, mode = "wb"),
    error = function(e) {
      unlink(destination) # do not leave a truncated archive in the cache
      stop("Failed to download ", url, ": ", conditionMessage(e), call. = FALSE)
    }
  )
  # download.file() reports some failures through a non-zero status instead of
  # an error, so check it explicitly.
  if (!identical(as.integer(status), 0L) || !file.exists(destination)) {
    unlink(destination)
    stop("Failed to download ", url, " (status ", status, ")", call. = FALSE)
  }

  invisible(destination)
}
#' Extract the Eunomia data files and load into a database
#'
#' Extracts the files from a .ZIP archive into a private temporary folder and loads them into a
#' new OMOP CDM database file at \code{to}. The temporary folder is removed when the function exits.
#'
#' @param from The path to the .ZIP file that contains the csv CDM source files
#' @param to The path to the .sqlite or .duckdb file that will be created
#' @param dbms The file based database system to use: 'sqlite' (default) or 'duckdb'
#' @param cdmVersion The version of the OMOP CDM that are represented in the archive files.
#' @param inputFormat The format of the files expected in the archive. (csv or parquet)
#' @param verbose Provide additional logging details during execution.
#' @returns No return value, called to load archive into a database file.
#' @importFrom tools file_ext
#' @examples
#' \dontrun{
#' extractLoadData(
#'   from = "c:/strategusData/GiBleed_5.3.zip",
#'   to = "c:/strategusData/GiBleed_5.3.sqlite"
#' )
#' }
#' @seealso
#'  \code{\link[Eunomia]{downloadEunomiaData}}
#' @export
extractLoadData <- function(from,
                            to,
                            dbms = "sqlite",
                            cdmVersion = "5.3",
                            inputFormat = "csv",
                            verbose = FALSE) {
  stopifnot(dbms == "sqlite" || dbms == "duckdb")
  stopifnot(is.character(from), length(from) == 1, nchar(from) > 0)
  stopifnot(is.character(to), length(to) == 1, nchar(to) > 0)
  # Compare case-insensitively so "DATA.ZIP" is accepted as well.
  if (tolower(tools::file_ext(from)) != "zip") {
    stop("Source must be a .zip file")
  }
  if (!file.exists(from)) {
    stop(paste0("zipped archive '", from, "' not found!"))
  }

  # Unzip into a fresh sub-directory rather than the shared session tempdir():
  # loadDataFiles() loads every matching file in the folder it is given, so
  # leftovers from a previously extracted dataset would be loaded too.
  unzipLocation <- tempfile(pattern = "eunomia-unzip-")
  dir.create(unzipLocation, recursive = TRUE)
  # recursive = TRUE is required; unlink() does not remove directories otherwise.
  on.exit(unlink(unzipLocation, recursive = TRUE), add = TRUE)

  if (verbose) {
    message("unzipping to: ", unzipLocation, appendLF = TRUE)
  }
  utils::unzip(zipfile = from, exdir = unzipLocation, junkpaths = TRUE)

  loadDataFiles(
    dataPath = unzipLocation,
    dbPath = to,
    dbms = dbms,
    cdmVersion = cdmVersion,
    inputFormat = inputFormat,
    verbose = verbose
  )

  invisible(NULL)
}
#' Load data files into a database(sqlite or duckdb)
#'
#' Load data from csv or parquet files into a database file (sqlite or duckdb). The CDM tables are
#' first created from the DDL generated by \code{CommonDataModel::writeDdl()}, then every data file
#' is appended to the table named after the (lower-cased) file name.
#'
#' @param dataPath The path to the directory containing CDM source files (csv or parquet)
#' @param dbPath The path to the .sqlite or .duckdb file that will be created
#' @param dbms The file-based database system to use: 'sqlite' (default) or 'duckdb'
#' @param inputFormat The input format of the files to load.  Supported formats include csv, parquet.
#' @param cdmVersion The CDM version to create in the resulting database. Supported versions are 5.3 and 5.4
#' @param cdmDatabaseSchema The schema in which to create the CDM tables. Default is main.
#' @param verbose Provide additional logging details during execution.
#' @param overwrite Remove and replace an existing data set.
#' @returns No return value, loads data into database file.
#' @export
loadDataFiles <- function(dataPath,
                          dbPath,
                          inputFormat = "csv",
                          cdmVersion = "5.3",
                          cdmDatabaseSchema = "main",
                          dbms = "sqlite",
                          verbose = FALSE,
                          overwrite = FALSE) {
  stopifnot(inputFormat %in% c("csv", "parquet"))
  stopifnot(dbms == "sqlite" || dbms == "duckdb")
  stopifnot(is.character(dataPath), length(dataPath) == 1, nchar(dataPath) > 0)
  stopifnot(is.character(dbPath), length(dbPath) == 1, nchar(dbPath) > 0)

  # list.files() takes a regular expression, not a glob: anchor on the literal
  # extension so only "<name>.csv" / "<name>.parquet" files are selected.
  dataFiles <- sort(list.files(path = dataPath, pattern = paste0("\\.", inputFormat, "$")))
  if (length(dataFiles) <= 0) {
    stop("Data directory does not contain files to load into the database.")
  }

  if (verbose) {
    message("connecting to: ", dbms, appendLF = TRUE)
  }

  if (overwrite && file.exists(dbPath)) {
    if (verbose) {
      message("deleting existing file: ", dbPath, appendLF = TRUE)
    }
    unlink(dbPath)
  }

  if (dbms == "sqlite") {
    connection <- DBI::dbConnect(RSQLite::SQLite(), dbname = dbPath)
    on.exit(DBI::dbDisconnect(connection), add = TRUE)
  } else if (dbms == "duckdb") {
    connection <- DBI::dbConnect(duckdb::duckdb(), dbdir = dbPath)
    # shutdown = TRUE already shuts down the database behind this connection;
    # creating a second driver object just to shut it down has no effect.
    on.exit(DBI::dbDisconnect(connection, shutdown = TRUE), add = TRUE)
  }

  # creating tables via DDL eliminates issues with inferring column types
  # avoiding use of executeDdl as it requires DatabaseConnector which has some
  # issues with managing tables in Sqlite & DuckDb
  #
  # The DDL is written to its own scratch folder. Using the session tempdir()
  # directly would force us to delete (and later execute) every *.sql file that
  # happens to live there, including files that are not ours.
  tempDdlFolder <- tempfile(pattern = "eunomia-ddl-")
  dir.create(tempDdlFolder, recursive = TRUE)
  on.exit(unlink(tempDdlFolder, recursive = TRUE), add = TRUE)

  CommonDataModel::writeDdl(
    targetDialect = dbms,
    cdmVersion = cdmVersion,
    cdmDatabaseSchema = cdmDatabaseSchema,
    outputfolder = tempDdlFolder
  )

  ddlFiles <- sort(list.files(path = tempDdlFolder, full.names = TRUE, pattern = ".*\\.sql$"))

  for (ddlFile in ddlFiles) {
    if (verbose) {
      message("executing ddl statements from: ", ddlFile, appendLF = TRUE)
    }

    ddlFileContents <- readChar(ddlFile, file.info(ddlFile)$size)
    statements <- strsplit(ddlFileContents, ";", fixed = TRUE)[[1]]
    # Splitting on ";" leaves a whitespace-only tail after the last statement.
    statements <- statements[nzchar(trimws(statements))]
    for (statement in statements) {
      DBI::dbExecute(conn = connection, statement = statement)
    }
  }

  for (dataFile in dataFiles) {
    if (verbose) {
      dataFileMessage <- paste("loading file: ", dataFile)
      message(dataFileMessage, appendLF = TRUE)
    }

    if (inputFormat == "csv") {
      tableData <- readr::read_csv(
        file = file.path(dataPath, dataFile),
        show_col_types = FALSE
      )
    } else if (inputFormat == "parquet") {
      tableData <- arrow::read_parquet(
        file = file.path(dataPath, dataFile)
      )
    }

    names(tableData) <- tolower(names(tableData))
    tableName <- tools::file_path_sans_ext(tolower(dataFile))

    if (dbms == "sqlite") {
      # Date and date-time columns are written to sqlite as numeric seconds
      # since 1970-01-01 (GMT).
      for (j in seq_len(ncol(tableData))) {
        column <- tableData[[j]]
        if (inherits(column, "Date")) {
          tableData[[j]] <- as.numeric(as.POSIXct(as.character(column), origin = "1970-01-01", tz = "GMT"))
        } else if (inherits(column, "POSIXct")) {
          tableData[[j]] <- as.numeric(as.POSIXct(column, origin = "1970-01-01", tz = "GMT"))
        }
      }
    }

    if (verbose) {
      message("saving table: ", tableName, " (rows: ", nrow(tableData), ")", appendLF = TRUE)
    }

    # append = TRUE: the table already exists, created by the DDL above.
    DBI::dbWriteTable(conn = connection, name = tableName, value = tableData, append = TRUE)
  }

  invisible(NULL)
}
#' Export data files from a database(sqlite or duckdb)
#'
#' Helper function to export data to csv or parquet files from a database file (sqlite or duckdb).
#' One file named after each table is written to \code{outputFolder}, which is created when it
#' does not exist.
#'
#' @param dbPath The path to the source .sqlite or .duckdb file
#' @param outputFolder The path to the export destination directory
#' @param dbms The file-based database system to use: 'sqlite' (default) or 'duckdb'
#' @param outputFormat The output format for the files. Supported formats include csv, parquet.
#' @param verbose Boolean argument controlling verbose debugging output
#' @returns No return value, called to export to outputFolder.
#' @export
exportDataFiles <- function(dbPath,
                            outputFolder,
                            outputFormat = "csv",
                            dbms = "sqlite",
                            verbose = FALSE) {
  stopifnot(outputFormat %in% c("csv", "parquet"))
  stopifnot(dbms %in% c("sqlite", "duckdb"))
  stopifnot(is.character(dbPath), length(dbPath) == 1, nchar(dbPath) > 0)
  # Connecting to a path that does not exist would silently create an empty
  # database file and "export" zero tables.
  if (!file.exists(dbPath)) {
    stop(paste0("database file '", dbPath, "' not found!"))
  }

  if (dbms == "sqlite") {
    connection <- DBI::dbConnect(RSQLite::SQLite(), dbname = dbPath)
    on.exit(DBI::dbDisconnect(connection), add = TRUE)
  } else if (dbms == "duckdb") {
    connection <- DBI::dbConnect(duckdb::duckdb(), dbdir = dbPath)
    on.exit(DBI::dbDisconnect(connection, shutdown = TRUE), add = TRUE)
  }

  tableNames <- DBI::dbListTables(connection)
  message("processing ", length(tableNames), " tables", appendLF = TRUE)

  if (!dir.exists(outputFolder)) {
    dir.create(path = outputFolder, recursive = TRUE)
  }

  for (tableName in tableNames) {
    if (verbose) {
      message("processing ", tableName, appendLF = TRUE)
    }

    filePath <- file.path(outputFolder, paste(tableName, outputFormat, sep = "."))
    # Quote identifiers and literals instead of pasting them raw into SQL.
    quotedTable <- DBI::dbQuoteIdentifier(connection, tableName)

    if (outputFormat == "parquet" && dbms == "duckdb") {
      # duckdb can write parquet itself.
      query <- paste0(
        "COPY ", quotedTable, " TO ", DBI::dbQuoteString(connection, filePath),
        " (FORMAT PARQUET);"
      )
      DBI::dbExecute(connection, query)
    } else {
      data <- DBI::dbGetQuery(connection, paste("SELECT * FROM", quotedTable))
      if (outputFormat == "csv") {
        # row.names = FALSE: a row-name column would add an unnamed extra field
        # that does not exist in the table, so the file could not be loaded back.
        utils::write.csv(data, filePath, row.names = FALSE, na = "")
      } else {
        # sqlite has no COPY ... (FORMAT PARQUET); write the fetched rows with arrow.
        arrow::write_parquet(data, filePath)
      }
    }
  }

  invisible(NULL)
}
--git a/R/Export.R b/R/Export.R deleted file mode 100644 index df90319..0000000 --- a/R/Export.R +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright 2020 Observational Health Data Sciences and Informatics -# -# This file is part of Eunomia -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -#' Extract the Eunomia database to CSV files -#' -#' @param outputFolder A folder where the CSV files will be written. -#' @param connectionDetails Connection details for the Eunomia database. Defaults to a fresh Eunomia -#' database. 
-#' -#' -#' @examples -#' \donttest{ -#' # For this example we'll create a temp folder: -#' folder <- tempfile() -#' dir.create(folder) -#' -#' exportToCsv(folder) -#' -#' list.files(folder) -#' -#' # [1] "CARE_SITE.csv" "CDM_SOURCE.csv" "COHORT.csv" -#' # [4] "COHORT_ATTRIBUTE.csv" "CONCEPT.csv" "CONCEPT_ANCESTOR.csv" -#' # [7] "CONCEPT_CLASS.csv" "CONCEPT_RELATIONSHIP.csv" "CONCEPT_SYNONYM.csv" -#' # [10] "CONDITION_ERA.csv" "CONDITION_OCCURRENCE.csv" "COST.csv" -#' # [13] "DEATH.csv" "DEVICE_EXPOSURE.csv" "DOMAIN.csv" -#' # [16] "DOSE_ERA.csv" "DRUG_ERA.csv" "DRUG_EXPOSURE.csv" -#' # [19] "DRUG_STRENGTH.csv" "FACT_RELATIONSHIP.csv" "LOCATION.csv" -#' # [22] "MEASUREMENT.csv" "METADATA.csv" "NOTE.csv" -#' # [25] "NOTE_NLP.csv" "OBSERVATION.csv" "OBSERVATION_PERIOD.csv" -#' # [28] "PAYER_PLAN_PERIOD.csv" "PERSON.csv" "PROCEDURE_OCCURRENCE.csv" -#' # [31] "PROVIDER.csv" "RELATIONSHIP.csv" "SOURCE_TO_CONCEPT_MAP.csv" -#' # [34] "SPECIMEN.csv" "VISIT_DETAIL.csv" "VISIT_OCCURRENCE.csv" -#' # [37] "VOCABULARY.csv" -#' -#' # Cleaning up the temp folder used in this example: -#' unlink(folder, recursive = TRUE) -#' } -#' -#' @export -exportToCsv <- function(outputFolder = file.path(getwd(), "csv"), - connectionDetails = getEunomiaConnectionDetails()) { - if (!file.exists(outputFolder)) { - dir.create(outputFolder, recursive = TRUE) - } - conn <- DatabaseConnector::connect(connectionDetails) - on.exit(DatabaseConnector::disconnect(conn)) - tables <- DatabaseConnector::getTableNames(conn, "main") - saveCsv <- function(table) { - fileName <- file.path(outputFolder, sprintf("%s.csv", table)) - writeLines(sprintf("Saving table %s to file %s", table, fileName)) - data <- DatabaseConnector::renderTranslateQuerySql(conn, "SELECT * FROM @table;", table = table) - write_csv(data, fileName, na = "") - return(NULL) - } - lapply(tables, saveCsv) - writeLines(sprintf("Done writing CSV files to %s.", outputFolder)) - invisible(NULL) -} diff --git a/README.md b/README.md index 
03a17ac..908cfdd 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ Eunomia ======= -[![Build Status](https://travis-ci.org/OHDSI/Eunomia.svg?branch=main)](https://travis-ci.org/OHDSI/Eunomia) -[![codecov.io](https://codecov.io/github/OHDSI/Eunomia/coverage.svg?branch=main)](https://codecov.io/github/OHDSI/Eunomia?branch=main) +[![Build Status](https://github.com/OHDSI/Eunomia/workflows/R-CMD-check/badge.svg)](https://github.com/OHDSI/Eunomia/actions?query=workflow%3AR-CMD-check) +[![codecov.io](https://codecov.io/github/OHDSI/Eunomia/coverage.svg?branch=main)](https://app.codecov.io/github/OHDSI/Eunomia?branch=main) [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/Eunomia)](https://cran.r-project.org/package=Eunomia) [![CRAN_Status_Badge](http://cranlogs.r-pkg.org/badges/Eunomia)](https://cran.r-project.org/package=Eunomia) @@ -10,14 +10,14 @@ Eunomia is part of [HADES](https://ohdsi.github.io/Hades/). Introduction ============ -Eunomia is a standard dataset in the OMOP (Observational Medical Outcomes Partnership) Common Data Model (CDM) for testing and demonstration purposes. Eunomia is used for many of the exercises in [the Book of OHDSI](https://ohdsi.github.io/TheBookOfOhdsi/). For functions that require schema name, use 'main'. +Eunomia is a standard dataset manager for sample OMOP (Observational Medical Outcomes Partnership) Common Data Model (CDM) datasets. Eunomia facilitates access to sample datasets from the [EunomiaDatasets repository](https://github.com/ohdsi/EunomiaDatasets). Eunomia is used for testing and demonstration purposes, including many of the exercises in [the Book of OHDSI](https://ohdsi.github.io/TheBookOfOhdsi/). For functions that require schema name, use 'main'. Features ======== -- Provides a small simulated dataset in the CDM. -- Also includes a subset of the Standardized Vocabularies. 
+- Download selected sample datasets from [EunomiaDatasets repository](https://github.com/ohdsi/EunomiaDatasets), which includes a subset of the Standardized Vocabularies. - Interfaces with the DatabaseConnector and SqlRender packages. -- No need to set up a database server. Eunomia runs in your R instance (using SQLite). +- No need to set up a database server. Eunomia runs in your R instance (currently using SQLite). +- (planned) supports for other databases Example ======= @@ -36,11 +36,11 @@ disconnect(connection) Technology ========== -Eunomia is an R package containing a SQLite database. +Eunomia is an R package providing access to sample datasets at [EunomiaDatasets repository](https://github.com/ohdsi/EunomiaDatasets). System Requirements =================== -Requires R. Some of the packages required by Eunomia require Java. +Requires R. Some of the packages required by Eunomia require Java. Installation ============ @@ -52,7 +52,7 @@ Installation ```r install.packages("Eunomia") ``` - + User Documentation ================== Documentation can be found on the [package website](https://ohdsi.github.io/Eunomia/). diff --git a/cran-comments.md b/cran-comments.md index ce3d7e7..a9e0d43 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,3 +1,20 @@ +Release of 2.0 version of package. +Resolved CRAN feedback issues + +--- + +Wrapped additional test code with java support checks. + +--- + +Removed LazyData setting. + +--- + +Resolved check errors. + +--- + Updated Date to current Date. This data package will not be updated frequently. It is similar to mtcars or iris. The sample data is provided in a standard data format that enables uniform storage of observational health care data, and is widely used for health care analytics. The data does not need to be updated frequently, it has remained static for almost two years. 
diff --git a/docs/404.html b/docs/404.html index a6a6de6..f293825 100644 --- a/docs/404.html +++ b/docs/404.html @@ -32,7 +32,7 @@ Eunomia - 1.0.3 + 2.0.0 @@ -86,7 +86,7 @@

Page not found (404)