From d4d2dd95165648d114f7e689d022e8ad4a15af26 Mon Sep 17 00:00:00 2001 From: Carl Boettiger Date: Thu, 22 Aug 2024 04:45:05 +0000 Subject: [PATCH 1/3] patch --- DESCRIPTION | 2 +- NAMESPACE | 1 + NEWS.md | 9 ++++++++- R/cached_connection.R | 18 ++++++++++++++++-- R/parse_uri.R | 7 +++++-- R/write_dataset.R | 18 ++++++++++++++++++ man/as_dataset.Rd | 21 +++++++++++++++++++++ man/cached_connection.Rd | 22 +++++++++++++++++++++- 8 files changed, 91 insertions(+), 7 deletions(-) create mode 100644 man/as_dataset.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 2a21c7c..aa730df 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: duckdbfs Title: High Performance Remote File System, Database and 'Geospatial' Access Using 'duckdb' -Version: 0.0.6 +Version: 0.0.7 Authors@R: c(person("Carl", "Boettiger", , "cboettig@gmail.com", c("aut", "cre"), comment = c(ORCID = "0000-0002-1642-628X")), diff --git a/NAMESPACE b/NAMESPACE index 5c35900..5707485 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,6 @@ # Generated by roxygen2: do not edit by hand +export(as_dataset) export(as_view) export(cached_connection) export(close_connection) diff --git a/NEWS.md b/NEWS.md index 004dcba..f3b5e74 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,13 @@ + # duckdbfs 0.0.6 -* bugfix open_dataset() uses random table name by default, avoid naming collisions. +* The default `cached_connection()` helper will configure a temporary storage location by default. + It also now supports all options supported by `duckdb::duckdb()` for connection creation. +* New `as_dataset()` utility copies a local in-memory data.frame into the connection. +* bugfix: reading from local disk recursively no longer requires manual `**`. + Also, trying to read from an existing _local_ file won't try and append recursive search + even when given the default recursive=TRUE option. +* bugfix: `open_dataset()` uses random table name by default, avoid naming collisions. # duckdbfs 0.0.5 diff --git a/R/cached_connection.R b/R/cached_connection.R index e46157a..1ad303f 100644 --- a/R/cached_connection.R +++ b/R/cached_connection.R @@ -21,6 +21,12 @@ duckdbfs_env <- new.env() #' At the close of the global environment, this function's finalizer #' should gracefully shutdown the connection before removing the cache. #' +#' +#' By default, this function creates an in-memory connection. When reading +#' from on-disk or remote files (parquet or csv), this option can still +#' effectively support most operations on much-larger-than-RAM data. +#' However, some operations require additional working space, so by default +#' we set a temporary storage location in configuration as well. #' @inheritParams duckdb::duckdb #' @returns a [duckdb::duckdb()] connection object #' @examples @@ -31,7 +37,10 @@ duckdbfs_env <- new.env() #' @export #' cached_connection <- function(dbdir = ":memory:", - read_only = FALSE) { + read_only = FALSE, + bigint = "numeric", + config = list(tempdir = tempfile()) + ) { #conn <- mget("duckdbfs_conn", envir = duckdbfs_env, # ifnotfound = list(NULL))$duckdbfs_conn @@ -50,9 +59,14 @@ cached_connection <- function(dbdir = ":memory:", if(getOption("duckdbfs_debug", FALSE)) { message("Making a duckdb connection!") } + conn <- DBI::dbConnect(duckdb::duckdb(), dbdir = dbdir, - read_only = read_only) + read_only = read_only, + bigint = bigint, + config = config) + + options(duckdbfs_conn = conn) # assign("duckdbfs_conn", conn, envir = duckdbfs_env) diff --git a/R/parse_uri.R b/R/parse_uri.R index 8f44f96..00fefcc 100644 --- a/R/parse_uri.R +++ b/R/parse_uri.R @@ -38,11 +38,14 @@ parse_uri <- function(sources, conn, recursive = TRUE) { s3_use_ssl = as.integer(use_ssl)) sources <- paste0(url$scheme, "://", url$hostname, url$path) - if(recursive) { + } + if(recursive) { + # Don't use recursive directory globs if we know it is a local file. + # Otherwise, we append the "/**". + if ( !file.exists(sources) ){ sources <- gsub("\\/$", "", sources) sources <- paste0(sources, "/**") } - } sources } diff --git a/R/write_dataset.R b/R/write_dataset.R index 3331e3b..b47b8a6 100644 --- a/R/write_dataset.R +++ b/R/write_dataset.R @@ -77,3 +77,21 @@ remote_name <- function (x, con) out } +#' as_dataset +#' +#' Push a local (in-memory) dataset into a the duckdb database as a table. +#' This enables it to share the connection source with other data. +#' This is equivalent to the behavior of copy=TRUE on many (but not all) of the two-table verbs in dplyr. +#' @param df a local data frame. Otherwise will be passed back without side effects +#' @return a remote `dplyr::tbl` connection to the table. +#' @inheritParams open_dataset +#' @export +as_dataset <- function(df, conn = cached_connection()) { + if(is_not_remote(df)) { + tblname = tmp_tbl_name() + DBI::dbWriteTable(conn, name = tblname, value = df) + df = dplyr::tbl(conn, tblname) + } + return(df) +} + diff --git a/man/as_dataset.Rd b/man/as_dataset.Rd new file mode 100644 index 0000000..243e37f --- /dev/null +++ b/man/as_dataset.Rd @@ -0,0 +1,21 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/write_dataset.R +\name{as_dataset} +\alias{as_dataset} +\title{as_dataset} +\usage{ +as_dataset(df, conn = cached_connection()) +} +\arguments{ +\item{df}{a local data frame. Otherwise will be passed back without side effects} + +\item{conn}{A connection to a database.} +} +\value{ +a remote \code{dplyr::tbl} connection to the table. +} +\description{ +Push a local (in-memory) dataset into a the duckdb database as a table. +This enables it to share the connection source with other data. +This is equivalent to the behavior of copy=TRUE on many (but not all) of the two-table verbs in dplyr. +} diff --git a/man/cached_connection.Rd b/man/cached_connection.Rd index 3d6f636..8920a65 100644 --- a/man/cached_connection.Rd +++ b/man/cached_connection.Rd @@ -4,7 +4,12 @@ \alias{cached_connection} \title{create a cachable duckdb connection} \usage{ -cached_connection(dbdir = ":memory:", read_only = FALSE) +cached_connection( + dbdir = ":memory:", + read_only = FALSE, + bigint = "numeric", + config = list(tempdir = tempfile()) +) } \arguments{ \item{dbdir}{Location for database files. Should be a path to an existing @@ -15,6 +20,15 @@ data is kept in RAM.} For file-based databases, this is only applied when the database file is opened for the first time. Subsequent connections (via the same \code{drv} object or a \code{drv} object pointing to the same path) will silently ignore this flag.} + +\item{bigint}{How 64-bit integers should be returned. There are two options: \code{"numeric"} and \code{"integer64"}. +If \code{"numeric"} is selected, bigint integers will be treated as double/numeric. +If \code{"integer64"} is selected, bigint integers will be set to bit64 encoding.} + +\item{config}{Named list with DuckDB configuration flags, see +\url{https://duckdb.org/docs/configuration/overview#configuration-reference} for the possible options. +These flags are only applied when the database object is instantiated. +Subsequent connections will silently ignore these flags.} } \value{ a \code{\link[duckdb:duckdb]{duckdb::duckdb()}} connection object @@ -37,6 +51,12 @@ connection object, because functions needing access to the connection can use this to create or access the existing connection. At the close of the global environment, this function's finalizer should gracefully shutdown the connection before removing the cache. + +By default, this function creates an in-memory connection. When reading +from on-disk or remote files (parquet or csv), this option can still +effectively support most operations on much-larger-than-RAM data. +However, some operations require additional working space, so by default +we set a temporary storage location in configuration as well. } \examples{ From 4904a8e9ffdb2c74ae798fbc420b07875b020690 Mon Sep 17 00:00:00 2001 From: Carl Boettiger Date: Thu, 22 Aug 2024 05:07:04 +0000 Subject: [PATCH 2/3] bugfix --- R/parse_uri.R | 21 ++++++++++----------- tests/testthat/test-write_dataset.R | 2 +- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/R/parse_uri.R b/R/parse_uri.R index 00fefcc..d8e56f4 100644 --- a/R/parse_uri.R +++ b/R/parse_uri.R @@ -1,16 +1,15 @@ parse_uri <- function(sources, conn, recursive = TRUE) { - # Local file-systems don't need S3 parsing - # But use recursion only if local source is a directory - if(!any(grepl("^[http|s3:]", sources))) { - is_dir <- dir.exists(sources) - if(recursive) { - sources[is_dir] <- paste0(sources[is_dir], "/**") - } - return(sources) + if(any(grepl("^\\w+://"), sources)) { + # local file paths that don't require network should not attempt to load it + # Maybe unnecessary as httpfs should be bundled with R's binary duckdb + load_httpfs(conn) } - load_httpfs(conn) + # http URLs pass through as is, can't do recursion + if(any(grepl("^http", sources))) { + return(sources) + } ## for now only parse sources of length-1 if(length(sources) > 1) return(sources) @@ -19,7 +18,6 @@ parse_uri <- function(sources, conn, recursive = TRUE) { # first strip any * for compatibility sources <- gsub("/\\*+$", "", sources) - url <- url_parse(sources) scheme <- url$query[["scheme"]] use_ssl <- !identical(scheme, "http") @@ -39,10 +37,11 @@ parse_uri <- function(sources, conn, recursive = TRUE) { sources <- paste0(url$scheme, "://", url$hostname, url$path) } + if(recursive) { # Don't use recursive directory globs if we know it is a local file. # Otherwise, we append the "/**". - if ( !file.exists(sources) ){ + if ( !fs::is_file(sources) ){ sources <- gsub("\\/$", "", sources) sources <- paste0(sources, "/**") } diff --git a/tests/testthat/test-write_dataset.R b/tests/testthat/test-write_dataset.R index 32b9d91..99cd596 100644 --- a/tests/testthat/test-write_dataset.R +++ b/tests/testthat/test-write_dataset.R @@ -43,7 +43,7 @@ test_that("write_dataset partitions", { group_by(cyl, gear) |> write_dataset(path) - expect_true(file.exists(path)) + expect_true(dir.exists(path)) df <- open_dataset(path) expect_s3_class(df, "tbl") parts <- list.files(path) From 0dd32987a8f48add66a522d866c2ac81b40f7b93 Mon Sep 17 00:00:00 2001 From: Carl Boettiger Date: Thu, 22 Aug 2024 05:07:23 +0000 Subject: [PATCH 3/3] bugfix --- R/parse_uri.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/parse_uri.R b/R/parse_uri.R index d8e56f4..cef6d82 100644 --- a/R/parse_uri.R +++ b/R/parse_uri.R @@ -1,6 +1,6 @@ parse_uri <- function(sources, conn, recursive = TRUE) { - if(any(grepl("^\\w+://"), sources)) { + if(any(grepl("^\\w+://", sources))) { # local file paths that don't require network should not attempt to load it # Maybe unnecessary as httpfs should be bundled with R's binary duckdb load_httpfs(conn)