Skip to content

Commit

Permalink
Merge pull request #27 from cboettig/patch/v0.0.6
Browse files Browse the repository at this point in the history
Patch/v0.0.6
  • Loading branch information
cboettig authored Aug 22, 2024
2 parents 552ac79 + 0dd3298 commit 6b885d8
Show file tree
Hide file tree
Showing 9 changed files with 101 additions and 18 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: duckdbfs
Title: High Performance Remote File System, Database and 'Geospatial' Access Using 'duckdb'
Version: 0.0.6
Version: 0.0.7
Authors@R:
c(person("Carl", "Boettiger", , "[email protected]", c("aut", "cre"),
comment = c(ORCID = "0000-0002-1642-628X")),
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand

export(as_dataset)
export(as_view)
export(cached_connection)
export(close_connection)
Expand Down
9 changes: 8 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@

# duckdbfs 0.0.6

* bugfix open_dataset() uses random table name by default, avoid naming collisions.
* The default `cached_connection()` helper will configure a temporary storage location by default.
It also now supports all options supported by `duckdb::duckdb()` for connection creation.
* New `as_dataset()` utility copies a local in-memory data.frame into the connection.
* bugfix: reading from local disk recursively no longer requires manual `**`.
Also, trying to read from an existing _local_ file won't try to append a recursive search
even when given the default `recursive = TRUE` option.
* bugfix: `open_dataset()` uses a random table name by default, avoiding naming collisions.

# duckdbfs 0.0.5

Expand Down
18 changes: 16 additions & 2 deletions R/cached_connection.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ duckdbfs_env <- new.env()
#' At the close of the global environment, this function's finalizer
#' should gracefully shutdown the connection before removing the cache.
#'
#'
#' By default, this function creates an in-memory connection. When reading
#' from on-disk or remote files (parquet or csv), this option can still
#' effectively support most operations on much-larger-than-RAM data.
#' However, some operations require additional working space, so by default
#' we set a temporary storage location in configuration as well.
#' @inheritParams duckdb::duckdb
#' @returns a [duckdb::duckdb()] connection object
#' @examples
Expand All @@ -31,7 +37,10 @@ duckdbfs_env <- new.env()
#' @export
#'
cached_connection <- function(dbdir = ":memory:",
read_only = FALSE) {
read_only = FALSE,
bigint = "numeric",
config = list(tempdir = tempfile())
) {

#conn <- mget("duckdbfs_conn", envir = duckdbfs_env,
# ifnotfound = list(NULL))$duckdbfs_conn
Expand All @@ -50,9 +59,14 @@ cached_connection <- function(dbdir = ":memory:",
if(getOption("duckdbfs_debug", FALSE)) {
message("Making a duckdb connection!")
}

conn <- DBI::dbConnect(duckdb::duckdb(),
dbdir = dbdir,
read_only = read_only)
read_only = read_only,
bigint = bigint,
config = config)


options(duckdbfs_conn = conn)

# assign("duckdbfs_conn", conn, envir = duckdbfs_env)
Expand Down
26 changes: 14 additions & 12 deletions R/parse_uri.R
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
parse_uri <- function(sources, conn, recursive = TRUE) {

# Local file-systems don't need S3 parsing
# But use recursion only if local source is a directory
if(!any(grepl("^[http|s3:]", sources))) {
is_dir <- dir.exists(sources)
if(recursive) {
sources[is_dir] <- paste0(sources[is_dir], "/**")
}
return(sources)
if(any(grepl("^\\w+://", sources))) {
# local file paths that don't require network should not attempt to load it
# Maybe unnecessary as httpfs should be bundled with R's binary duckdb
load_httpfs(conn)
}

load_httpfs(conn)
# http URLs pass through as is, can't do recursion
if(any(grepl("^http", sources))) {
return(sources)
}

## for now only parse sources of length-1
if(length(sources) > 1) return(sources)
Expand All @@ -19,7 +18,6 @@ parse_uri <- function(sources, conn, recursive = TRUE) {
# first strip any * for compatibility
sources <- gsub("/\\*+$", "", sources)


url <- url_parse(sources)
scheme <- url$query[["scheme"]]
use_ssl <- !identical(scheme, "http")
Expand All @@ -38,11 +36,15 @@ parse_uri <- function(sources, conn, recursive = TRUE) {
s3_use_ssl = as.integer(use_ssl))

sources <- paste0(url$scheme, "://", url$hostname, url$path)
if(recursive) {
}

if(recursive) {
# Don't use recursive directory globs if we know it is a local file.
# Otherwise, we append the "/**".
if ( !fs::is_file(sources) ){
sources <- gsub("\\/$", "", sources)
sources <- paste0(sources, "/**")
}

}
sources
}
Expand Down
18 changes: 18 additions & 0 deletions R/write_dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,21 @@ remote_name <- function (x, con)
out
}

#' as_dataset
#'
#' Push a local (in-memory) data frame into the duckdb database as a table.
#' This enables it to share the connection source with other data.
#' This is equivalent to the behavior of `copy = TRUE` on many (but not all)
#' of the two-table verbs in dplyr.
#' @param df a local data frame. A remote table (one already backed by the
#'   connection) is passed back unchanged, with no side effects.
#' @return a remote `dplyr::tbl` connection to the table.
#' @inheritParams open_dataset
#' @export
as_dataset <- function(df, conn = cached_connection()) {
  # Already a remote table on a connection: nothing to copy, return as-is.
  if (!is_not_remote(df)) {
    return(df)
  }

  # Copy the local data.frame into the database under a random,
  # collision-safe table name, then hand back a lazy dplyr handle to it.
  tblname <- tmp_tbl_name()
  DBI::dbWriteTable(conn, name = tblname, value = df)
  dplyr::tbl(conn, tblname)
}

21 changes: 21 additions & 0 deletions man/as_dataset.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 21 additions & 1 deletion man/cached_connection.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion tests/testthat/test-write_dataset.R
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ test_that("write_dataset partitions", {
group_by(cyl, gear) |>
write_dataset(path)

expect_true(file.exists(path))
expect_true(dir.exists(path))
df <- open_dataset(path)
expect_s3_class(df, "tbl")
parts <- list.files(path)
Expand Down

0 comments on commit 6b885d8

Please sign in to comment.