From ad4c03571d8a34e551cf4a2a31397c7472d9f703 Mon Sep 17 00:00:00 2001 From: cboettig Date: Thu, 30 Jan 2025 22:26:55 +0000 Subject: [PATCH] =?UTF-8?q?Deploying=20to=20gh-pages=20from=20@=20cboettig?= =?UTF-8?q?/duckdbfs@8abfdfd2d6b2d0e65efac69d1748ac1d7f9268bc=20?= =?UTF-8?q?=F0=9F=9A=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pkgdown.yml | 2 +- reference/duckdb_secrets.html | 107 ++++++++++++++++++++++++++++++++++ reference/index.html | 24 ++++++++ reference/load_h3.html | 104 +++++++++++++++++++++++++++++++++ reference/to_h3j.html | 92 +++++++++++++++++++++++++++++ reference/to_json.html | 99 +++++++++++++++++++++++++++++++ search.json | 2 +- sitemap.xml | 4 ++ 8 files changed, 432 insertions(+), 2 deletions(-) create mode 100644 reference/duckdb_secrets.html create mode 100644 reference/load_h3.html create mode 100644 reference/to_h3j.html create mode 100644 reference/to_json.html diff --git a/pkgdown.yml b/pkgdown.yml index 15db786..f2c595d 100644 --- a/pkgdown.yml +++ b/pkgdown.yml @@ -2,7 +2,7 @@ pandoc: 3.1.11 pkgdown: 2.1.1 pkgdown_sha: ~ articles: {} -last_built: 2025-01-30T19:51Z +last_built: 2025-01-30T22:26Z urls: reference: https://cboettig.github.io/duckdbfs/reference article: https://cboettig.github.io/duckdbfs/articles diff --git a/reference/duckdb_secrets.html b/reference/duckdb_secrets.html new file mode 100644 index 0000000..ac27ea1 --- /dev/null +++ b/reference/duckdb_secrets.html @@ -0,0 +1,107 @@ + +duckdb secrets — duckdb_secrets • duckdbfs + Skip to contents + + +
+
+
+ +
+

Configure the duckdb secrets for remote access.

+
+ +
+

Usage

+
duckdb_secrets(
+  key = Sys.getenv("AWS_ACCESS_KEY_ID", ""),
+  secret = Sys.getenv("AWS_SECRET_ACCESS_KEY", ""),
+  endpoint = Sys.getenv("AWS_S3_ENDPOINT", "s3.amazonaws.com"),
+  bucket = NULL,
+  url_style = NULL,
+  type = "S3",
+  conn = cached_connection()
+)
+
+ +
+

Arguments

+ + +
key
+

S3 access key ID. By default, read from the AWS_ACCESS_KEY_ID environment variable.

+ + +
secret
+

S3 secret access key. By default, read from the AWS_SECRET_ACCESS_KEY environment variable.

+ + +
endpoint
+

endpoint address of the S3-compatible service. By default, read from the AWS_S3_ENDPOINT environment variable, falling back to s3.amazonaws.com.

+ + +
bucket
+

restricts the "SCOPE" of this secret to objects in the named bucket. Note that the bucket name is currently insensitive to the endpoint.

+ + +
url_style
+

"path" or "vhost"; the URL style used for S3 requests.

+ + +
type
+

Key type, e.g. "S3". See the duckdb Secrets Manager documentation for details: https://duckdb.org/docs/configuration/secrets_manager.html

+ + +
conn
+

A connection to a database.

+ +
+ +
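Examples

A minimal sketch (all values are placeholders) registering credentials for an S3-compatible endpoint; by default the key and secret are read from the AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables:

+
if (FALSE) { # interactive()
+duckdb_secrets(key = "YOUR_ACCESS_KEY_ID",
+               secret = "YOUR_SECRET_ACCESS_KEY",
+               endpoint = "data.ecoforecast.org")
+}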
+ + +
+ + + +
+ + + + + + + diff --git a/reference/index.html b/reference/index.html index 6e73a05..984d479 100644 --- a/reference/index.html +++ b/reference/index.html @@ -74,6 +74,18 @@

All functionsduckdb_secrets() + + +
duckdb secrets
+
+ + load_h3() + +
+
load the duckdb h3 extension
+
+ load_spatial()
@@ -98,6 +110,18 @@

All functionsto_h3j() + + +
Write H3 hexagon data out as an h3j-compliant JSON file. NOTE: the column containing H3 hashes must be named hexid
+

+ + to_json() + +
+
to_json: write data out as a JSON object
+
+ to_sf()
diff --git a/reference/load_h3.html b/reference/load_h3.html new file mode 100644 index 0000000..61a179b --- /dev/null +++ b/reference/load_h3.html @@ -0,0 +1,104 @@ + +load the duckdb geospatial data plugin — load_h3 • duckdbfs + Skip to contents + + +
+
+
+ +
+

load the duckdb h3 extension

+
+ +
+

Usage

+
load_h3(conn = cached_connection())
+
+ +
+

Arguments

+ + +
conn
+

A database connection object created using the cached_connection function (default: cached_connection()).

+ +
+
+

Value

+

loads the extension and returns status invisibly.

+
+ + +
+

Examples

+
if (FALSE) { # interactive()
+
+library(dplyr)
+load_h3()
+ex <- system.file("extdata/spatial-test.csv", package="duckdbfs")
+
+zoom <- 9L # zoom must be an explicit integer (hence the L suffix)
+query <- ex |>
+  open_dataset(format = "csv") |>
+  mutate(hexid = h3_latlng_to_cell_string(latitude, longitude, zoom))
+
+ # as data.frame
+ collect(query)
+
+ # write to a file
+ path <- tempfile(fileext = ".h3j")
+ query |> to_h3j(path)
+}
+
+
+
+ + +
+ + + +
+ + + + + + + diff --git a/reference/to_h3j.html b/reference/to_h3j.html new file mode 100644 index 0000000..5242938 --- /dev/null +++ b/reference/to_h3j.html @@ -0,0 +1,92 @@ + +Write H3 hexagon data out as an h3j-compliant JSON file NOTE: the column containing H3 hashes must be named hexid — to_h3j • duckdbfs + Skip to contents + + +
+
+
+ +
+

Write H3 hexagon data out as an h3j-compliant JSON file. NOTE: the column containing H3 hashes must be named hexid

+
+ +
+

Usage

+
to_h3j(dataset, path, conn = cached_connection())
+
+ +
+

Arguments

+ + +
dataset
+

a remote tbl object from open_dataset, +or an in-memory data.frame.

+ + +
path
+

a local file path or S3 path with write credentials

+ + +
conn
+

duckdbfs database connection

+ +
+ +
+

Examples

+
if (FALSE) { # interactive()
+# a minimal sketch, mirroring the load_h3() example; note the H3
+# index column must be named hexid before writing:
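+library(dplyr)
+load_h3()
+ex <- system.file("extdata/spatial-test.csv", package="duckdbfs")
+path <- tempfile(fileext = ".h3j")
+ex |>
+  open_dataset(format = "csv") |>
+  mutate(hexid = h3_latlng_to_cell_string(latitude, longitude, 9L)) |>
+  to_h3j(path)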
+}
+
+
+
+ + +
+ + + +
+ + + + + + + diff --git a/reference/to_json.html b/reference/to_json.html new file mode 100644 index 0000000..566b389 --- /dev/null +++ b/reference/to_json.html @@ -0,0 +1,99 @@ + +to_json write data out as a JSON object — to_json • duckdbfs + Skip to contents + + +
+
+
+ +
+

to_json: write data out as a JSON object

+
+ +
+

Usage

+
to_json(
+  dataset,
+  path,
+  conn = cached_connection(),
+  array = TRUE,
+  options = NULL
+)
+
+ +
+

Arguments

+ + +
dataset
+

a remote tbl object from open_dataset, +or an in-memory data.frame.

+ + +
path
+

a local file path or S3 path with write credentials

+ + +
conn
+

duckdbfs database connection

+ + +
array
+

generate a JSON array?

+ + +
options
+

additional options, as a character string; see the duckdb documentation for the COPY command's JSON options.

+ +
+ +
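Examples

A minimal sketch: write an in-memory data.frame out as a JSON array (array = TRUE is the default) to a temporary file; a lazy table from open_dataset works the same way:

+
if (FALSE) { # interactive()
+path <- tempfile(fileext = ".json")
+mtcars |> to_json(path)
+}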
+ + +
+ + + +
+ + + + + + + diff --git a/search.json b/search.json index 0766d57..f427e0e 100644 --- a/search.json +++ b/search.json @@ -1 +1 @@ -[{"path":"https://cboettig.github.io/duckdbfs/LICENSE.html","id":null,"dir":"","previous_headings":"","what":"MIT License","title":"MIT License","text":"Copyright (c) 2023 duckdbfs authors Permission hereby granted, free charge, person obtaining copy software associated documentation files (“Software”), deal Software without restriction, including without limitation rights use, copy, modify, merge, publish, distribute, sublicense, /sell copies Software, permit persons Software furnished , subject following conditions: copyright notice permission notice shall included copies substantial portions Software. SOFTWARE PROVIDED “”, WITHOUT WARRANTY KIND, EXPRESS IMPLIED, INCLUDING LIMITED WARRANTIES MERCHANTABILITY, FITNESS PARTICULAR PURPOSE NONINFRINGEMENT. EVENT SHALL AUTHORS COPYRIGHT HOLDERS LIABLE CLAIM, DAMAGES LIABILITY, WHETHER ACTION CONTRACT, TORT OTHERWISE, ARISING , CONNECTION SOFTWARE USE DEALINGS SOFTWARE.","code":""},{"path":"https://cboettig.github.io/duckdbfs/authors.html","id":null,"dir":"","previous_headings":"","what":"Authors","title":"Authors and Citation","text":"Carl Boettiger. Author, maintainer. Michael D. Sumner. Contributor.","code":""},{"path":"https://cboettig.github.io/duckdbfs/authors.html","id":"citation","dir":"","previous_headings":"","what":"Citation","title":"Authors and Citation","text":"Boettiger C (2025). duckdbfs: High Performance Remote File System, Database 'Geospatial' Access Using 'duckdb'. R package version 0.0.9.1, https://cboettig.github.io/duckdbfs/, https://github.com/cboettig/duckdbfs.","code":"@Manual{, title = {duckdbfs: High Performance Remote File System, Database and 'Geospatial' Access Using 'duckdb'}, author = {Carl Boettiger}, year = {2025}, note = {R package version 0.0.9.1, https://cboettig.github.io/duckdbfs/}, url = {https://github.com/cboettig/duckdbfs}, }"},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"duckdbfs","dir":"","previous_headings":"","what":"High Performance Remote File System, Database and Geospatial Access Using duckdb","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"duckdbfs simple wrapper around duckdb package facilitate working construction single lazy table (SQL connection) set file paths, URLs, S3 URIs.","code":""},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"installation","dir":"","previous_headings":"","what":"Installation","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"can install development version duckdbfs GitHub :","code":"# install.packages(\"devtools\") devtools::install_github(\"cboettig/duckdbfs\")"},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"quickstart","dir":"","previous_headings":"","what":"Quickstart","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"Imagine collection URLs files want combine single tibble R. files parquet csv, files may additional columns present files. combined data may large, potentially bigger available RAM slow download completely, may want subset using methods like dplyr::filter() dplyr::summarise(). can easily access data without downloading passing vector URLs. Note schemas (column names) match, must explicitly request duckdb join two schemas. Leave default, FALSE required achieve much better performance. 
Use filter(), select(), etc dplyr subset process data – method supported dbpylr. use dplyr::collect() trigger evaluation ingest results query R.","code":"library(duckdbfs) library(dplyr) #> #> Attaching package: 'dplyr' #> The following objects are masked from 'package:stats': #> #> filter, lag #> The following objects are masked from 'package:base': #> #> intersect, setdiff, setequal, union base <- paste0(\"https://github.com/duckdb/duckdb/raw/main/\", \"data/parquet-testing/hive-partitioning/union_by_name/\") f1 <- paste0(base, \"x=1/f1.parquet\") f2 <- paste0(base, \"x=1/f2.parquet\") f3 <- paste0(base, \"x=2/f2.parquet\") urls <- c(f1,f2,f3) ds <- open_dataset(urls, unify_schemas = TRUE) ds #> # Source: table [3 x 4] #> # Database: DuckDB v0.10.1 [unknown@Linux 6.6.10-76060610-generic:R 4.3.2/:memory:] #> i j x k #> #> 1 42 84 1 NA #> 2 42 84 1 NA #> 3 NA 128 2 33"},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"s3-based-access","dir":"","previous_headings":"","what":"S3-based access","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"can also access remote data S3 protocol. advantage S3 unlike https, can discover files given folder, don’t list individually. particularly convenient accessing large, partitioned datasets, like GBIF: (nearly 200 GB data split across 2000 parquet files) additional configuration arguments passed helper function duckdb_s3_config() set access credentials configure settings, like alternative endpoints (use S3-compliant systems like minio). course also possible set ahead time calling duckdb_s3_config() directly. Many settings can also passed along compactly using URI query notation found arrow package. instance, can request anonymous access bucket alternative endpoint :","code":"parquet <- \"s3://gbif-open-data-us-east-1/occurrence/2023-06-01/occurrence.parquet\" duckdb_s3_config() gbif <- open_dataset(parquet, anonymous = TRUE, s3_region=\"us-east-1\") efi <- open_dataset(\"s3://anonymous@neon4cast-scores/parquet/aquatics?endpoint_override=data.ecoforecast.org\")"},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"spatial-data","dir":"","previous_headings":"","what":"Spatial data","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"duckdb can also understand wide array spatial data queries spatial vector data, similar operations found popular sf package. See list supported functions details. spatial query operations require geometry column expresses simple feature geometry duckdb’s internal geometry format (nearly exactly WKB).","code":""},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"generating-spatial-data-from-tabular","dir":"","previous_headings":"Spatial data","what":"Generating spatial data from tabular","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"common pattern first generate geometry column raw columns, latitude lognitude columns, using duckdb implementation method familiar postgis, st_point: Note coercing generic tabular CSV spatial data, user responsible specifying coordinate reference system (crs) used columns. instance, case data latitude-longitude, specify corresponding EPSG code. optional (sf allows objects unknown CRS), advisable. Recall used sort external database like duckdb, dplyr functions like dplyr::mutate() transcribed SQL dbplyr, actually ever run R. allows us seamlessly pass along spatial functions like st_point, despite available R function. 
(Also note SQL case-sensitive, function also written ST_Point). Optionally, can additional operations geometry column, computing distances (st_distance shown ), spatial filters, forth. to_sf() coercion parse input SQL query gets passed duckdb, return object collected sf::st_read, returning (-memory) sf object. details including complete list dozens spatial operations currently supported notes performance current limitations, see duckdb spatial docs","code":"spatial_ex <- paste0(\"https://raw.githubusercontent.com/cboettig/duckdbfs/\", \"main/inst/extdata/spatial-test.csv\") |> open_dataset(format = \"csv\") spatial_ex |> mutate(geometry = st_point(longitude, latitude)) |> mutate(dist = st_distance(geometry, st_point(0,0))) |> to_sf(crs = 4326) #> Simple feature collection with 10 features and 4 fields #> Geometry type: POINT #> Dimension: XY #> Bounding box: xmin: 1 ymin: 1 xmax: 10 ymax: 10 #> Geodetic CRS: WGS 84 #> site latitude longitude dist geom #> 1 a 1 1 1.414214 POINT (1 1) #> 2 b 2 2 2.828427 POINT (2 2) #> 3 c 3 3 4.242641 POINT (3 3) #> 4 d 4 4 5.656854 POINT (4 4) #> 5 e 5 5 7.071068 POINT (5 5) #> 6 f 6 6 8.485281 POINT (6 6) #> 7 g 7 7 9.899495 POINT (7 7) #> 8 h 8 8 11.313708 POINT (8 8) #> 9 i 9 9 12.727922 POINT (9 9) #> 10 j 10 10 14.142136 POINT (10 10)"},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"reading-spatial-vector-files","dir":"","previous_headings":"Spatial data","what":"Reading spatial vector files","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"duckdb spatial package can also use GDAL read large spatial vector files. includes support remote files. means can easily subset columns wide array potentially remote file types filter rows columns, perform many spatial operations without ever reading entire objects memory R. Note open_dataset() always returns lazy remote table – yet downloaded data, let alone read R. simply connection allowing us stream data. can examine spatial metadata associated remote dataset using duckdbfs spatial helper function, st_read_meta, small dataset, can bring entire data R (memory) using to_sf(), specifying CRS indicated metadata: However, can also wide range spatial observations without importing data. can particularly helpful working large datasets. example: country polygon contains Melbourne? Note result still lazy read, haven’t downloaded read full spatial data object. , use to_sf() read query results native (-memory) sf object:","code":"url <- \"https://github.com/cboettig/duckdbfs/raw/main/inst/extdata/world.fgb\" countries <- open_dataset(url, format = \"sf\") countries_meta <- st_read_meta(url) countries_meta #> # A tibble: 1 × 7 #> feature_count geom_column_name geom_type name code wkt proj4 #> #> 1 177 geom Multi Polygon EPSG 4326 \"GEOGCS[\\\"WGS … +pro… in_mem <- countries |> to_sf(crs = countries_meta$wkt) library(sf) #> Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.4.0; sf_use_s2() is TRUE melbourne <- st_point(c(144.9633, -37.814)) |> st_as_text() countries |> filter(st_contains(geom, ST_GeomFromText({melbourne}))) #> # Source: SQL [1 x 16] #> # Database: DuckDB v0.10.1 [unknown@Linux 6.6.10-76060610-generic:R 4.3.2/:memory:] #> iso_a3 name sovereignt continent area pop_est pop_est_dens economy #> #> 1 AUS Australia Australia Oceania 7682300 21262641 2.77 2. 
Develo… #> # ℹ 8 more variables: income_grp , gdp_cap_est , life_exp , #> # well_being , footprint , inequality , HPI , geom sf_obj <- countries |> filter(continent == \"Africa\") |> to_sf() plot(sf_obj[\"name\"])"},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"spatial-joins","dir":"","previous_headings":"","what":"Spatial joins","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"One common operation spatial joins, can powerful way subset large data. Lets consider set point geometries representing coordinates major cities around world: Note metadata must read directly source file, embedded duckdb table view. combining data countries data, confirm CRS datasets: instance, can return points (cities) within collection polygons (country boundaries Oceania continent): Possible spatial joins include: Note SQL functions case-sensitive, spatial_join expects lower-case names.","code":"url_cities <- \"https://github.com/cboettig/duckdbfs/raw/spatial-read/inst/extdata/metro.fgb\" cities <- open_dataset(url_cities, format=\"sf\") countries_meta$proj4 #> [1] \"+proj=longlat +datum=WGS84 +no_defs\" st_read_meta(url_cities)$proj4 #> [1] \"+proj=longlat +datum=WGS84 +no_defs\" countries |> dplyr::filter(continent == \"Oceania\") |> spatial_join(cities, by = \"st_intersects\", join=\"inner\") |> select(name_long, sovereignt, pop2020) #> # Source: SQL [6 x 3] #> # Database: DuckDB v0.10.1 [unknown@Linux 6.6.10-76060610-generic:R 4.3.2/:memory:] #> name_long sovereignt pop2020 #> #> 1 Brisbane Australia 2388517 #> 2 Perth Australia 2036118 #> 3 Sydney Australia 4729406 #> 4 Adelaide Australia 1320783 #> 5 Auckland New Zealand 1426070 #> 6 Melbourne Australia 4500501"},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"writing-datasets","dir":"","previous_headings":"","what":"Writing datasets","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"Like arrow::write_dataset(), duckdbfs::write_dataset() can write partitioned parquet files local disks also directly S3 bucket. Partitioned writes take advantage threading. Partition variables can specified explicitly, dplyr grouping variables used default:","code":"mtcars |> group_by(cyl, gear) |> write_dataset(tempfile())"},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"local-files","dir":"","previous_headings":"","what":"Local files","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"course, open_dataset() write_dataset() also used local files. Remember parquet format required, can read csv files (including multiple hive-partitioned csv files).","code":"write.csv(mtcars, \"mtcars.csv\", row.names=FALSE) lazy_cars <- open_dataset(\"mtcars.csv\", format = \"csv\")"},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"mechanism--motivation","dir":"","previous_headings":"","what":"Mechanism / motivation","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"package simply creates duckdb connection, ensures httpfs spatial extensions installed necessary, sets S3 configuration, constructs VIEW using duckdb’s parquet_scan() read_csv_auto() methods associated options. returns dplyr::tbl() resulting view. Though straightforward, process substantially verbose analogous single function call provided arrow::open_dataset() due mostly necessary string manipulation construct VIEW SQL statement. 
’ve used pattern lot, especially arrow option (http data) substantially worse performance (many S3 URIs).","code":""},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"advanced-notes","dir":"","previous_headings":"","what":"Advanced notes","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"similar behavior arrow::open_dataset(), exceptions: time, arrow support access HTTP – remote sources must S3 GC-based object store. local file system S3 paths, duckdb can support “globbing” point path, e.g. open_dataset(data/*/subdir). (Like arrow, duckdbfs::open_dataset assume recursive path discovery directories). Note http(s) URLs always require full vector since ls() method possible. Even URLs vector-based paths, duckdb can automatically populate column names given hive structure hive_style=TRUE (default). Note passing vector paths can significantly faster globbing S3 sources ls() operation relatively expensive many partitions.","code":""},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"performance-notes","dir":"","previous_headings":"","what":"Performance notes","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"settings, duckdbfs::open_dataset can give substantially better performance (orders magnitude) arrow::open_dataset(), settings may comparable even slower. Package versions, system libraries, network architecture, remote storage performance, network traffic, factors can influence performance, making precise benchmark comparisons real-world contexts difficult. slow network connections accessing remote table repeatedly, may improve performance create local copy table rather perform operations network. simplest way setting mode = \"TABLE\" instead “VIEW” open dataset. probably desirable pass duckdb connection backed persistent disk location case instead default cached_connection() unless available RAM limiting. unify_schema computationally expensive. Ensuring files/partitions match schema advance processing different files separately can greatly improve performance.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/as_dataset.html","id":null,"dir":"Reference","previous_headings":"","what":"as_dataset — as_dataset","title":"as_dataset — as_dataset","text":"Push local (-memory) dataset duckdb database table. enables share connection source data. equivalent behavior copy=TRUE many () two-table verbs dplyr.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/as_dataset.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"as_dataset — as_dataset","text":"","code":"as_dataset(df, conn = cached_connection())"},{"path":"https://cboettig.github.io/duckdbfs/reference/as_dataset.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"as_dataset — as_dataset","text":"df local data frame. Otherwise passed back without side effects conn connection database.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/as_dataset.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"as_dataset — as_dataset","text":"remote dplyr::tbl connection table.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/as_view.html","id":null,"dir":"Reference","previous_headings":"","what":"as_view — as_view","title":"as_view — as_view","text":"Create View current query. 
can effective way allow query chain remain lazy","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/as_view.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"as_view — as_view","text":"","code":"as_view(x, tblname = tmp_tbl_name(), conn = cached_connection())"},{"path":"https://cboettig.github.io/duckdbfs/reference/as_view.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"as_view — as_view","text":"x duckdb spatial dataset tblname name table create database. conn connection database.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/as_view.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"as_view — as_view","text":"","code":"if (FALSE) { # interactive() path <- system.file(\"extdata/spatial-test.csv\", package=\"duckdbfs\") df <- open_dataset(path) library(dplyr) df |> filter(latitude > 5) |> as_view() }"},{"path":"https://cboettig.github.io/duckdbfs/reference/cached_connection.html","id":null,"dir":"Reference","previous_headings":"","what":"create a cachable duckdb connection — cached_connection","title":"create a cachable duckdb connection — cached_connection","text":"function primarily intended internal use duckdbfs functions. However, can called directly user whenever desirable direct access connection object.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/cached_connection.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"create a cachable duckdb connection — cached_connection","text":"","code":"cached_connection( dbdir = \":memory:\", read_only = FALSE, bigint = \"numeric\", config = list(temp_directory = tempfile()), autoload_exts = getOption(\"duckdbfs_autoload_extensions\", TRUE) )"},{"path":"https://cboettig.github.io/duckdbfs/reference/cached_connection.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"create a cachable duckdb connection — cached_connection","text":"dbdir Location database files. path existing directory file system. default (\"\"), data kept RAM. read_only Set TRUE read-operation. file-based databases, applied database file opened first time. Subsequent connections (via drv object drv object pointing path) silently ignore flag. bigint 64-bit integers returned. two options: \"numeric\" \"integer64\". \"numeric\" selected, bigint integers treated double/numeric. \"integer64\" selected, bigint integers set bit64 encoding. config Named list DuckDB configuration flags, see https://duckdb.org/docs/configuration/overview#configuration-reference possible options. flags applied database object instantiated. Subsequent connections silently ignore flags. autoload_exts auto-load extensions? TRUE default, can configured options(duckdbfs_autoload_extensions = FALSE)","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/cached_connection.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"create a cachable duckdb connection — cached_connection","text":"duckdb::duckdb() connection object","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/cached_connection.html","id":"details","dir":"Reference","previous_headings":"","what":"Details","title":"create a cachable duckdb connection — cached_connection","text":"first called (user internal function), function creates duckdb connection places connection cache (duckdbfs_conn option). 
subsequent calls, function returns cached connection, rather recreating fresh connection. frees user responsibility managing connection object, functions needing access connection can use create access existing connection. close global environment, function's finalizer gracefully shutdown connection removing cache. default, function creates -memory connection. reading -disk remote files (parquet csv), option can still effectively support operations much-larger--RAM data. However, operations require additional working space, default set temporary storage location configuration well.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/cached_connection.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"create a cachable duckdb connection — cached_connection","text":"","code":"con <- cached_connection() close_connection(con)"},{"path":"https://cboettig.github.io/duckdbfs/reference/close_connection.html","id":null,"dir":"Reference","previous_headings":"","what":"close connection — close_connection","title":"close connection — close_connection","text":"close connection","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/close_connection.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"close connection — close_connection","text":"","code":"close_connection(conn = cached_connection())"},{"path":"https://cboettig.github.io/duckdbfs/reference/close_connection.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"close connection — close_connection","text":"conn duckdb connection (leave blank) Closes invisible cached connection duckdb","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/close_connection.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"close connection — close_connection","text":"returns nothing.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/close_connection.html","id":"details","dir":"Reference","previous_headings":"","what":"Details","title":"close connection — close_connection","text":"Shuts connection gc removes . clear cached reference avoid using stale connection avoids complaint connection garbage collected.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/close_connection.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"close connection — close_connection","text":"","code":"close_connection()"},{"path":"https://cboettig.github.io/duckdbfs/reference/duckdb_s3_config.html","id":null,"dir":"Reference","previous_headings":"","what":"Configure S3 settings for database connection — duckdb_s3_config","title":"Configure S3 settings for database connection — duckdb_s3_config","text":"function used configure S3 settings database connection. 
allows set various S3-related parameters access key, secret access key, endpoint, region, session token, uploader settings, URL compatibility mode, URL style, SSL usage.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/duckdb_s3_config.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Configure S3 settings for database connection — duckdb_s3_config","text":"","code":"duckdb_s3_config( conn = cached_connection(), s3_access_key_id = NULL, s3_secret_access_key = NULL, s3_endpoint = NULL, s3_region = NULL, s3_session_token = NULL, s3_uploader_max_filesize = NULL, s3_uploader_max_parts_per_file = NULL, s3_uploader_thread_limit = NULL, s3_url_compatibility_mode = NULL, s3_url_style = NULL, s3_use_ssl = NULL, anonymous = NULL )"},{"path":"https://cboettig.github.io/duckdbfs/reference/duckdb_s3_config.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"Configure S3 settings for database connection — duckdb_s3_config","text":"conn database connection object created using cache_connection function (default: cache_connection()). s3_access_key_id S3 access key ID (default: NULL). s3_secret_access_key S3 secret access key (default: NULL). s3_endpoint S3 endpoint (default: NULL). s3_region S3 region (default: NULL). s3_session_token S3 session token (default: NULL). s3_uploader_max_filesize maximum filesize S3 uploader (50GB 5TB, default 800GB). s3_uploader_max_parts_per_file maximum number parts per file S3 uploader (1 10000, default 10000). s3_uploader_thread_limit thread limit S3 uploader (default: 50). s3_url_compatibility_mode Disable Globs Query Parameters S3 URLs (default: 0, allows globs/queries). s3_url_style style S3 URLs use. Default \"vhost\" unless s3_endpoint set, makes default \"path\" (.e. MINIO systems). s3_use_ssl Enable disable SSL S3 connections (default: 1 (TRUE)). 
anonymous request anonymous access (sets s3_access_key_id s3_secret_access_key \"\", allowing anonymous access public buckets).","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/duckdb_s3_config.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"Configure S3 settings for database connection — duckdb_s3_config","text":"Returns silently (NULL) successful.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/duckdb_s3_config.html","id":"details","dir":"Reference","previous_headings":"","what":"Details","title":"Configure S3 settings for database connection — duckdb_s3_config","text":"see https://duckdb.org/docs/sql/configuration.html","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/duckdb_s3_config.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"Configure S3 settings for database connection — duckdb_s3_config","text":"","code":"if (FALSE) { # interactive() # Configure S3 settings duckdb_s3_config( s3_access_key_id = \"YOUR_ACCESS_KEY_ID\", s3_secret_access_key = \"YOUR_SECRET_ACCESS_KEY\", s3_endpoint = \"YOUR_S3_ENDPOINT\", s3_region = \"YOUR_S3_REGION\", s3_uploader_max_filesize = \"800GB\", s3_uploader_max_parts_per_file = 100, s3_uploader_thread_limit = 8, s3_url_compatibility_mode = FALSE, s3_url_style = \"vhost\", s3_use_ssl = TRUE, anonymous = TRUE) }"},{"path":"https://cboettig.github.io/duckdbfs/reference/load_spatial.html","id":null,"dir":"Reference","previous_headings":"","what":"load the duckdb geospatial data plugin — load_spatial","title":"load the duckdb geospatial data plugin — load_spatial","text":"load duckdb geospatial data plugin","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/load_spatial.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"load the duckdb geospatial data plugin — load_spatial","text":"","code":"load_spatial( conn = cached_connection(), nightly = getOption(\"duckdbfs_use_nightly\", FALSE), force = FALSE )"},{"path":"https://cboettig.github.io/duckdbfs/reference/load_spatial.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"load the duckdb geospatial data plugin — load_spatial","text":"conn database connection object created using cache_connection function (default: cache_connection()). nightly use nightly version ? default FALSE, configurable duckdbfs_use_nightly option. 
force force re-install?","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/load_spatial.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"load the duckdb geospatial data plugin — load_spatial","text":"loads extension returns status invisibly.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/load_spatial.html","id":"references","dir":"Reference","previous_headings":"","what":"References","title":"load the duckdb geospatial data plugin — load_spatial","text":"https://duckdb.org/docs/extensions/spatial.html","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/open_dataset.html","id":null,"dir":"Reference","previous_headings":"","what":"Open a dataset from a variety of sources — open_dataset","title":"Open a dataset from a variety of sources — open_dataset","text":"function opens dataset variety sources, including Parquet, CSV, etc, using either local file system paths, URLs, S3 bucket URI notation.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/open_dataset.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Open a dataset from a variety of sources — open_dataset","text":"","code":"open_dataset( sources, schema = NULL, hive_style = TRUE, unify_schemas = FALSE, format = c(\"parquet\", \"csv\", \"tsv\", \"sf\"), conn = cached_connection(), tblname = tmp_tbl_name(), mode = \"VIEW\", filename = FALSE, recursive = TRUE, ... )"},{"path":"https://cboettig.github.io/duckdbfs/reference/open_dataset.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"Open a dataset from a variety of sources — open_dataset","text":"sources character vector paths dataset files. schema schema dataset. NULL, schema inferred dataset files. hive_style logical value indicating whether dataset uses Hive-style partitioning. unify_schemas logical value indicating whether unify schemas dataset files (union_by_name). TRUE, execute UNION column name across files (NOTE: can add considerably initial execution time) format format dataset files. One \"parquet\", \"csv\", \"tsv\", \"sf\" (spatial vector files supported sf package / GDAL). argument provided, function try guess type based minimal heuristics. conn connection database. tblname name table create database. mode mode create table . One \"VIEW\" \"TABLE\". Creating VIEW, default, execute quickly create local copy dataset. TABLE create local copy duckdb's native format, downloading full dataset necessary. using TABLE mode large data, please sure use conn connections disk-based storage, e.g. calling cached_connection(), e.g. cached_connection(\"storage_path\"), otherwise full data must fit RAM. Using TABLE assumes familiarity R's DBI-based interface. filename logical value indicating whether include filename table name. recursive assume recursive path? default TRUE. Set FALSE trying open single, un-partitioned file. ... optional additional arguments passed duckdb_s3_config(). Note apply set URI notation thus may used override provide settings supported format.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/open_dataset.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"Open a dataset from a variety of sources — open_dataset","text":"lazy dplyr::tbl object representing opened dataset backed duckdb SQL connection. dplyr (tidyr) verbs can used directly object, can translated SQL commands automatically via dbplyr. 
Generic R commands require using dplyr::collect() table, forces evaluation reading resulting data memory.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/open_dataset.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"Open a dataset from a variety of sources — open_dataset","text":"","code":"if (FALSE) { # interactive() # A remote, hive-partitioned Parquet dataset base <- paste0(\"https://github.com/duckdb/duckdb/raw/main/\", \"data/parquet-testing/hive-partitioning/union_by_name/\") f1 <- paste0(base, \"x=1/f1.parquet\") f2 <- paste0(base, \"x=1/f2.parquet\") f3 <- paste0(base, \"x=2/f2.parquet\") open_dataset(c(f1,f2,f3), unify_schemas = TRUE) # Access an S3 database specifying an independently-hosted (MINIO) endpoint efi <- open_dataset(\"s3://neon4cast-scores/parquet/aquatics\", s3_access_key_id=\"\", s3_endpoint=\"data.ecoforecast.org\") }"},{"path":"https://cboettig.github.io/duckdbfs/reference/spatial_join.html","id":null,"dir":"Reference","previous_headings":"","what":"spatial_join — spatial_join","title":"spatial_join — spatial_join","text":"spatial_join","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/spatial_join.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"spatial_join — spatial_join","text":"","code":"spatial_join( x, y, by = c(\"st_intersects\", \"st_within\", \"st_dwithin\", \"st_touches\", \"st_contains\", \"st_containsproperly\", \"st_covers\", \"st_overlaps\", \"st_crosses\", \"st_equals\", \"st_disjoint\"), args = \"\", join = \"left\", tblname = tmp_tbl_name(), conn = cached_connection() )"},{"path":"https://cboettig.github.io/duckdbfs/reference/spatial_join.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"spatial_join — spatial_join","text":"x duckdb table spatial geometry column called \"geom\" y duckdb table spatial geometry column called \"geom\" spatial join function, see details. args additional arguments join function (e.g. distance st_dwithin) join JOIN type (left, right, inner, full) tblname name temporary view conn duckdb connection (imputed duckdbfs default, must shared across tables)","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/spatial_join.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"spatial_join — spatial_join","text":"(lazy) view resulting table. 
Users can continue operate using dplyr operations call to_st() collect sf object.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/spatial_join.html","id":"details","dir":"Reference","previous_headings":"","what":"Details","title":"spatial_join — spatial_join","text":"Possible spatial joins include: though SQL case sensitive, function expects lower case names \"\" functions.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/spatial_join.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"spatial_join — spatial_join","text":"","code":"if (FALSE) { # interactive() # note we can read in remote data in a variety of vector formats: countries <- paste0(\"/vsicurl/\", \"https://github.com/cboettig/duckdbfs/\", \"raw/spatial-read/inst/extdata/world.gpkg\") |> open_dataset(format = \"sf\") cities <- paste0(\"/vsicurl/https://github.com/cboettig/duckdbfs/raw/\", \"spatial-read/inst/extdata/metro.fgb\") |> open_dataset(format = \"sf\") countries |> dplyr::filter(iso_a3 == \"AUS\") |> spatial_join(cities) }"},{"path":"https://cboettig.github.io/duckdbfs/reference/st_read_meta.html","id":null,"dir":"Reference","previous_headings":"","what":"read spatial metadata — st_read_meta","title":"read spatial metadata — st_read_meta","text":"time, reads subset spatial metadata. similar reported ogrinfo -json","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/st_read_meta.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"read spatial metadata — st_read_meta","text":"","code":"st_read_meta( path, layer = 1L, tblname = tbl_name(path), conn = cached_connection(), ... )"},{"path":"https://cboettig.github.io/duckdbfs/reference/st_read_meta.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"read spatial metadata — st_read_meta","text":"path URL path spatial data file layer layer number read metadata , defaults first layer. tblname metadata stored view name, default based name file. conn connection database. ... optional additional arguments passed duckdb_s3_config(). 
Note apply set URI notation thus may used override provide settings supported format.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/st_read_meta.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"read spatial metadata — st_read_meta","text":"lazy dplyr::tbl object containing core spatial metadata projection information.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/st_read_meta.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"read spatial metadata — st_read_meta","text":"","code":"if (FALSE) { # interactive() st_read_meta(\"https://github.com/duckdb/duckdb_spatial/raw/main/test/data/amsterdam_roads.fgb\") }"},{"path":"https://cboettig.github.io/duckdbfs/reference/to_sf.html","id":null,"dir":"Reference","previous_headings":"","what":"Convert output to sf object — to_sf","title":"Convert output to sf object — to_sf","text":"Convert output sf object","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/to_sf.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Convert output to sf object — to_sf","text":"","code":"to_sf(x, crs = NA, conn = cached_connection())"},{"path":"https://cboettig.github.io/duckdbfs/reference/to_sf.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"Convert output to sf object — to_sf","text":"x remote duckdb tbl (open_dataset) dplyr-pipeline thereof. crs coordinate reference system, format understood sf::st_crs. conn connection object tbl. Takes duckdb table (open_dataset) dataset dplyr pipline returns sf object. Important: table must geometry column, almost always create first. Note: to_sf() triggers collection R. function suitable use end dplyr pipeline subset data. Using function large dataset without filtering first may exceed available memory.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/to_sf.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"Convert output to sf object — to_sf","text":"sf class object (memory).","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/to_sf.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"Convert output to sf object — to_sf","text":"","code":"if (FALSE) { # interactive() library(dplyr) csv_file <- system.file(\"extdata/spatial-test.csv\", package=\"duckdbfs\") # Note that we almost always must first create a `geometry` column, e.g. # from lat/long columns using the `st_point` method. sf <- open_dataset(csv_file, format = \"csv\") |> mutate(geom = ST_Point(longitude, latitude)) |> to_sf() # We can use the full space of spatial operations, including spatial # and normal dplyr filters. 
All operations are translated into a # spatial SQL query by `to_sf`: open_dataset(csv_file, format = \"csv\") |> mutate(geom = ST_Point(longitude, latitude)) |> mutate(dist = ST_Distance(geom, ST_Point(0,0))) |> filter(site %in% c(\"a\", \"b\", \"e\")) |> to_sf() }"},{"path":"https://cboettig.github.io/duckdbfs/reference/write_dataset.html","id":null,"dir":"Reference","previous_headings":"","what":"write_dataset — write_dataset","title":"write_dataset — write_dataset","text":"write_dataset","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/write_dataset.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"write_dataset — write_dataset","text":"","code":"write_dataset( dataset, path, conn = cached_connection(), format = c(\"parquet\", \"csv\"), partitioning = dplyr::group_vars(dataset), overwrite = TRUE, ... )"},{"path":"https://cboettig.github.io/duckdbfs/reference/write_dataset.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"write_dataset — write_dataset","text":"dataset remote tbl object open_dataset, -memory data.frame. path local file path S3 path write credentials conn duckdbfs database connection format export format partitioning names columns use partition variables overwrite allow overwriting existing files? ... additional arguments duckdb_s3_config()","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/write_dataset.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"write_dataset — write_dataset","text":"Returns path, invisibly.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/write_dataset.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"write_dataset — write_dataset","text":"","code":"if (FALSE) { # interactive() write_dataset(mtcars, tempfile()) } if (FALSE) { # interactive() write_dataset(mtcars, tempdir()) }"},{"path":"https://cboettig.github.io/duckdbfs/news/index.html","id":"duckdbfs-009","dir":"Changelog","previous_headings":"","what":"duckdbfs 0.0.9","title":"duckdbfs 0.0.9","text":"CRAN release: 2024-12-16 Restore default non-nightly.","code":""},{"path":"https://cboettig.github.io/duckdbfs/news/index.html","id":"duckdbfs-008","dir":"Changelog","previous_headings":"","what":"duckdbfs 0.0.8","title":"duckdbfs 0.0.8","text":"CRAN release: 2024-12-09 work-around error file built DuckDB version 'v1.1.3', can load extensions built DuckDB version '19864453f7'. using nightly repo extensions default.","code":""},{"path":"https://cboettig.github.io/duckdbfs/news/index.html","id":"duckdbfs-007","dir":"Changelog","previous_headings":"","what":"duckdbfs 0.0.7","title":"duckdbfs 0.0.7","text":"CRAN release: 2024-08-29 default cached_connection() helper configure temporary storage location default. also now supports options supported duckdb::duckdb() connection creation. New as_dataset() utility copies local -memory data.frame connection.","code":""},{"path":"https://cboettig.github.io/duckdbfs/news/index.html","id":"duckdbfs-006","dir":"Changelog","previous_headings":"","what":"duckdbfs 0.0.6","title":"duckdbfs 0.0.6","text":"bugfix: reading local disk recursively longer requires manual **. Also, trying read existing local file won’t try append recursive search even given default recursive=TRUE option. 
bugfix: open_dataset() uses random table name default, avoid naming collisions.","code":""},{"path":"https://cboettig.github.io/duckdbfs/news/index.html","id":"duckdbfs-005","dir":"Changelog","previous_headings":"","what":"duckdbfs 0.0.5","title":"duckdbfs 0.0.5","text":"CRAN release: 2024-08-17 bugfix write_dataset() longer adds ** paths writing partitions. Protect unsupported table names generated file names start digit, fixes #21.","code":""},{"path":"https://cboettig.github.io/duckdbfs/news/index.html","id":"duckdbfs-004","dir":"Changelog","previous_headings":"","what":"duckdbfs 0.0.4","title":"duckdbfs 0.0.4","text":"CRAN release: 2024-02-28 open_dataset() gains ability read spatial vector data formats (objects read sf) using format=\"sf\" default geometry column to_sf() now termed geom, match default used duckdb’s st_read() function. open_dataset() now tries guess data format instead defaulting parquet format explicitly provided. new function, spatial_join(), allows variety spatial joins. new function, st_read_meta(), exposes spatial metadata remote spatial objects. new helper function, as_view(), creates temporary view query.","code":""},{"path":"https://cboettig.github.io/duckdbfs/news/index.html","id":"duckdbfs-003","dir":"Changelog","previous_headings":"","what":"duckdbfs 0.0.3","title":"duckdbfs 0.0.3","text":"CRAN release: 2023-10-19 write_dataset() now understands lazy queries, just lazy tables.","code":""},{"path":"https://cboettig.github.io/duckdbfs/news/index.html","id":"duckdbfs-002","dir":"Changelog","previous_headings":"","what":"duckdbfs 0.0.2","title":"duckdbfs 0.0.2","text":"CRAN release: 2023-09-06 duckdbfs now spatial data query support! Users can leverage spatial data operations like st_distance() st_area() request return values sf objects. Supports network-based access . See README.md Added write_dataset() can write (potentially partitioned) parquet local directories remote (S3) buckets. S3 interface supports arrow-compatible URI notation: Alternate endpoints can now passed like s3://userid:secret_token@bucket-name?endpoint_override=data.ecoforecast.org Users can omit use * (match file) ** (recursive search) just supply path. Recursive search assumed automatically. Note: unlike arrow, still supports use globs (*) elsewhere path, e.g. s3://bucket/*/path duckdb_s3_config gains argument anonymous allowing users ignore existing AWS keys may set environmental variables AWS configuration files. can also passed username position URI notation, e.g. s3://anonymous@bucket_name. open_dataset drops use endpoint argument. Instead, alternative S3 endpoints can set either using URI query notation calling duckdb_s3_config() first. Additionally, arguments duckdb_s3_config(), including s3_endpoint, can now passed open_dataset .... 
Note settings override set URI notation.","code":""},{"path":"https://cboettig.github.io/duckdbfs/news/index.html","id":"duckdbfs-001","dir":"Changelog","previous_headings":"","what":"duckdbfs 0.0.1","title":"duckdbfs 0.0.1","text":"CRAN release: 2023-08-09 Initial release CRAN","code":""}] +[{"path":"https://cboettig.github.io/duckdbfs/LICENSE.html","id":null,"dir":"","previous_headings":"","what":"MIT License","title":"MIT License","text":"Copyright (c) 2023 duckdbfs authors Permission hereby granted, free charge, person obtaining copy software associated documentation files (“Software”), deal Software without restriction, including without limitation rights use, copy, modify, merge, publish, distribute, sublicense, /sell copies Software, permit persons Software furnished , subject following conditions: copyright notice permission notice shall included copies substantial portions Software. SOFTWARE PROVIDED “”, WITHOUT WARRANTY KIND, EXPRESS IMPLIED, INCLUDING LIMITED WARRANTIES MERCHANTABILITY, FITNESS PARTICULAR PURPOSE NONINFRINGEMENT. EVENT SHALL AUTHORS COPYRIGHT HOLDERS LIABLE CLAIM, DAMAGES LIABILITY, WHETHER ACTION CONTRACT, TORT OTHERWISE, ARISING , CONNECTION SOFTWARE USE DEALINGS SOFTWARE.","code":""},{"path":"https://cboettig.github.io/duckdbfs/authors.html","id":null,"dir":"","previous_headings":"","what":"Authors","title":"Authors and Citation","text":"Carl Boettiger. Author, maintainer. Michael D. Sumner. Contributor.","code":""},{"path":"https://cboettig.github.io/duckdbfs/authors.html","id":"citation","dir":"","previous_headings":"","what":"Citation","title":"Authors and Citation","text":"Boettiger C (2025). duckdbfs: High Performance Remote File System, Database 'Geospatial' Access Using 'duckdb'. R package version 0.0.9.1, https://cboettig.github.io/duckdbfs/, https://github.com/cboettig/duckdbfs.","code":"@Manual{, title = {duckdbfs: High Performance Remote File System, Database and 'Geospatial' Access Using 'duckdb'}, author = {Carl Boettiger}, year = {2025}, note = {R package version 0.0.9.1, https://cboettig.github.io/duckdbfs/}, url = {https://github.com/cboettig/duckdbfs}, }"},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"duckdbfs","dir":"","previous_headings":"","what":"High Performance Remote File System, Database and Geospatial Access Using duckdb","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"duckdbfs simple wrapper around duckdb package facilitate working construction single lazy table (SQL connection) set file paths, URLs, S3 URIs.","code":""},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"installation","dir":"","previous_headings":"","what":"Installation","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"can install development version duckdbfs GitHub :","code":"# install.packages(\"devtools\") devtools::install_github(\"cboettig/duckdbfs\")"},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"quickstart","dir":"","previous_headings":"","what":"Quickstart","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"Imagine collection URLs files want combine single tibble R. files parquet csv, files may additional columns present files. combined data may large, potentially bigger available RAM slow download completely, may want subset using methods like dplyr::filter() dplyr::summarise(). can easily access data without downloading passing vector URLs. 
Note schemas (column names) match, must explicitly request duckdb join two schemas. Leave default, FALSE required achieve much better performance. Use filter(), select(), etc dplyr subset process data – method supported dbpylr. use dplyr::collect() trigger evaluation ingest results query R.","code":"library(duckdbfs) library(dplyr) #> #> Attaching package: 'dplyr' #> The following objects are masked from 'package:stats': #> #> filter, lag #> The following objects are masked from 'package:base': #> #> intersect, setdiff, setequal, union base <- paste0(\"https://github.com/duckdb/duckdb/raw/main/\", \"data/parquet-testing/hive-partitioning/union_by_name/\") f1 <- paste0(base, \"x=1/f1.parquet\") f2 <- paste0(base, \"x=1/f2.parquet\") f3 <- paste0(base, \"x=2/f2.parquet\") urls <- c(f1,f2,f3) ds <- open_dataset(urls, unify_schemas = TRUE) ds #> # Source: table [3 x 4] #> # Database: DuckDB v0.10.1 [unknown@Linux 6.6.10-76060610-generic:R 4.3.2/:memory:] #> i j x k #> #> 1 42 84 1 NA #> 2 42 84 1 NA #> 3 NA 128 2 33"},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"s3-based-access","dir":"","previous_headings":"","what":"S3-based access","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"can also access remote data S3 protocol. advantage S3 unlike https, can discover files given folder, don’t list individually. particularly convenient accessing large, partitioned datasets, like GBIF: (nearly 200 GB data split across 2000 parquet files) additional configuration arguments passed helper function duckdb_s3_config() set access credentials configure settings, like alternative endpoints (use S3-compliant systems like minio). course also possible set ahead time calling duckdb_s3_config() directly. Many settings can also passed along compactly using URI query notation found arrow package. instance, can request anonymous access bucket alternative endpoint :","code":"parquet <- \"s3://gbif-open-data-us-east-1/occurrence/2023-06-01/occurrence.parquet\" duckdb_s3_config() gbif <- open_dataset(parquet, anonymous = TRUE, s3_region=\"us-east-1\") efi <- open_dataset(\"s3://anonymous@neon4cast-scores/parquet/aquatics?endpoint_override=data.ecoforecast.org\")"},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"spatial-data","dir":"","previous_headings":"","what":"Spatial data","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"duckdb can also understand wide array spatial data queries spatial vector data, similar operations found popular sf package. See list supported functions details. spatial query operations require geometry column expresses simple feature geometry duckdb’s internal geometry format (nearly exactly WKB).","code":""},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"generating-spatial-data-from-tabular","dir":"","previous_headings":"Spatial data","what":"Generating spatial data from tabular","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"common pattern first generate geometry column raw columns, latitude lognitude columns, using duckdb implementation method familiar postgis, st_point: Note coercing generic tabular CSV spatial data, user responsible specifying coordinate reference system (crs) used columns. instance, case data latitude-longitude, specify corresponding EPSG code. optional (sf allows objects unknown CRS), advisable. 
Recall that when used with this sort of external database like duckdb, dplyr functions like dplyr::mutate() are transcribed into SQL by dbplyr, and are not actually ever run in R. This allows us to seamlessly pass along spatial functions like st_point, despite this not being an available R function. (Also note that SQL is not case-sensitive, so this function can also be written ST_Point). Optionally, we can do additional operations on this geometry column, such as computing distances (st_distance shown here), spatial filters, and so forth. The to_sf() coercion will parse the input into a SQL query that gets passed to duckdb, and the return object will be collected through sf::st_read, returning an (in-memory) sf object. For details, including a complete list of the dozens of spatial operations currently supported and notes on performance and current limitations, see the duckdb spatial docs","code":"spatial_ex <- paste0(\"https://raw.githubusercontent.com/cboettig/duckdbfs/\", \"main/inst/extdata/spatial-test.csv\") |> open_dataset(format = \"csv\") spatial_ex |> mutate(geometry = st_point(longitude, latitude)) |> mutate(dist = st_distance(geometry, st_point(0,0))) |> to_sf(crs = 4326) #> Simple feature collection with 10 features and 4 fields #> Geometry type: POINT #> Dimension: XY #> Bounding box: xmin: 1 ymin: 1 xmax: 10 ymax: 10 #> Geodetic CRS: WGS 84 #> site latitude longitude dist geom #> 1 a 1 1 1.414214 POINT (1 1) #> 2 b 2 2 2.828427 POINT (2 2) #> 3 c 3 3 4.242641 POINT (3 3) #> 4 d 4 4 5.656854 POINT (4 4) #> 5 e 5 5 7.071068 POINT (5 5) #> 6 f 6 6 8.485281 POINT (6 6) #> 7 g 7 7 9.899495 POINT (7 7) #> 8 h 8 8 11.313708 POINT (8 8) #> 9 i 9 9 12.727922 POINT (9 9) #> 10 j 10 10 14.142136 POINT (10 10)"},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"reading-spatial-vector-files","dir":"","previous_headings":"Spatial data","what":"Reading spatial vector files","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"The duckdb spatial package can also use GDAL to read large spatial vector files. This includes support for remote files. This means we can easily subset columns from a wide array of potentially remote file types, filter on rows and columns, and perform many spatial operations without ever reading the entire objects into memory in R. Note that open_dataset() always returns a lazy remote table – we have not yet downloaded the data, let alone read it into R. It is simply a connection allowing us to stream the data. We can examine the spatial metadata associated with this remote dataset using the duckdbfs spatial helper function, st_read_meta, and because this is a small dataset, we can bring the entire data into R (into memory) using to_sf(), specifying the CRS indicated in the metadata: However, we can also do a wide range of spatial operations without importing the data. This can be particularly helpful when working with large datasets. For example: which country polygon contains Melbourne? Note this result is still a lazy read, we haven't downloaded or read in the full spatial data object.
To do so, we use to_sf() to read in the query results as a native (in-memory) sf object:","code":"url <- \"https://github.com/cboettig/duckdbfs/raw/main/inst/extdata/world.fgb\" countries <- open_dataset(url, format = \"sf\") countries_meta <- st_read_meta(url) countries_meta #> # A tibble: 1 × 7 #> feature_count geom_column_name geom_type name code wkt proj4 #> #> 1 177 geom Multi Polygon EPSG 4326 \"GEOGCS[\\\"WGS … +pro… in_mem <- countries |> to_sf(crs = countries_meta$wkt) library(sf) #> Linking to GEOS 3.12.1, GDAL 3.8.4, PROJ 9.4.0; sf_use_s2() is TRUE melbourne <- st_point(c(144.9633, -37.814)) |> st_as_text() countries |> filter(st_contains(geom, ST_GeomFromText({melbourne}))) #> # Source: SQL [1 x 16] #> # Database: DuckDB v0.10.1 [unknown@Linux 6.6.10-76060610-generic:R 4.3.2/:memory:] #> iso_a3 name sovereignt continent area pop_est pop_est_dens economy #> #> 1 AUS Australia Australia Oceania 7682300 21262641 2.77 2. Develo… #> # ℹ 8 more variables: income_grp , gdp_cap_est , life_exp , #> # well_being , footprint , inequality , HPI , geom sf_obj <- countries |> filter(continent == \"Africa\") |> to_sf() plot(sf_obj[\"name\"])"},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"spatial-joins","dir":"","previous_headings":"","what":"Spatial joins","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"One of the most common operations is spatial joins, which can be a powerful way to subset large data. Lets consider a set of point geometries representing the coordinates of major cities around the world: Note that the metadata must be read directly from the source file, it is not embedded in the duckdb table view. Before combining this data with the countries data, we confirm the CRS of both datasets: For instance, we can return all the points (cities) within a collection of polygons (the country boundaries of the Oceania continent): Possible spatial joins include: Note that while SQL functions are not case-sensitive, spatial_join expects lower-case names.","code":"url_cities <- \"https://github.com/cboettig/duckdbfs/raw/spatial-read/inst/extdata/metro.fgb\" cities <- open_dataset(url_cities, format=\"sf\") countries_meta$proj4 #> [1] \"+proj=longlat +datum=WGS84 +no_defs\" st_read_meta(url_cities)$proj4 #> [1] \"+proj=longlat +datum=WGS84 +no_defs\" countries |> dplyr::filter(continent == \"Oceania\") |> spatial_join(cities, by = \"st_intersects\", join=\"inner\") |> select(name_long, sovereignt, pop2020) #> # Source: SQL [6 x 3] #> # Database: DuckDB v0.10.1 [unknown@Linux 6.6.10-76060610-generic:R 4.3.2/:memory:] #> name_long sovereignt pop2020 #> #> 1 Brisbane Australia 2388517 #> 2 Perth Australia 2036118 #> 3 Sydney Australia 4729406 #> 4 Adelaide Australia 1320783 #> 5 Auckland New Zealand 1426070 #> 6 Melbourne Australia 4500501"},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"writing-datasets","dir":"","previous_headings":"","what":"Writing datasets","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"Like arrow::write_dataset(), duckdbfs::write_dataset() can write partitioned parquet files to local disks and also directly to an S3 bucket. Partitioned writes should take advantage of threading. Partition variables can be specified explicitly, or dplyr grouping variables will be used by default:","code":"mtcars |> group_by(cyl, gear) |> write_dataset(tempfile())"},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"local-files","dir":"","previous_headings":"","what":"Local files","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"Of course, open_dataset() and write_dataset() can also be used with local files.
Remember that the parquet format is not required, as we can read csv files too (including multiple hive-partitioned csv files).","code":"write.csv(mtcars, \"mtcars.csv\", row.names=FALSE) lazy_cars <- open_dataset(\"mtcars.csv\", format = \"csv\")"},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"mechanism--motivation","dir":"","previous_headings":"","what":"Mechanism / motivation","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"This package simply creates a duckdb connection, ensures the httpfs and spatial extensions are installed if necessary, sets the S3 configuration, and constructs a VIEW using duckdb's parquet_scan() or read_csv_auto() methods and their associated options. It then returns a dplyr::tbl() for the resulting view. Though straightforward, this process is substantially more verbose than the analogous single function call provided by arrow::open_dataset(), due mostly to the necessary string manipulation to construct the VIEW as a SQL statement. I've used this pattern a lot, especially when arrow is not an option (http data) or has substantially worse performance (many S3 URIs).","code":""},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"advanced-notes","dir":"","previous_headings":"","what":"Advanced notes","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"This has similar behavior to arrow::open_dataset(), with some exceptions: at this time, arrow does not support access over HTTP – remote sources must be in an S3 or GC-based object store. For local file system or S3 paths, duckdb can support “globbing” at any point in the path, e.g. open_dataset(data/*/subdir). (Like arrow, duckdbfs::open_dataset will assume recursive path discovery for directories). Note that http(s) URLs always require the full vector since no ls() method is possible. Even for URLs or other vector-based paths, duckdb can automatically populate column names given a hive structure when hive_style=TRUE (the default). Note that passing a vector of paths can be significantly faster than globbing on S3 sources where the ls() operation is relatively expensive when there are many partitions.","code":""},{"path":"https://cboettig.github.io/duckdbfs/index.html","id":"performance-notes","dir":"","previous_headings":"","what":"Performance notes","title":"High Performance Remote File System, Database and Geospatial Access Using duckdb","text":"In some settings, duckdbfs::open_dataset can give substantially better performance (orders of magnitude) than arrow::open_dataset(), while in other settings it may be comparable or even slower. Package versions, system libraries, network architecture, remote storage performance, network traffic, and other factors can all influence performance, making precise benchmark comparisons in real-world contexts difficult. On slow network connections or when accessing a remote table repeatedly, it may improve performance to create a local copy of the table rather than perform all operations over the network. The simplest way to do this is by setting mode = \"TABLE\" instead of “VIEW” when opening the dataset. It is probably desirable to pass a duckdb connection backed by a persistent disk location in this case instead of the default cached_connection(), unless available RAM is not limiting. unify_schema is computationally expensive. Ensuring all files/partitions match the schema in advance or processing different files separately can greatly improve performance.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/as_dataset.html","id":null,"dir":"Reference","previous_headings":"","what":"as_dataset — as_dataset","title":"as_dataset — as_dataset","text":"Push a local (in-memory) dataset into a duckdb database as a table. This enables it to share the connection with the source data.
This is equivalent to the behavior of copy=TRUE in many (but not all) of the two-table verbs in dplyr.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/as_dataset.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"as_dataset — as_dataset","text":"","code":"as_dataset(df, conn = cached_connection())"},{"path":"https://cboettig.github.io/duckdbfs/reference/as_dataset.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"as_dataset — as_dataset","text":"df a local data frame. Otherwise will be passed back without side effects conn A connection to a database.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/as_dataset.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"as_dataset — as_dataset","text":"a remote dplyr::tbl connection to the table.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/as_view.html","id":null,"dir":"Reference","previous_headings":"","what":"as_view — as_view","title":"as_view — as_view","text":"Create a View of the current query. This can be an effective way to allow a query chain to remain lazy","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/as_view.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"as_view — as_view","text":"","code":"as_view(x, tblname = tmp_tbl_name(), conn = cached_connection())"},{"path":"https://cboettig.github.io/duckdbfs/reference/as_view.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"as_view — as_view","text":"x a duckdb spatial dataset tblname The name of the table to create in the database. conn A connection to a database.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/as_view.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"as_view — as_view","text":"","code":"if (FALSE) { # interactive() path <- system.file(\"extdata/spatial-test.csv\", package=\"duckdbfs\") df <- open_dataset(path) library(dplyr) df |> filter(latitude > 5) |> as_view() }"},{"path":"https://cboettig.github.io/duckdbfs/reference/cached_connection.html","id":null,"dir":"Reference","previous_headings":"","what":"create a cachable duckdb connection — cached_connection","title":"create a cachable duckdb connection — cached_connection","text":"This function is primarily intended for internal use by other duckdbfs functions. However, it can be called directly by the user whenever it is desirable to have direct access to the connection object.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/cached_connection.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"create a cachable duckdb connection — cached_connection","text":"","code":"cached_connection( dbdir = \":memory:\", read_only = FALSE, bigint = \"numeric\", config = list(temp_directory = tempfile()), autoload_exts = getOption(\"duckdbfs_autoload_extensions\", TRUE) )"},{"path":"https://cboettig.github.io/duckdbfs/reference/cached_connection.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"create a cachable duckdb connection — cached_connection","text":"dbdir Location for database files. Should be a path to an existing directory in the file system. With the default (\"\"), all data is kept in RAM. read_only Set to TRUE for read-only operation. For file-based databases, this is only applied when the database file is opened for the first time. Subsequent connections (via the same drv object or a drv object pointing to the same path) will silently ignore this flag. bigint How 64-bit integers should be returned. There are two options: \"numeric\" and \"integer64\". If \"numeric\" is selected, bigint integers will be treated as double/numeric.
\"integer64\" selected, bigint integers set bit64 encoding. config Named list DuckDB configuration flags, see https://duckdb.org/docs/configuration/overview#configuration-reference possible options. flags applied database object instantiated. Subsequent connections silently ignore flags. autoload_exts auto-load extensions? TRUE default, can configured options(duckdbfs_autoload_extensions = FALSE)","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/cached_connection.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"create a cachable duckdb connection — cached_connection","text":"duckdb::duckdb() connection object","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/cached_connection.html","id":"details","dir":"Reference","previous_headings":"","what":"Details","title":"create a cachable duckdb connection — cached_connection","text":"first called (user internal function), function creates duckdb connection places connection cache (duckdbfs_conn option). subsequent calls, function returns cached connection, rather recreating fresh connection. frees user responsibility managing connection object, functions needing access connection can use create access existing connection. close global environment, function's finalizer gracefully shutdown connection removing cache. default, function creates -memory connection. reading -disk remote files (parquet csv), option can still effectively support operations much-larger--RAM data. However, operations require additional working space, default set temporary storage location configuration well.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/cached_connection.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"create a cachable duckdb connection — cached_connection","text":"","code":"con <- cached_connection() close_connection(con)"},{"path":"https://cboettig.github.io/duckdbfs/reference/close_connection.html","id":null,"dir":"Reference","previous_headings":"","what":"close connection — close_connection","title":"close connection — close_connection","text":"close connection","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/close_connection.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"close connection — close_connection","text":"","code":"close_connection(conn = cached_connection())"},{"path":"https://cboettig.github.io/duckdbfs/reference/close_connection.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"close connection — close_connection","text":"conn duckdb connection (leave blank) Closes invisible cached connection duckdb","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/close_connection.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"close connection — close_connection","text":"returns nothing.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/close_connection.html","id":"details","dir":"Reference","previous_headings":"","what":"Details","title":"close connection — close_connection","text":"Shuts connection gc removes . 
Then clears the cached reference to avoid using a stale connection. This avoids the complaint about the connection being garbage collected.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/close_connection.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"close connection — close_connection","text":"","code":"close_connection()"},{"path":"https://cboettig.github.io/duckdbfs/reference/duckdb_s3_config.html","id":null,"dir":"Reference","previous_headings":"","what":"Configure S3 settings for database connection — duckdb_s3_config","title":"Configure S3 settings for database connection — duckdb_s3_config","text":"This function is used to configure S3 settings for a database connection. It allows you to set various S3-related parameters such as the access key, secret access key, endpoint, region, session token, uploader settings, URL compatibility mode, URL style, and SSL usage.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/duckdb_s3_config.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Configure S3 settings for database connection — duckdb_s3_config","text":"","code":"duckdb_s3_config( conn = cached_connection(), s3_access_key_id = NULL, s3_secret_access_key = NULL, s3_endpoint = NULL, s3_region = NULL, s3_session_token = NULL, s3_uploader_max_filesize = NULL, s3_uploader_max_parts_per_file = NULL, s3_uploader_thread_limit = NULL, s3_url_compatibility_mode = NULL, s3_url_style = NULL, s3_use_ssl = NULL, anonymous = NULL )"},{"path":"https://cboettig.github.io/duckdbfs/reference/duckdb_s3_config.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"Configure S3 settings for database connection — duckdb_s3_config","text":"conn A database connection object created using the cache_connection function (default: cache_connection()). s3_access_key_id The S3 access key ID (default: NULL). s3_secret_access_key The S3 secret access key (default: NULL). s3_endpoint The S3 endpoint (default: NULL). s3_region The S3 region (default: NULL). s3_session_token The S3 session token (default: NULL). s3_uploader_max_filesize The maximum filesize for the S3 uploader (between 50GB and 5TB, default 800GB). s3_uploader_max_parts_per_file The maximum number of parts per file for the S3 uploader (between 1 and 10000, default 10000). s3_uploader_thread_limit The thread limit for the S3 uploader (default: 50). s3_url_compatibility_mode Disable Globs and Query Parameters on S3 URLs (default: 0, which allows globs/queries). s3_url_style The style of S3 URLs to use. Default is \"vhost\" unless s3_endpoint is set, which makes the default \"path\" (i.e. for MINIO systems). s3_use_ssl Enable or disable SSL for S3 connections (default: 1 (TRUE)).
anonymous request anonymous access (sets both s3_access_key_id and s3_secret_access_key to \"\", allowing anonymous access to public buckets).","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/duckdb_s3_config.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"Configure S3 settings for database connection — duckdb_s3_config","text":"Returns silently (NULL) if successful.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/duckdb_s3_config.html","id":"details","dir":"Reference","previous_headings":"","what":"Details","title":"Configure S3 settings for database connection — duckdb_s3_config","text":"see https://duckdb.org/docs/sql/configuration.html","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/duckdb_s3_config.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"Configure S3 settings for database connection — duckdb_s3_config","text":"","code":"if (FALSE) { # interactive() # Configure S3 settings duckdb_s3_config( s3_access_key_id = \"YOUR_ACCESS_KEY_ID\", s3_secret_access_key = \"YOUR_SECRET_ACCESS_KEY\", s3_endpoint = \"YOUR_S3_ENDPOINT\", s3_region = \"YOUR_S3_REGION\", s3_uploader_max_filesize = \"800GB\", s3_uploader_max_parts_per_file = 100, s3_uploader_thread_limit = 8, s3_url_compatibility_mode = FALSE, s3_url_style = \"vhost\", s3_use_ssl = TRUE, anonymous = TRUE) }"},{"path":"https://cboettig.github.io/duckdbfs/reference/duckdb_secrets.html","id":null,"dir":"Reference","previous_headings":"","what":"duckdb secrets — duckdb_secrets","title":"duckdb secrets — duckdb_secrets","text":"Configure the duckdb secrets for remote access.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/duckdb_secrets.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"duckdb secrets — duckdb_secrets","text":"","code":"duckdb_secrets( key = Sys.getenv(\"AWS_ACCESS_KEY_ID\", \"\"), secret = Sys.getenv(\"AWS_SECRET_ACCESS_KEY\", \"\"), endpoint = Sys.getenv(\"AWS_S3_ENDPOINT\", \"s3.amazonaws.com\"), bucket = NULL, url_style = NULL, type = \"S3\", conn = cached_connection() )"},{"path":"https://cboettig.github.io/duckdbfs/reference/duckdb_secrets.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"duckdb secrets — duckdb_secrets","text":"key key secret secret endpoint endpoint address bucket restricts the \"SCOPE\" of this key to only objects in this bucket-name. note that the bucket name is currently insensitive to endpoint url_style path or vhost, for S3 type Key type, e.g. S3. See duckdb docs for details.
references https://duckdb.org/docs/configuration/secrets_manager.html conn A connection to a database.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/load_h3.html","id":null,"dir":"Reference","previous_headings":"","what":"load the duckdb geospatial data plugin — load_h3","title":"load the duckdb geospatial data plugin — load_h3","text":"load the duckdb geospatial data plugin","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/load_h3.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"load the duckdb geospatial data plugin — load_h3","text":"","code":"load_h3(conn = cached_connection())"},{"path":"https://cboettig.github.io/duckdbfs/reference/load_h3.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"load the duckdb geospatial data plugin — load_h3","text":"conn A database connection object created using the cache_connection function (default: cache_connection()).","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/load_h3.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"load the duckdb geospatial data plugin — load_h3","text":"loads the extension and returns status invisibly.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/load_h3.html","id":"references","dir":"Reference","previous_headings":"","what":"References","title":"load the duckdb geospatial data plugin — load_h3","text":"https://github.com/isaacbrodsky/h3-duckdb","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/load_h3.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"load the duckdb geospatial data plugin — load_h3","text":"","code":"if (FALSE) { # interactive() library(dplyr) load_h3() ex <- system.file(\"extdata/spatial-test.csv\", package=\"duckdbfs\") zoom <- 9L # Zoom must be explicit integer, L query <- ex |> open_dataset(format = \"csv\") |> mutate(h3id = h3_latlng_to_cell_string(latitude, longitude, zoom)) # as data.frame collect(query) # write to a file path <- tempfile(fileext = \".h3j\") query |> to_h3j(path) }"},{"path":"https://cboettig.github.io/duckdbfs/reference/load_spatial.html","id":null,"dir":"Reference","previous_headings":"","what":"load the duckdb geospatial data plugin — load_spatial","title":"load the duckdb geospatial data plugin — load_spatial","text":"load the duckdb geospatial data plugin","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/load_spatial.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"load the duckdb geospatial data plugin — load_spatial","text":"","code":"load_spatial( conn = cached_connection(), nightly = getOption(\"duckdbfs_use_nightly\", FALSE), force = FALSE )"},{"path":"https://cboettig.github.io/duckdbfs/reference/load_spatial.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"load the duckdb geospatial data plugin — load_spatial","text":"conn A database connection object created using the cache_connection function (default: cache_connection()). nightly should we use the nightly version or not? default FALSE, configurable as the duckdbfs_use_nightly option.
force force a re-install?","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/load_spatial.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"load the duckdb geospatial data plugin — load_spatial","text":"loads the extension and returns status invisibly.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/load_spatial.html","id":"references","dir":"Reference","previous_headings":"","what":"References","title":"load the duckdb geospatial data plugin — load_spatial","text":"https://duckdb.org/docs/extensions/spatial.html","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/open_dataset.html","id":null,"dir":"Reference","previous_headings":"","what":"Open a dataset from a variety of sources — open_dataset","title":"Open a dataset from a variety of sources — open_dataset","text":"This function opens a dataset from a variety of sources, including Parquet, CSV, etc, using either local file system paths, URLs, or S3 bucket URI notation.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/open_dataset.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Open a dataset from a variety of sources — open_dataset","text":"","code":"open_dataset( sources, schema = NULL, hive_style = TRUE, unify_schemas = FALSE, format = c(\"parquet\", \"csv\", \"tsv\", \"sf\"), conn = cached_connection(), tblname = tmp_tbl_name(), mode = \"VIEW\", filename = FALSE, recursive = TRUE, ... )"},{"path":"https://cboettig.github.io/duckdbfs/reference/open_dataset.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"Open a dataset from a variety of sources — open_dataset","text":"sources A character vector of paths to the dataset files. schema The schema for the dataset. If NULL, the schema will be inferred from the dataset files. hive_style A logical value indicating whether the dataset uses Hive-style partitioning. unify_schemas A logical value indicating whether to unify the schemas of the dataset files (union_by_name). If TRUE, will execute a UNION by column name across all files (NOTE: this can add considerably to the initial execution time) format The format of the dataset files. One of \"parquet\", \"csv\", \"tsv\", or \"sf\" (spatial vector files supported by the sf package / GDAL). If no argument is provided, the function will try to guess the type based on minimal heuristics. conn A connection to a database. tblname The name of the table to create in the database. mode The mode to create the table in. One of \"VIEW\" or \"TABLE\". Creating a VIEW, the default, will execute quickly but will not create a local copy of the dataset. TABLE will create a local copy in duckdb's native format, downloading the full dataset if necessary. When using TABLE mode with large data, please be sure to use a conn connection with disk-based storage, e.g. by calling cached_connection(), e.g. cached_connection(\"storage_path\"), otherwise the full data must fit into RAM. Using TABLE assumes familiarity with R's DBI-based interface. filename A logical value indicating whether to include the filename in the table name. recursive should we assume a recursive path? default TRUE. Set to FALSE if trying to open a single, un-partitioned file. ... optional additional arguments passed to duckdb_s3_config(). Note these apply after those set by the URI notation and thus may be used to override or provide settings not supported in that format.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/open_dataset.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"Open a dataset from a variety of sources — open_dataset","text":"A lazy dplyr::tbl object representing the opened dataset backed by a duckdb SQL connection. Most dplyr (and some tidyr) verbs can be used directly on this object, as they can be translated into SQL commands automatically via dbplyr.
Generic R commands require using dplyr::collect() on the table, which forces evaluation and reading the resulting data into memory.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/open_dataset.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"Open a dataset from a variety of sources — open_dataset","text":"","code":"if (FALSE) { # interactive() # A remote, hive-partitioned Parquet dataset base <- paste0(\"https://github.com/duckdb/duckdb/raw/main/\", \"data/parquet-testing/hive-partitioning/union_by_name/\") f1 <- paste0(base, \"x=1/f1.parquet\") f2 <- paste0(base, \"x=1/f2.parquet\") f3 <- paste0(base, \"x=2/f2.parquet\") open_dataset(c(f1,f2,f3), unify_schemas = TRUE) # Access an S3 database specifying an independently-hosted (MINIO) endpoint efi <- open_dataset(\"s3://neon4cast-scores/parquet/aquatics\", s3_access_key_id=\"\", s3_endpoint=\"data.ecoforecast.org\") }"},{"path":"https://cboettig.github.io/duckdbfs/reference/spatial_join.html","id":null,"dir":"Reference","previous_headings":"","what":"spatial_join — spatial_join","title":"spatial_join — spatial_join","text":"spatial_join","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/spatial_join.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"spatial_join — spatial_join","text":"","code":"spatial_join( x, y, by = c(\"st_intersects\", \"st_within\", \"st_dwithin\", \"st_touches\", \"st_contains\", \"st_containsproperly\", \"st_covers\", \"st_overlaps\", \"st_crosses\", \"st_equals\", \"st_disjoint\"), args = \"\", join = \"left\", tblname = tmp_tbl_name(), conn = cached_connection() )"},{"path":"https://cboettig.github.io/duckdbfs/reference/spatial_join.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"spatial_join — spatial_join","text":"x a duckdb table with a spatial geometry column called \"geom\" y a duckdb table with a spatial geometry column called \"geom\" by the spatial join function, see details. args additional arguments to the join function (e.g. distance for st_dwithin) join the JOIN type (left, right, inner, full) tblname the name of the temporary view conn the duckdb connection (imputed by duckdbfs by default, must be shared across both tables)","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/spatial_join.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"spatial_join — spatial_join","text":"a (lazy) view of the resulting table.
Users can continue to operate on it using dplyr operations and call to_sf() to collect the results as an sf object.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/spatial_join.html","id":"details","dir":"Reference","previous_headings":"","what":"Details","title":"spatial_join — spatial_join","text":"Possible spatial joins include: though SQL is not case sensitive, this function expects only lower case names for the \"by\" functions.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/spatial_join.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"spatial_join — spatial_join","text":"","code":"if (FALSE) { # interactive() # note we can read in remote data in a variety of vector formats: countries <- paste0(\"/vsicurl/\", \"https://github.com/cboettig/duckdbfs/\", \"raw/spatial-read/inst/extdata/world.gpkg\") |> open_dataset(format = \"sf\") cities <- paste0(\"/vsicurl/https://github.com/cboettig/duckdbfs/raw/\", \"spatial-read/inst/extdata/metro.fgb\") |> open_dataset(format = \"sf\") countries |> dplyr::filter(iso_a3 == \"AUS\") |> spatial_join(cities) }"},{"path":"https://cboettig.github.io/duckdbfs/reference/st_read_meta.html","id":null,"dir":"Reference","previous_headings":"","what":"read spatial metadata — st_read_meta","title":"read spatial metadata — st_read_meta","text":"At this time, this reads only a subset of the spatial metadata. This is similar to what is reported by ogrinfo -json","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/st_read_meta.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"read spatial metadata — st_read_meta","text":"","code":"st_read_meta( path, layer = 1L, tblname = tbl_name(path), conn = cached_connection(), ... )"},{"path":"https://cboettig.github.io/duckdbfs/reference/st_read_meta.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"read spatial metadata — st_read_meta","text":"path URL or path to the spatial data file layer layer number for which the metadata should be read, defaults to the first layer. tblname the metadata will be stored as a view with this name, by default this is based on the name of the file. conn A connection to a database. ... optional additional arguments passed to duckdb_s3_config().
Note these apply after those set by the URI notation and thus may be used to override or provide settings not supported in that format.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/st_read_meta.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"read spatial metadata — st_read_meta","text":"A lazy dplyr::tbl object containing core spatial metadata such as projection information.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/st_read_meta.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"read spatial metadata — st_read_meta","text":"","code":"if (FALSE) { # interactive() st_read_meta(\"https://github.com/duckdb/duckdb_spatial/raw/main/test/data/amsterdam_roads.fgb\") }"},{"path":"https://cboettig.github.io/duckdbfs/reference/to_h3j.html","id":null,"dir":"Reference","previous_headings":"","what":"Write H3 hexagon data out as an h3j-compliant JSON file NOTE: the column containing H3 hashes must be named hexid — to_h3j","title":"Write H3 hexagon data out as an h3j-compliant JSON file NOTE: the column containing H3 hashes must be named hexid — to_h3j","text":"Write H3 hexagon data out as an h3j-compliant JSON file NOTE: the column containing H3 hashes must be named hexid","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/to_h3j.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Write H3 hexagon data out as an h3j-compliant JSON file NOTE: the column containing H3 hashes must be named hexid — to_h3j","text":"","code":"to_h3j(dataset, path, conn = cached_connection())"},{"path":"https://cboettig.github.io/duckdbfs/reference/to_h3j.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"Write H3 hexagon data out as an h3j-compliant JSON file NOTE: the column containing H3 hashes must be named hexid — to_h3j","text":"dataset a remote tbl object from open_dataset, or an in-memory data.frame. path a local file path or S3 path with write credentials conn duckdbfs database connection","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/to_h3j.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"Write H3 hexagon data out as an h3j-compliant JSON file NOTE: the column containing H3 hashes must be named hexid — to_h3j","text":"","code":"if (FALSE) { # interactive() # example code }"},{"path":"https://cboettig.github.io/duckdbfs/reference/to_json.html","id":null,"dir":"Reference","previous_headings":"","what":"to_json write data out as a JSON object — to_json","title":"to_json write data out as a JSON object — to_json","text":"to_json write data out as a JSON object","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/to_json.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"to_json write data out as a JSON object — to_json","text":"","code":"to_json( dataset, path, conn = cached_connection(), array = TRUE, options = NULL )"},{"path":"https://cboettig.github.io/duckdbfs/reference/to_json.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"to_json write data out as a JSON object — to_json","text":"dataset a remote tbl object from open_dataset, or an in-memory data.frame. path a local file path or S3 path with write credentials conn duckdbfs database connection array should we generate a JSON array?
options additional options as a char string, see","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/to_sf.html","id":null,"dir":"Reference","previous_headings":"","what":"Convert output to sf object — to_sf","title":"Convert output to sf object — to_sf","text":"Convert output to sf object","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/to_sf.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"Convert output to sf object — to_sf","text":"","code":"to_sf(x, crs = NA, conn = cached_connection())"},{"path":"https://cboettig.github.io/duckdbfs/reference/to_sf.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"Convert output to sf object — to_sf","text":"x a remote duckdb tbl (from open_dataset) or a dplyr-pipeline thereof. crs The coordinate reference system, in any format understood by sf::st_crs. conn the connection object from the tbl. Takes a duckdb table (from open_dataset) or a dataset or dplyr pipeline and returns an sf object. Important: the table must have a geometry column, which you will almost always have to create first. Note: to_sf() triggers collection into R. This function is suitable to use at the end of a dplyr pipeline that will subset the data. Using this function on a large dataset without filtering first may exceed available memory.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/to_sf.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"Convert output to sf object — to_sf","text":"an sf class object (in memory).","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/to_sf.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"Convert output to sf object — to_sf","text":"","code":"if (FALSE) { # interactive() library(dplyr) csv_file <- system.file(\"extdata/spatial-test.csv\", package=\"duckdbfs\") # Note that we almost always must first create a `geometry` column, e.g. # from lat/long columns using the `st_point` method. sf <- open_dataset(csv_file, format = \"csv\") |> mutate(geom = ST_Point(longitude, latitude)) |> to_sf() # We can use the full space of spatial operations, including spatial # and normal dplyr filters. All operations are translated into a # spatial SQL query by `to_sf`: open_dataset(csv_file, format = \"csv\") |> mutate(geom = ST_Point(longitude, latitude)) |> mutate(dist = ST_Distance(geom, ST_Point(0,0))) |> filter(site %in% c(\"a\", \"b\", \"e\")) |> to_sf() }"},{"path":"https://cboettig.github.io/duckdbfs/reference/write_dataset.html","id":null,"dir":"Reference","previous_headings":"","what":"write_dataset — write_dataset","title":"write_dataset — write_dataset","text":"write_dataset","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/write_dataset.html","id":"ref-usage","dir":"Reference","previous_headings":"","what":"Usage","title":"write_dataset — write_dataset","text":"","code":"write_dataset( dataset, path, conn = cached_connection(), format = c(\"parquet\", \"csv\"), partitioning = dplyr::group_vars(dataset), overwrite = TRUE, ... )"},{"path":"https://cboettig.github.io/duckdbfs/reference/write_dataset.html","id":"arguments","dir":"Reference","previous_headings":"","what":"Arguments","title":"write_dataset — write_dataset","text":"dataset a remote tbl object from open_dataset, or an in-memory data.frame. path a local file path or S3 path with write credentials conn duckdbfs database connection format export format partitioning names of columns to use as partition variables overwrite allow overwriting of existing files? ...
additional arguments to duckdb_s3_config()","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/write_dataset.html","id":"value","dir":"Reference","previous_headings":"","what":"Value","title":"write_dataset — write_dataset","text":"Returns the path, invisibly.","code":""},{"path":"https://cboettig.github.io/duckdbfs/reference/write_dataset.html","id":"ref-examples","dir":"Reference","previous_headings":"","what":"Examples","title":"write_dataset — write_dataset","text":"","code":"if (FALSE) { # interactive() write_dataset(mtcars, tempfile()) } if (FALSE) { # interactive() write_dataset(mtcars, tempdir()) }"},{"path":"https://cboettig.github.io/duckdbfs/news/index.html","id":"duckdbfs-009","dir":"Changelog","previous_headings":"","what":"duckdbfs 0.0.9","title":"duckdbfs 0.0.9","text":"CRAN release: 2024-12-16 Restore the default to non-nightly.","code":""},{"path":"https://cboettig.github.io/duckdbfs/news/index.html","id":"duckdbfs-008","dir":"Changelog","previous_headings":"","what":"duckdbfs 0.0.8","title":"duckdbfs 0.0.8","text":"CRAN release: 2024-12-09 work-around for the error that a file was built for DuckDB version 'v1.1.3', but we can only load extensions built for DuckDB version '19864453f7', by using the nightly repo for extensions by default.","code":""},{"path":"https://cboettig.github.io/duckdbfs/news/index.html","id":"duckdbfs-007","dir":"Changelog","previous_headings":"","what":"duckdbfs 0.0.7","title":"duckdbfs 0.0.7","text":"CRAN release: 2024-08-29 The default cached_connection() helper will configure a temporary storage location by default. It also now supports all options supported by duckdb::duckdb() for connection creation. New as_dataset() utility copies a local in-memory data.frame into the connection.","code":""},{"path":"https://cboettig.github.io/duckdbfs/news/index.html","id":"duckdbfs-006","dir":"Changelog","previous_headings":"","what":"duckdbfs 0.0.6","title":"duckdbfs 0.0.6","text":"bugfix: reading from local disk recursively no longer requires a manual **. Also, trying to read an existing local file won't try to append a recursive search even when given the default recursive=TRUE option. bugfix: open_dataset() uses a random table name by default, to avoid naming collisions.","code":""},{"path":"https://cboettig.github.io/duckdbfs/news/index.html","id":"duckdbfs-005","dir":"Changelog","previous_headings":"","what":"duckdbfs 0.0.5","title":"duckdbfs 0.0.5","text":"CRAN release: 2024-08-17 bugfix write_dataset() no longer adds ** to paths when writing partitions. Protect against unsupported table names generated from file names that start with a digit, fixes #21.","code":""},{"path":"https://cboettig.github.io/duckdbfs/news/index.html","id":"duckdbfs-004","dir":"Changelog","previous_headings":"","what":"duckdbfs 0.0.4","title":"duckdbfs 0.0.4","text":"CRAN release: 2024-02-28 open_dataset() gains the ability to read spatial vector data formats (objects read by sf) using format=\"sf\" The default geometry column in to_sf() is now termed geom, to match the default used in duckdb's st_read() function. open_dataset() now tries to guess the data format instead of defaulting to parquet when the format is not explicitly provided. A new function, spatial_join(), allows a variety of spatial joins. A new function, st_read_meta(), exposes the spatial metadata of remote spatial objects.
A new helper function, as_view(), creates a temporary view of a query.","code":""},{"path":"https://cboettig.github.io/duckdbfs/news/index.html","id":"duckdbfs-003","dir":"Changelog","previous_headings":"","what":"duckdbfs 0.0.3","title":"duckdbfs 0.0.3","text":"CRAN release: 2023-10-19 write_dataset() now understands lazy queries, not just lazy tables.","code":""},{"path":"https://cboettig.github.io/duckdbfs/news/index.html","id":"duckdbfs-002","dir":"Changelog","previous_headings":"","what":"duckdbfs 0.0.2","title":"duckdbfs 0.0.2","text":"CRAN release: 2023-09-06 duckdbfs now has spatial data query support! Users can leverage spatial data operations like st_distance() and st_area() and request the return values as sf objects. Supports network-based access too. See README.md Added write_dataset() which can write (potentially partitioned) parquet to local directories or remote (S3) buckets. The S3 interface supports arrow-compatible URI notation: Alternate endpoints can now be passed like s3://userid:secret_token@bucket-name?endpoint_override=data.ecoforecast.org Users can omit the use of * (match any file) or ** (recursive search) and just supply a path. Recursive search is then assumed automatically. Note: unlike arrow, this still supports the use of globs (*) elsewhere in the path, e.g. s3://bucket/*/path duckdb_s3_config gains the argument anonymous allowing users to ignore any existing AWS keys that may be set in environmental variables or AWS configuration files. This can also be passed as the username position in URI notation, e.g. s3://anonymous@bucket_name. open_dataset drops the use of the endpoint argument. Instead, alternative S3 endpoints can be set either by using the URI query notation or by calling duckdb_s3_config() first. Additionally, any arguments to duckdb_s3_config(), including s3_endpoint, can now be passed to open_dataset through .... Note these settings will override those set in the URI notation.","code":""},{"path":"https://cboettig.github.io/duckdbfs/news/index.html","id":"duckdbfs-001","dir":"Changelog","previous_headings":"","what":"duckdbfs 0.0.1","title":"duckdbfs 0.0.1","text":"CRAN release: 2023-08-09 Initial release to CRAN","code":""}]
diff --git a/sitemap.xml b/sitemap.xml
index 02f5d19..cd41e8d 100644
--- a/sitemap.xml
+++ b/sitemap.xml
@@ -10,11 +10,15 @@
 https://cboettig.github.io/duckdbfs/reference/cached_connection.html
 https://cboettig.github.io/duckdbfs/reference/close_connection.html
 https://cboettig.github.io/duckdbfs/reference/duckdb_s3_config.html
+https://cboettig.github.io/duckdbfs/reference/duckdb_secrets.html
 https://cboettig.github.io/duckdbfs/reference/index.html
+https://cboettig.github.io/duckdbfs/reference/load_h3.html
 https://cboettig.github.io/duckdbfs/reference/load_spatial.html
 https://cboettig.github.io/duckdbfs/reference/open_dataset.html
 https://cboettig.github.io/duckdbfs/reference/spatial_join.html
 https://cboettig.github.io/duckdbfs/reference/st_read_meta.html
+https://cboettig.github.io/duckdbfs/reference/to_h3j.html
+https://cboettig.github.io/duckdbfs/reference/to_json.html
 https://cboettig.github.io/duckdbfs/reference/to_sf.html
 https://cboettig.github.io/duckdbfs/reference/write_dataset.html