From 51388799be238eaa9ac149db98d61e8d722f2d98 Mon Sep 17 00:00:00 2001 From: Toby Dylan Hocking Date: Thu, 4 Jan 2024 22:19:13 -0700 Subject: [PATCH] glob vign --- DESCRIPTION | 2 +- NEWS | 4 + R/capture_first_glob.R | 15 +- README.org | 1 + man/capture_first_glob.Rd | 15 +- vignettes/v0-overview.Rmd | 23 ++- vignettes/v7-capture-glob.Rmd | 286 ++++++++++++++++++++++++++++++++++ 7 files changed, 340 insertions(+), 6 deletions(-) create mode 100644 vignettes/v7-capture-glob.Rmd diff --git a/DESCRIPTION b/DESCRIPTION index e799dbd..dd6425b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: nc Maintainer: Toby Dylan Hocking Author: Toby Dylan Hocking -Version: 2023.8.24 +Version: 2024.1.4 License: GPL-3 Title: Named Capture to Data Tables Description: User-friendly functions for extracting a data diff --git a/NEWS b/NEWS index 41cdb57..7a34b32 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,7 @@ +Changes in version 2024.1.4 + +- new vignette v7-capture-glob. + Changes in version 2023.8.24 - provide argument descriptions for un-exported functions (required to avoid NOTE on CRAN using new R-devel). diff --git a/R/capture_first_glob.R b/R/capture_first_glob.R index a7710f5..a678eac 100644 --- a/R/capture_first_glob.R +++ b/R/capture_first_glob.R @@ -25,10 +25,21 @@ capture_first_glob <- structure(function data.table::setDTthreads(1) - ## Example 1: simple pattern. + ## Example 0: iris data, one file per species. + library(data.table) + dir.create(iris.dir <- tempfile()) + icsv <- function(sp)file.path(iris.dir, paste0(sp, ".csv")) + data.table(iris)[, fwrite(.SD, icsv(Species)), by=Species] + dir(iris.dir) + data.table::fread(file.path(iris.dir,"setosa.csv"), nrows=2) + (iglob <- file.path(iris.dir,"*.csv")) + nc::capture_first_glob(iglob, Species="[^/]+", "[.]csv") + + ## Example 1: four files, two capture groups, custom read function. 
db <- system.file("extdata/chip-seq-chunk-db", package="nc", mustWork=TRUE) suffix <- if(interactive())"gz" else "head" - glob <- paste0(db, "/*/*/counts/*", suffix) + (glob <- paste0(db, "/*/*/counts/*", suffix)) + Sys.glob(glob) read.bedGraph <- function(f)data.table::fread( f, skip=1, col.names = c("chrom","start", "end", "count")) data.chunk.pattern <- list( diff --git a/README.org b/README.org index 0c1809d..3c87dd3 100644 --- a/README.org +++ b/README.org @@ -72,6 +72,7 @@ The main functions provided in nc are: | Data frame chr cols | =capture_first_df= | =tidyr::extract/separate_wider_regex= | =data.table::tstrsplit= | | Data frame col names | =capture_melt_single= | =tidyr::pivot_longer= | =data.table::melt= | | Data frame col names | =capture_melt_multiple= | =tidyr::pivot_longer= | =data.table::melt= | +| File paths | =capture_first_glob= | =arrow::open_dataset= | | - [[https://cloud.r-project.org/web/packages/nc/vignettes/v0-overview.html][Vignette 0]] provides an overview of the various functions. - [[https://cloud.r-project.org/web/packages/nc/vignettes/v1-capture-first.html][Vignette 1]] discusses =capture_first_vec= and =capture_first_df=, which capture the first match in each of diff --git a/man/capture_first_glob.Rd b/man/capture_first_glob.Rd index 5faf363..327994f 100644 --- a/man/capture_first_glob.Rd +++ b/man/capture_first_glob.Rd @@ -25,10 +25,21 @@ contents of all files specified by \code{glob}.} data.table::setDTthreads(1) -## Example 1: simple pattern. +## Example 0: iris data, one file per species. +library(data.table) +dir.create(iris.dir <- tempfile()) +icsv <- function(sp)file.path(iris.dir, paste0(sp, ".csv")) +data.table(iris)[, fwrite(.SD, icsv(Species)), by=Species] +dir(iris.dir) +data.table::fread(file.path(iris.dir,"setosa.csv"), nrows=2) +(iglob <- file.path(iris.dir,"*.csv")) +nc::capture_first_glob(iglob, Species="[^/]+", "[.]csv") + +## Example 1: four files, two capture groups, custom read function. 
db <- system.file("extdata/chip-seq-chunk-db", package="nc", mustWork=TRUE) suffix <- if(interactive())"gz" else "head" -glob <- paste0(db, "/*/*/counts/*", suffix) +(glob <- paste0(db, "/*/*/counts/*", suffix)) +Sys.glob(glob) read.bedGraph <- function(f)data.table::fread( f, skip=1, col.names = c("chrom","start", "end", "count")) data.chunk.pattern <- list( diff --git a/vignettes/v0-overview.Rmd b/vignettes/v0-overview.Rmd index f9b213f..761ccaf 100644 --- a/vignettes/v0-overview.Rmd +++ b/vignettes/v0-overview.Rmd @@ -43,7 +43,8 @@ A variant is doing the same thing, but with input subjects coming from a data table/frame with character columns. ```{r} -subject.dt <- data.table::data.table( +library(data.table) +subject.dt <- data.table( JobID = c("13937810_25", "14022192_1"), Elapsed = c("07:04:42", "07:04:49")) int.pat <- list("[0-9]+", as.integer) @@ -81,6 +82,26 @@ nc::capture_melt_multiple(one.iris, column=".*", "[.]", dim =".*") nc::capture_melt_multiple(one.iris, part =".*", "[.]", column=".*") ``` +## Reading regularly named data files + +[Capture glob](v7-capture-glob.html) is for the situation when you +have several data files on disk, with regular names that you can match +with a glob/regex. 
In the example below we first write one CSV file +for each iris Species, + +```{r} +dir.create(iris.dir <- tempfile()) +icsv <- function(sp)file.path(iris.dir, paste0(sp, ".csv")) +data.table(iris)[, fwrite(.SD, icsv(Species)), by=Species] +dir(iris.dir) +``` + +We then use a glob and a regex to read those files in the code below: + +```{r} +nc::capture_first_glob(file.path(iris.dir,"*.csv"), Species="[^/]+", "[.]csv") +``` + ## Helper functions for defining complex pattterns [Helpers](v5-helpers.html) describes various functions that simplify diff --git a/vignettes/v7-capture-glob.Rmd b/vignettes/v7-capture-glob.Rmd new file mode 100644 index 0000000..954ede4 --- /dev/null +++ b/vignettes/v7-capture-glob.Rmd @@ -0,0 +1,286 @@ +--- +title: "Reading regularly named files" +date: "`r Sys.Date()`" +output: + rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{Reading regularly named files} + %\VignetteEngine{knitr::rmarkdown} + \usepackage[utf8]{inputenc} +--- + +# Reading regularly named files + +```{r setup, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" +) +data.table::setDTthreads(1) +options(width=100) +``` + +This vignette contains a number of examples which explain how to use +`capture_first_glob` to read data from a set of regularly named files. + +## Example 0: iris data, one file per species + +We begin with a simple example: iris data have 150 rows, as shown +below. + +```{r} +library(data.table) +dir.create(iris.dir <- tempfile()) +icsv <- function(sp)file.path(iris.dir, paste0(sp, ".csv")) +(iris.dt <- data.table(iris)) +``` + +In the code below, we save one CSV file for each of the three Species. + +```{r} +iris.dt[, fwrite(.SD, icsv(Species)), by=Species] +dir(iris.dir) +``` + +The output above shows that there are three CSV files, one for each +Species in the iris data. 
Below we read the first two rows of one +file, + +```{r} +data.table::fread(file.path(iris.dir,"setosa.csv"), nrows=2) +``` + +The output above shows that the CSV data file itself does not contain +a Species column (the Species is instead encoded in the file name). +Below we construct a glob, which is a string for matching files, + +```{r} +(iglob <- file.path(iris.dir,"*.csv")) +Sys.glob(iglob) +``` + +The output above indicates that `iglob` matches the three data files. +Below we read those files into R, using the following syntax: + +* The first argument `iglob` is a string/glob which indicates the files to read, +* the other arguments form a regular expression pattern: + * The named argument `Species` matches that part of the file name, + and is captured to the resulting column of the same name, + * the un-named argument `"[.]csv"` indicates that this suffix must be matched (but since the argument is not named, it is not captured, nor saved as a column in the output). + +```{r} +nc::capture_first_glob(iglob, Species="[^/]+", "[.]csv") +``` + +The output above indicates that we have successfully read the iris data back into R, including the `Species` column which was not present in the CSV data files. + +## Example 1: four files, two capture groups, custom read function + +Consider the example below, which is slightly more complex. +The code below defines a glob for matching several data files. + +```{r} +db <- system.file("extdata/chip-seq-chunk-db", package="nc", mustWork=TRUE) +suffix <- if(interactive())"gz" else "head" +(glob <- paste0(db, "/*/*/counts/*", suffix)) +(matched.files <- Sys.glob(glob)) +``` + +The output above indicates there are four data files that are matched by the glob. +Below we read the first one, + +```{r} +readLines(matched.files[1], n=5) +``` + +We can see from the output above that this data file has a header of meta-data (not column names) on the first line, whereas the other lines contain tab-delimited data.
+We can read it with fread, as long as we provide a couple of non-default arguments, as in the code below: + +```{r} +read.bedGraph <- function(f)data.table::fread( + f, skip=1, col.names = c("chrom","start", "end", "count")) +read.bedGraph(matched.files[1]) +``` + +The output above indicates the data has been correctly read into R as a table with four columns. +To do that for each of the files, we pass this function as the custom `READ` argument in the code below, + +```{r} +data.chunk.pattern <- list( + data="H.*?", + "/", + chunk="[0-9]+", as.integer) +(data.chunk.dt <- nc::capture_first_glob(glob, data.chunk.pattern, READ=read.bedGraph)) +``` + +The output above indicates the data files have been read into R as a table, with two additional columns (data and chunk), which correspond to the capture group names used in the regular expression pattern above. + +## Why not base R? + +We can absolutely use base R to read these files, but it takes a bit more code, as shown below. + +```{r} +base.df.list <- list() +for(file.csv in matched.files){ + file.df <- read.table(file.csv, skip=1, col.names=c("chrom","start", "end", "count")) + counts.path <- dirname(file.csv) + chunk.path <- dirname(counts.path) + data.path <- dirname(chunk.path) + base.df.list[[file.csv]] <- data.frame( + data=basename(data.path), + chunk=basename(chunk.path), + file.df) +} +base.df <- do.call(rbind, base.df.list) +rownames(base.df) <- NULL +head(base.df) +str(base.df) +``` + +The output above shows that we have read a data frame into R, +and that it is consistent with the data table returned by `nc::capture_first_glob`, +which should be preferred for simplicity when the files are regularly named. +In contrast, this section shows how arbitrary R code can be used, +so this approach should be preferred when the data in the file path +cannot be captured using regular expressions.
+ +## Example 3: Hive partition file names + +In the code below, we write the same data to a set of CSV files with +different names, + +```{r} +if(requireNamespace("arrow")){ + path <- tempfile() + arrow::write_dataset( + dataset=data.chunk.dt, + path=path, + format="csv", + partitioning=c("data","chunk"), + max_rows_per_file=1000) + hive.glob <- file.path(path, "*", "*", "*.csv") + (hive.files <- Sys.glob(hive.glob)) +} +``` + +In the output above, we can see that there are regularly named files +with three variables encoded in the file path (data, chunk, part). +The code below reads one of the files back into R: + +```{r} +data.table::fread(hive.files[1]) +``` + +The output above indicates that the file only has four columns (and is missing the variables which are encoded in the file path). +In the code below, we read all those files back into R: + +```{r} +if(requireNamespace("arrow")){ + hive.pattern <- list( + nc::field("data","=",".*?"), + "/", + nc::field("chunk","=",".*?", as.integer), + "/", + nc::field("part","-","[0-9]+", as.integer)) + print(hive.dt <- nc::capture_first_glob(hive.glob, hive.pattern)) + hive.dt[, .(rows=.N), keyby=.(data,chunk,part)] +} +``` + +The output above indicates that we have successfully read the data back into R. + +## Example 4: pattern with two more capture groups + +In the code below, we read the same data files, with a more complex +pattern that has two additional capture groups (name and id). + +```{r} +(count.dt <- nc::capture_first_glob( + glob, + data.chunk.pattern, + "/counts/", + name=list("McGill", id="[0-9]+", as.integer), + READ=read.bedGraph)) +count.dt[, .(count=.N), by=.(data, chunk, name, id, chrom)] +``` + +The output above indicates that we have successfully read the data into R, +with two additional columns (name and id). 
+These data can be visualized using the code below, + +```{r} +if(require(ggplot2)){ + ggplot()+ + facet_wrap(~data+chunk+name+chrom, labeller=label_both, scales="free")+ + geom_step(aes( + start/1e3, count), + data=count.dt) +} +``` + +The plot above includes panel/facet titles that come from the variables stored in the file names. + +## Example 5: parsing non-CSV data + +The following example demonstrates how non-CSV data may be parsed, using a custom `READ` function. +Consider the vignette data files, + +```{r} +vignettes <- system.file("extdata/vignettes", package="nc", mustWork=TRUE) +(vglob <- paste0(vignettes, "/*.Rmd")) +(vfiles <- Sys.glob(vglob)) +``` + +The output above includes the glob and the files it matches. +Below we define a function for parsing one of those files, + +```{r} +non.greedy.lines <- list( + list(".*\n"), "*?") +optional.name <- list( + list(" ", chunk_name="[^,}]+"), "?") +chunk.pattern <- list( + before=non.greedy.lines, + "```\\{r", + optional.name, + parameters=".*", + "\\}\n", + code=non.greedy.lines, + "```") +READ.vignette <- function(f)nc::capture_all_str(f, chunk.pattern) +str(READ.vignette(vfiles[1])) +``` + +The output above shows a data table with 7 rows, one for each code chunk defined in the vignette data file. +We read all of the vignette files using the code below. + +```{r} +chunk.dt <- nc::capture_first_glob( + vglob, + "/v", + vignette_number="[0-9]", as.integer, + "-", + vignette_name=".*?", + ".Rmd", + READ=READ.vignette +)[ +, chunk_number := seq_along(chunk_name), by=vignette_number +] +chunk.dt[, .( + vignette_number, vignette_name, chunk_number, chunk_name, + lines=nchar(code))] +``` + +The output above is a data table with one row for each chunk in each data file. +Some columns (`vignette_number` and `vignette_name`) come from the file path, +and others come from the data file contents, including chunk number, name, and code size (the `lines` column is computed by `nchar(code)`, so it counts characters rather than lines).
+The files also contain code which has been parsed and can be extracted via the code below, for example: + +```{r} +cat(chunk.dt$code[2]) +``` + +## Conclusion + +In this vignette we have seen how to read regularly named data files into R, +by providing a glob and a regular expression to `nc::capture_first_glob`.