Skip to content

Commit

Permalink
Add new dataset instructions (#147)
Browse files Browse the repository at this point in the history
* Streamline adding a new dataset

 * Add instructions to README for adding a new dataset
 * Add scripts to update the dataset metadata
 * Add update_doc method to only add a single dataset
 * Add HTML documentation generation to update_doc
 * Change update_doc to correctly round trip quotes in the metadata CSV

* Sort datasets CSV

* Allow datasets with a .RData extension as well as .rda

---------

Co-authored-by: Frankie Robertson <[email protected]>
  • Loading branch information
asinghvi17 and frankier authored Apr 2, 2024
1 parent 56d065c commit 4aac673
Show file tree
Hide file tree
Showing 6 changed files with 189 additions and 112 deletions.
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,23 @@ mlmRev|guImmun|Immunization in Guatemala|2159|13
mlmRev|guPrenat|Prenatal care in Guatemala|2449|15
mlmRev|star|Student Teacher Achievement Ratio (STAR) project data|26796|18

# How to add datasets from a new package

**Step 1: add the data from the package**

1. In your clone of this repo `mkdir -p data/$PKG`
2. Go to CRAN
3. Download the *source package*
4. Extract one or more of the datasets from the source package's `data` directory into the new `data/$PKG` directory you created in step 1

**Step 2: add the metadata**

Run the script:

$ scripts/update_doc_one.sh $PKG

Now it's ready for you to submit your pull request.

# Licensing and Intellectual Property

Following Vincent's lead, we have assumed that all of the data sets in this repository can be made available under the GPL-3 license. If you know that one of the datasets released here should not be released publicly or if you know that a data set can only be released under a different license, please contact me so that I can remove the data set from this repository.
60 changes: 30 additions & 30 deletions doc/datasets.csv
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,36 @@
"datasets","volcano","Topographic Information on Auckland's Maunga Whau Volcano",87,61
"datasets","warpbreaks","The Number of Breaks in Yarn during Weaving",54,3
"datasets","women","Average Heights and Weights for American Women",15,2
"gamair","aral","aral",488,4
"gamair","aral.bnd","aral.bnd",107,3
"gamair","bird","bird",25100,7
"gamair","blowfly","blowfly",180,3
"gamair","bone","bone",23,4
"gamair","brain","brain",1567,6
"gamair","cairo","cairo",3780,7
"gamair","chicago","chicago",5114,8
"gamair","chl","chl",13840,7
"gamair","co2s","co2s",507,4
"gamair","coast","coast",2091,3
"gamair","engine","engine",19,3
"gamair","gas","gas",60,804
"gamair","harrier","harrier",37,3
"gamair","hubble","hubble",24,4
"gamair","ipo","ipo",156,7
"gamair","mack","mack",634,17
"gamair","mackp","mackp",1162,9
"gamair","med","med",1476,25
"gamair","meh","meh",1476,24
"gamair","mpg","mpg",205,27
"gamair","prostate","prostate",654,530
"gamair","sitka","sitka",1027,6
"gamair","sole","sole",1575,8
"gamair","sperm.comp1","sperm.comp1",15,5
"gamair","sperm.comp2","sperm.comp2",24,11
"gamair","stomata","stomata",24,4
"gamair","swer","swer",2196,10
"gamair","wesdr","wesdr",669,5
"gamair","wine","wine",47,8
"gap","PD","A study of Parkinson's disease and APOE, LRRK2, SNCA makers",825,22
"gap","aldh2","ALDH2 markers and Alcoholism",263,18
"gap","apoeapoc","APOE/APOC1 markers and Alzheimer's",353,8
Expand Down Expand Up @@ -732,33 +762,3 @@
"vcd","VonBort","Von Bortkiewicz Horse Kicks Data",280,4
"vcd","WeldonDice","Weldon's Dice Data",11,2
"vcd","WomenQueue","Women in Queues",11,2
"gamair","aral.bnd","aral.bnd",107,3
"gamair","aral","aral",488,4
"gamair","bird","bird",25100,7
"gamair","blowfly","blowfly",180,3
"gamair","bone","bone",23,4
"gamair","brain","brain",1567,6
"gamair","cairo","cairo",3780,7
"gamair","chicago","chicago",5114,8
"gamair","chl","chl",13840,7
"gamair","co2s","co2s",507,4
"gamair","coast","coast",2091,3
"gamair","engine","engine",19,3
"gamair","gas","gas",60,804
"gamair","harrier","harrier",37,3
"gamair","hubble","hubble",24,4
"gamair","ipo","ipo",156,7
"gamair","mack","mack",634,17
"gamair","mackp","mackp",1162,9
"gamair","med","med",1476,25
"gamair","meh","meh",1476,24
"gamair","mpg","mpg",205,27
"gamair","prostate","prostate",654,530
"gamair","sitka","sitka",1027,6
"gamair","sole","sole",1575,8
"gamair","sperm.comp1","sperm.comp1",15,5
"gamair","sperm.comp2","sperm.comp2",24,11
"gamair","stomata","stomata",24,4
"gamair","swer","swer",2196,10
"gamair","wesdr","wesdr",669,5
"gamair","wine","wine",47,8
4 changes: 4 additions & 0 deletions scripts/update_doc_all.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
# Regenerate the metadata CSVs (doc/packages.csv, doc/datasets.csv) and the
# per-dataset HTML docs for every package under data/.
# Run from the repository root: scripts/update_doc_all.sh
set -e
R --no-save <<END
source("src/update_doc.r")
update_docs(".")
END
4 changes: 4 additions & 0 deletions scripts/update_doc_one.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/sh
# Regenerate the metadata CSVs and HTML docs for a single package.
# Run from the repository root: scripts/update_doc_one.sh PACKAGE
set -e
# Refuse to run with a missing/empty package name; the unquoted heredoc
# below interpolates "$1" directly into the R call.
if [ -z "$1" ]; then
    echo "usage: $0 PACKAGE" >&2
    exit 1
fi
R --no-save <<END
source("src/update_doc.r")
update_package_doc(".", "$1")
END
5 changes: 5 additions & 0 deletions src/dataset.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,11 @@ const Dataset_typedetect_rows = Dict{Tuple{String, String}, Union{Vector,Dict}}(
function dataset(package_name::AbstractString, dataset_name::AbstractString)
basename = joinpath(@__DIR__, "..", "data", package_name)

rdataname = joinpath(basename, string(dataset_name, ".RData"))
if isfile(rdataname)
return load(rdataname)[dataset_name]
end

rdaname = joinpath(basename, string(dataset_name, ".rda"))
if isfile(rdaname)
return load(rdaname)[dataset_name]
Expand Down
211 changes: 129 additions & 82 deletions src/update_doc.r
Original file line number Diff line number Diff line change
@@ -1,3 +1,129 @@
install_packages <- function(packages) {
  # Install `packages` from CRAN, choosing a default mirror when the user
  # has not configured one ("@CRAN@" would trigger an interactive mirror
  # prompt and hang a scripted run).
  #
  # Args:
  #   packages: character vector of package names to install.
  r <- getOption("repos")
  cran <- r["CRAN"]
  if (is.na(cran) || cran == "@CRAN@") {
    # Assign into the named vector rather than replacing it wholesale, so
    # any other repositories the user configured are preserved. Use HTTPS;
    # plain HTTP redirects (or is refused) on modern CRAN mirrors.
    r["CRAN"] <- "https://cran.rstudio.com/"
  }
  suppressWarnings(install.packages(packages, repos = r))
}

install_packages(c("R2HTML"))
library(R2HTML)

write_doc <- function(package, dataset) {
  # Render the installed help page for `dataset` (from `package`) as HTML
  # at doc/<package>/<dataset>.html. Skips with a warning when no help
  # page can be found, rather than crashing the whole metadata update.
  #
  # NOTE(review): utils:::.getHelpFile is an unexported internal API and
  # may break across R versions; there is no public replacement that
  # returns the parsed Rd object, so we keep it but guard the result.
  help.ref <- try(help(eval(dataset), package = eval(package)), silent = TRUE)
  doc <- try(utils:::.getHelpFile(help.ref), silent = TRUE)
  if (inherits(help.ref, "try-error") || inherits(doc, "try-error")) {
    # Previously a try-error object was passed straight to Rd2HTML, which
    # errors out; warn and continue instead.
    warning("No help page found for ", package, "::", dataset,
            call. = FALSE)
    return(invisible(NULL))
  }
  dir.create(file.path("doc", package), showWarnings = FALSE,
             recursive = TRUE)
  fn_doc <- file.path("doc", package, paste0(dataset, ".html"))
  tools::Rd2HTML(doc, out = fn_doc)
}

do_package_update <- function(data_dir, package_df, old_dataset_df, dataset_df, mismatched_dims_df, package) {
  # Append metadata for every dataset shipped under data_dir/<package> to
  # package_df / dataset_df, writing an HTML help page per dataset.
  #
  # Args:
  #   data_dir:           directory holding one subdirectory per package.
  #   package_df:         accumulator of (Package, Title) rows.
  #   old_dataset_df:     previous datasets.csv contents, used as a
  #                       fallback for row/column counts.
  #   dataset_df:         accumulator of (Package, Dataset, Title, Rows,
  #                       Columns) rows.
  #   mismatched_dims_df: accumulator of dimension-change reports.
  #   package:            name of the (installed) package to process.
  #
  # Returns: list(package_df, dataset_df, mismatched_dims_df), updated.
  suppressWarnings(library(package, character.only = TRUE))

  # Record the package itself: name and title from its DESCRIPTION.
  pdesc <- packageDescription(package)
  new_row <- as.data.frame(pdesc[c("Package", "Title")],
                           stringsAsFactors = FALSE)
  package_df <- rbind(package_df, new_row)

  # Titles of every dataset the installed package declares.
  pdat <- data(package = package)$results

  datasets <- dir(path = file.path(data_dir, package))

  # Trim filenames to dataset names; refuse unrecognized extensions.
  r <- "(.+)\\.(csv\\.gz|rda|RData)$"
  format_recognized <- grepl(r, datasets)
  if (!all(format_recognized)) {
    # The old code called cat() inside stop(), which printed the names
    # and then appended NULL to the error message.
    stop("Unrecognized formats:\n",
         paste(datasets[!format_recognized], collapse = "\n"))
  }
  datanames <- sub(r, "\\1", datasets)

  for (dataname in datanames) {
    # data(list = ...) takes the name as a string directly; no need for
    # the eval(parse(...)) round trip.
    data(list = dataname, package = package)
    ds <- get(dataname)

    write_doc(package, dataname)

    # Get the dataset's one-line description; exactly one is expected.
    title <- unique(pdat[, "Title"][pdat[, "Item"] == dataname])
    if (length(title) != 1) {
      stop(package, "/", dataname, " had ", length(title), " descriptions.")
    }

    # Old dims to fall back on for objects without simple dimensions.
    old_row <- subset(old_dataset_df,
                      Dataset == dataname & Package == package)
    nr <- if (nrow(old_row)) old_row$Rows[[1]] else NA
    nc <- if (nrow(old_row)) old_row$Columns[[1]] else NA

    # Check against new dims when the object is plainly rectangular.
    # `%in%` over the class vector replaces `class(ds) != "list"`, which
    # is a vectorized comparison and errors under && for multi-class
    # objects in R >= 4.3.
    new_nr <- NROW(ds)
    new_nc <- NCOL(ds)
    if (!any(c("table", "ltraj", "list") %in% class(ds)) &&
        is.numeric(new_nr) &&
        is.numeric(new_nc)) {

      expected_cols <- c(nc, nc - 1) # row.names sometimes included
      if (!is.numeric(nr) || !is.numeric(nc)) {
        # No usable old dims: record the freshly computed ones.
        nr <- new_nr
        nc <- new_nc
      } else if (new_nr != nr || !(new_nc %in% expected_cols)) {
        # Dims changed since the last run: report it, keep old values.
        # Collapse the class vector to one string so a multi-class object
        # does not expand into several report rows.
        new_row <- data.frame(Package = package,
                              Dataset = dataname,
                              Class = paste(class(ds), collapse = "/"),
                              OldRows = nr,
                              OldColumns = nc,
                              NewRows = new_nr,
                              NewColumns = new_nc)

        mismatched_dims_df <- rbind(mismatched_dims_df, new_row)
      }
    }

    new_row <- data.frame(Package = package,
                          Dataset = dataname,
                          Title = title,
                          Rows = nr,
                          Columns = nc,
                          stringsAsFactors = FALSE)

    dataset_df <- rbind(dataset_df, new_row)
  }
  list(package_df = package_df,
       dataset_df = dataset_df,
       mismatched_dims_df = mismatched_dims_df)
}

update_package_doc <- function(pkg_dir, package) {
  # Refresh doc/packages.csv and doc/datasets.csv for a single package,
  # installing it first if necessary.
  #
  # Args:
  #   pkg_dir: repository root (contains data/ and doc/).
  #   package: name of the package whose metadata to (re)generate.
  #
  # Returns: data frame of datasets whose dimensions no longer match the
  #   previously recorded ones (empty when everything agrees).
  data_dir <- file.path(pkg_dir, "data")
  doc_dir <- file.path(pkg_dir, "doc")

  package_fn <- file.path(doc_dir, "packages.csv")
  dataset_fn <- file.path(doc_dir, "datasets.csv")

  package_df <- read.csv(package_fn)
  dataset_df <- read.csv(dataset_fn)

  install_packages(c(package))

  # Keep the full previous table for the dimension fallback, but drop the
  # package's existing rows from the accumulators: previously the current
  # dataset_df was passed as both old and new, so re-running on an
  # already-listed package duplicated every one of its rows.
  old_dataset_df <- dataset_df
  package_df <- subset(package_df, Package != package)
  dataset_df <- subset(dataset_df, Package != package)

  mismatched_dims_df <- data.frame()
  dfs <- do_package_update(data_dir, package_df, old_dataset_df, dataset_df,
                           mismatched_dims_df, package)
  package_df <- dfs$package_df
  dataset_df <- dfs$dataset_df
  mismatched_dims_df <- dfs$mismatched_dims_df

  package_df <- sort_upper_first(clean(package_df), c("Package"))
  dataset_df <- sort_upper_first(clean(dataset_df), c("Package", "Dataset"))

  write(package_df, package_fn)
  write(dataset_df, dataset_fn)
  return(mismatched_dims_df)
}

update_docs <- function(pkg_dir) {
data_dir <- file.path(pkg_dir, "data")
doc_dir <- file.path(pkg_dir, "doc")
Expand All @@ -16,90 +142,11 @@ update_docs <- function(pkg_dir) {
# Install any missing packages
new_packages <- packages[!(packages %in% installed.packages()[, "Package"])]
if (length(new_packages)) {
# Pick a decent mirror if none set
r <- getOption("repos")
if (r["CRAN"] == "@CRAN@") {
r <- "http://cran.rstudio.com/"
}
suppressWarnings({install.packages(new_packages, repos = r)})
install_packages(new_packages)
}

for (package in packages) {
suppressWarnings({library(package, character.only = TRUE)})

# Get package description
pdesc <- packageDescription(package)
new_row <- as.data.frame(pdesc[c("Package", "Title")],
stringsAsFactors = FALSE)
package_df <- rbind(package_df, new_row)

pdat <- data(package=package)$results

datasets <- dir(path = file.path(data_dir, package))

# Trim filenames to dataset names
r <- "(.+)\\.(csv\\.gz|rda)$"
format_recognized <- grepl(r, datasets)
if (!(all(format_recognized))) {
stop("Unrecognized formats:\n",
cat(datasets[!format_recognized], sep = "\n"))
}
datanames <- sub(r, "\\1", datasets)

for (dataname in datanames) {
eval(parse(text = paste0("data(", dataname, ", package=package)")))
ds <- get(dataname)

# TODO: Write rst and html doc per dataset

# Get dataset description
title <- unique(pdat[, "Title"][pdat[, "Item"] == dataname])
if (length(title) != 1) {
stop(package, "/", title, " had ", length(title), " descriptions.")
}

# Old dims to fall back on
old_row = subset(old_dataset_df,
Dataset == dataname & Package == package)
nr <- if (nrow(old_row)) old_row$Rows[[1]] else NA
nc <- if (nrow(old_row)) old_row$Columns[[1]] else NA

# Check against new dims when simple
new_nr <- NROW(ds)
new_nc <- NCOL(ds)
if (!(any(c("table", "ltraj") %in% class(ds))) &&
class(ds) != "list" &&
is.numeric(new_nr) &&
is.numeric(new_nc)) {

expected_cols <- c(nc, nc - 1) # row.names sometimes included
if (!is.numeric(nr) || !is.numeric(nc)) {
nr <- new_nr
nc <- new_nc
} else if (new_nr != nr || !(new_nc %in% expected_cols)) {

new_row <- data.frame(Package = package,
Dataset = dataname,
Class = class(ds),
OldRows = nr,
OldColumns = nc,
NewRows = new_nr,
NewColumns = new_nc)

mismatched_dims_df <- rbind(mismatched_dims_df, new_row)

}
}

new_row <- data.frame(Package = package,
Dataset = dataname,
Title = title,
Rows = nr,
Columns = nc,
stringsAsFactors = FALSE)

dataset_df <- rbind(dataset_df, new_row)
}
do_package_update(data_dir, package_df, old_dataset_df, dataset_df, mismatched_dims_df, package)
}

stopifnot(nrow(dataset_df) > 0)
Expand All @@ -114,7 +161,7 @@ update_docs <- function(pkg_dir) {
}

# Serialize a data frame to `fn` as comma-separated values: no row names,
# embedded quotes doubled (qmethod = "double") so the file round-trips
# cleanly through read.csv. Note this intentionally shadows base::write
# within this script.
write <- function(df, fn) {
  write.table(df,
              file = fn,
              sep = ",",
              row.names = FALSE,
              qmethod = "double")
}

clean <- function(df) {
Expand Down

0 comments on commit 4aac673

Please sign in to comment.