Merge pull request #284 from dblodgett-usgs/main
Download from cloud storage.
dblodgett-usgs authored May 25, 2022
2 parents f6b919a + c6950c3 commit 97c852f
Showing 12 changed files with 230 additions and 115 deletions.
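In practice the change is exercised through the package's existing public functions. A minimal sketch of the new cloud-aware download path, using the item ID from the updated roxygen examples below (cloud-hosted files may require an authenticated session; see ?authenticate_sb):

library(sbtools)

# List attached files; with the new fetch_cloud_urls = TRUE default,
# cloud-hosted files come back with tokenized S3 download URLs.
flist <- item_list_files('627f1572d34e3bef0c9a30d8')

# Download all attached files, cloud-hosted or not, to a temporary directory.
paths <- item_file_download('627f1572d34e3bef0c9a30d8', dest_dir = tempdir())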
6 changes: 6 additions & 0 deletions R/AAA.R
@@ -33,6 +33,12 @@ check_session <- function(x) {
stop('session is not authorized. See ?authenticate_sb', call. = FALSE)
}

# TRUE when a non-NULL session exists and still validates.
session_authorized <- function(session){

  return(session_validate(session) && !is.null(session))

}

session_val <- function(x) {
if (!session_validate(x)) {
stop('Session state is invalid, please re-authenticate', call. = FALSE)
70 changes: 70 additions & 0 deletions R/gql_helpers.R
@@ -0,0 +1,70 @@
get_gql_header <- function() {
httr::add_headers(
.headers = c(`content-type` = "application/json",
accept = "application/json",
authorization = paste("Bearer",
get_access_token())))
}

#' @noRd
#' @param q character GraphQL query to embed in the json body
#' @param gql httr handle to pass to POST
#' @param json character json body to pass -- should include the GraphQL query and any
#' additional content. json is optional; it defaults to just the query.
run_gql_query <- function(q, gql, json = jsonlite::toJSON(list(query = q), auto_unbox = TRUE)) {
out <- httr::POST(pkg.env$graphql_url, get_gql_header(),
body = json,
handle = gql)

if(out$status_code == 200) {
jsonlite::fromJSON(rawToChar(out$content))
} else {
stop(paste("Error making multipart session.\n code:", out$status_code,
"\n content:", rawToChar(out$content)))
}

}

#GraphQL Queries for interaction with ScienceBase Manager
create_multipart_upload_session <- function(s3_filepath, content_type, username, gql) {

run_gql_query(sprintf(
'query { createMultipartUploadSession(object: "%s" contentType: "%s" username: "%s") }',
s3_filepath, content_type, username), gql)$data$createMultipartUploadSession

}

get_presigned_url_for_chunk <- function(s3_filepath, upload_id, part_number, gql) {

run_gql_query(sprintf(
'query { getPreSignedUrlForChunk(object: "%s", upload_id: "%s", part_number: "%s") }',
s3_filepath, upload_id, part_number), gql)$data$getPreSignedUrlForChunk

}

complete_multipart_upload <- function(item_str, upload_id, etag_payload, gql) {

eta <- sapply(etag_payload, function(x) {
sprintf('{ETag: "%s", PartNumber: %i}', gsub('"', "", x$ETag), x$PartNumber)
})

eta <- paste0("[", paste(eta, collapse = ","), "]")

run_gql_query(sprintf(
'query { completeMultiPartUpload(object: "%s" upload_id: "%s" parts_eTags: %s) }',
item_str, upload_id, eta), gql)
}

get_cloud_download_url <- function(cr, gql) {

  query <- "query getS3DownloadUrl($input: SaveFileInputs){ getS3DownloadUrl(input: $input){ downloadUri }}"

  variables <- list(input = list(selectedRows = list(cuid = cr$cuid, key = cr$key,
                                                     title = cr$title,
                                                     useForPreview = cr$useForPreview)))

  json <- jsonlite::toJSON(list(query = query, variables = variables), auto_unbox = TRUE)

  run_gql_query(query, gql, json = json)
}
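As get_cloud_download_url above illustrates, run_gql_query accepts a prebuilt json body when a query needs variables. A hedged sketch of that pattern; the handle and the selectedRows values here are illustrative, not part of this commit:

gql <- httr::handle(url = pkg.env$graphql_url)

query <- "query getS3DownloadUrl($input: SaveFileInputs){ getS3DownloadUrl(input: $input){ downloadUri }}"

# Hypothetical cloud-file identifiers, for illustration only.
variables <- list(input = list(selectedRows = list(
  cuid = "example-cuid", key = "item/file.txt", title = "", useForPreview = FALSE)))

json <- jsonlite::toJSON(list(query = query, variables = variables), auto_unbox = TRUE)

resp <- run_gql_query(query, gql, json = json)
resp$data$getS3DownloadUrl$downloadUri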
82 changes: 46 additions & 36 deletions R/item_file_download.R
@@ -10,69 +10,79 @@
#' to download.
#' @param destinations String vector list of destinations for requested files.
#' Must be same length as \code{names}
-#' @param dest_dir A directory path for saving files when \code{names} parameter
-#' is omitted
+#' @param dest_dir A directory path for saving files when the \code{destinations}
+#' parameter is not specified.
#' @param overwrite_file Boolean indicating if file should be overwritten if it
#' already exists locally
#'
#' @return Character vector of full paths to local files
#'
#' @author Luke Winslow
#'
#' @examples \dontrun{
#'
-#' #downloads two files attached to this item
-#' item_file_download('548b2b31e4b03f64633662a4', dest_dir=tempdir())
+#' #downloads all files attached to this item
+#' item_file_download('627f1572d34e3bef0c9a30d8', dest_dir=tempdir())
#'
#' #downloads a specific file attached to this item
-#' item_file_download('548b2b31e4b03f64633662a4', names='gdp.txt',
-#'                    destinations=file.path(tempdir(), 'fname.txt'))
+#' item_file_download('627f1572d34e3bef0c9a30d8', names='example.txt',
+#'                    destinations=file.path(tempdir(), 'out.txt'))
#'
#' }
#' @export
item_file_download = function(sb_id, ..., names, destinations,
dest_dir = getwd(), session=current_session(),
overwrite_file = FALSE){

sb_id = as.sbitem(sb_id)
-  if(!session_validate(session)){
+
+  if(!session_validate(session))
     stop('Session state is invalid, please re-authenticate')
-  }

-  #We have two states, missing names and destinations, which means we need a dest_dir
-  if(missing(names) && missing(destinations)){
-
-    #populate names and destinations from files that are on SB
-    flist = item_list_files(sb_id, ..., session=session)
-    if(nrow(flist) < 1){
-      stop(sb_id$id, ':Item has no attached files')
-    }
-    names = flist$fname
-    destinations = file.path(dest_dir, names)
-
-  #or we have names and destinations
-  }else if(!missing(names) & !missing(destinations)){
-    if(length(names) != length(destinations)){
-      stop('Length of names and destinations must be identical')
-    }
+  flist <- item_list_files(sb_id, ..., session=session)
+
+  if(nrow(flist) < 1)
+    stop(sb_id$id, ':Item has no attached files')
+
+  if(missing(names)) {
+
+    names <- flist$fname
+
-    flist = item_list_files(sb_id, ..., session=session)
+  } else {
+
+    if(!missing(destinations) & length(names) != length(destinations))
+      stop('Length of names and destinations must be identical')
+
-    if(!all(names %in% flist$fname)){
-      stop(sb_id$id, 'Item does not contain all requested files')
-    }
-  #otherwise in some other error condition
-  }else{
-    stop('Must have either names & destinations, or dest_dir for all files')
  }

-  flist = merge(flist, data.frame(fname=names, dest=destinations, stringsAsFactors=FALSE))
+  if(!all(names %in% flist$fname)) stop(sb_id$id, 'Item does not contain all requested files')

+  if(!exists("destinations") | missing(destinations)) {
+    destinations <- file.path(dest_dir, names)
+  }

+  flist <- merge(cbind(flist, do.call(rbind.data.frame, attr(flist, "cloud"))),
+                 data.frame(fname=names, dest=destinations))

  for(i in seq_len(nrow(flist))) {
    tryCatch({

+     if(flist[i, ]$cuid != "") {
+
+       if(!exists("gql")) gql <- httr::handle(url = pkg.env$graphql_url)
+
+       message("retrieving S3 URL")
+
+       flist[i, ]$url <- get_cloud_download_url(flist[i, c("cuid", "key", "title", "useForPreview")],
+                                                gql)[[1]]$getS3DownloadUrl$downloadUri[1]
+
+     }
+
+     message(paste("downloading file", flist[i,]$dest))
+
      GET(url=flist[i,]$url, ...,
          write_disk(flist[i,]$dest, overwrite = overwrite_file),
-         handle=session, timeout = httr::timeout(default_timeout()))
+         handle=session, timeout = httr::timeout(default_timeout()),
+         httr::progress())

}, error = function(e) {
if(file.exists(flist[i,]$dest)) {
warning(paste(basename(flist[i,]$dest), "exists, and overwrite is false. Skipping."))
36 changes: 35 additions & 1 deletion R/item_list_files.R
@@ -2,6 +2,9 @@
#'
#' @template manipulate_item
#' @param recursive (logical) List files recursively. Default: \code{FALSE}
#' @param fetch_cloud_urls (logical) fetch tokenized cloud download URLs? Default: \code{TRUE}.
#' This option takes slightly longer, but the `url` column of the returned data.frame will
#' work for direct file downloads or for use with other applications and libraries.
#'
#' @return
#' A data.frame with columns fname, size, url, and facet.
@@ -47,7 +50,8 @@
#' item_list_files(id = '56562348e4b071e7ea53e09d', recursive = FALSE) # default
#' item_list_files(id = '56562348e4b071e7ea53e09d', recursive = TRUE)
#' }
-item_list_files = function(sb_id, recursive = FALSE, ..., session=current_session()){
+item_list_files = function(sb_id, recursive = FALSE, fetch_cloud_urls = TRUE, ...,
+                           session=current_session()){

session_val(session)

@@ -92,6 +96,10 @@ item_list_files = function(sb_id, recursive = FALSE, ..., session=current_session()){
url = rep("", lf),
facet = rep("", lf))

cloud <- rep(list(list(cuid = "", key = "",
title = "", useForPreview = "")),
lf)

if (length(files) == 0) {
return(out)
}
@@ -102,8 +110,34 @@ item_list_files = function(sb_id, recursive = FALSE, ..., session=current_session()){
out[i,'url'] = files[[i]]$url
if(!is.null(f <- files[[i]]$facet_name))
out[i, "facet"] = f
if(!is.null(files[[i]]$cuid)) {
cloud[[i]]$cuid <- files[[i]]$cuid
cloud[[i]]$key <- files[[i]]$key
cloud[[i]]$title <- ifelse(!is.null(files[[i]]$title),
files[[i]]$title, "")
cloud[[i]]$useForPreview <- ifelse(!is.null(files[[i]]$useForPreview),
files[[i]]$useForPreview, FALSE)

if(fetch_cloud_urls) {

if(!exists("gql")) gql <- httr::handle(url = pkg.env$graphql_url)

message("retrieving S3 URL")

out[i, 'url'] = get_cloud_download_url(
data.frame(cuid = cloud[[i]]$cuid,
key = cloud[[i]]$key,
title = cloud[[i]]$title,
useForPreview = cloud[[i]]$useForPreview),
gql)[[1]]$getS3DownloadUrl$downloadUri[1]

}

}
}

attr(out, "cloud") <- cloud

return(out)
}
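The per-file cloud metadata also travels with the result as a "cloud" attribute, which is what item_file_download consumes above. A short sketch of working with it directly, assuming an item that has cloud-hosted files:

# Skip the per-file GraphQL round trips when only names and sizes are needed.
flist <- item_list_files('627f1572d34e3bef0c9a30d8', fetch_cloud_urls = FALSE)

# One list element per file, each with cuid, key, title, and useForPreview.
cloud <- attr(flist, "cloud")

# Cloud-hosted files carry a non-empty cuid.
is_cloud <- vapply(cloud, function(x) x$cuid != "", logical(1))
flist$fname[is_cloud]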

54 changes: 1 addition & 53 deletions R/item_upload_files.R
@@ -143,7 +143,7 @@ multi_file_body <- function(files){
#' item_append_files(res$id, "foobar.txt")
#' }
#' @export
-item_upload_cloud <- function(sb_id, files, status = TRUE, session=current_session()) {
+item_upload_cloud <- function(sb_id, files, ..., status = TRUE, session=current_session()) {

try(sb_id <- sb_id$id, silent = TRUE)

@@ -212,55 +212,3 @@ cloud_upload <- function(file, mimetype, itemid, chunk_size_bytes = pkg.env$chun
return(invisible(complete_multipart_upload(f_path, session_id, parts_header, gql)))

}

(The 53 deleted lines were the GraphQL helpers -- get_gql_header, run_gql_query, create_multipart_upload_session, get_presigned_url_for_chunk, and complete_multipart_upload -- which this commit moves to R/gql_helpers.R, shown above.)
5 changes: 0 additions & 5 deletions R/session_authorized.R

This file was deleted.

16 changes: 7 additions & 9 deletions man/item_file_download.Rd

Some generated files are not rendered by default.

12 changes: 11 additions & 1 deletion man/item_list_files.Rd

Some generated files are not rendered by default.
