From ff409b998500959978078ad9b1e4e34495756abe Mon Sep 17 00:00:00 2001 From: hong-revo Date: Thu, 3 Aug 2017 19:18:21 +1000 Subject: [PATCH] documentation --- DESCRIPTION | 1 + NAMESPACE | 5 ++- R/as_xdf.R | 55 ++++++++++++++++++++----- R/copy_to_hdfs.R | 32 +++++++++++++++ R/hdfs_utils.R | 90 +++++++++++++++++++++++++++++++++++++---- R/is_xdf.R | 37 +++++++++++++++++ R/tbl_df_methods.R | 38 ++++++++++++------ R/workdir_utils.R | 18 ++++----- R/xdf_utils.R | 82 +++++++++++++++++++------------------- man/as.data.frame.Rd | 10 +---- man/as_xdf.Rd | 78 ++++++++++++++++++++++++++++++++++++ man/compute.Rd | 34 ++++++++++++++++ man/copy_to.Rd | 43 ++++++++++++++++++++ man/hdfs.Rd | 95 ++++++++++++++++++++++++++++++++++++++++++++ man/local_exec.Rd | 25 ++++++++++++ man/workdir.Rd | 6 +-- man/xdf_utils.Rd | 41 +++++++++++++++++++ 17 files changed, 593 insertions(+), 97 deletions(-) create mode 100644 R/is_xdf.R create mode 100644 man/as_xdf.Rd create mode 100644 man/compute.Rd create mode 100644 man/copy_to.Rd create mode 100644 man/hdfs.Rd create mode 100644 man/local_exec.Rd create mode 100644 man/xdf_utils.Rd diff --git a/DESCRIPTION b/DESCRIPTION index cb38696..059e80e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -39,6 +39,7 @@ Collate: '00rxArgs.R' 'group_by_xdf.R' 'hdfs_utils.R' 'imports.R' + 'is_xdf.R' 'join_utils.R' 'joins_unsupported.R' 'joins_xdf.R' diff --git a/NAMESPACE b/NAMESPACE index cf31110..3d71952 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,8 +11,8 @@ S3method(as_xdf,RxFileData) S3method(as_xdf,RxXdfData) S3method(as_xdf,default) S3method(cbind,RxXdfData) -S3method(collect,RxFileData) -S3method(compute,RxFileData) +S3method(collect,RxXdfData) +S3method(compute,RxXdfData) S3method(copy_to,RxHdfsFileSystem) S3method(distinct,RxFileData) S3method(distinct,grouped_tbl_xdf) @@ -70,6 +70,7 @@ export(as_standard_xdf) export(as_xdf) export(cbind.RxXdfData) export(clean_dplyrxdf_dir) +export(copy_to_hdfs) export(copy_xdf) export(delete_xdf) export(doXdf) diff --git a/R/as_xdf.R b/R/as_xdf.R index 6571346..8e1820e 100644 --- a/R/as_xdf.R +++ b/R/as_xdf.R @@ -1,10 +1,40 @@ +#' Detect and coerce to Xdf data source objects +#' +#' Functions to detect and coerce to Xdf data source objects. +#' +#' @param .data An R object that can be coerced to an Xdf data source. This includes another existing Xdf data source; see details below. +#' @param file The path/filename for the Xdf data file. +#' @param composite Whether to create a composite Xdf. +#' @param overwrite Whether to overwrite any existing file. +#' @param ... Other arguments to pass to \code{\link{rxDataStep}}. +#' +#' @details +#' The \code{as_xdf} function takes the object given by \code{.data} and imports its data into an Xdf file, returning a data source pointing to that file. The file can be either a standard or a \emph{composite} Xdf, as given by the \code{composite} argument. A composite Xdf is actually a directory containing data and metadata files; it can be manipulated by the RevoScaleR functions as if it were a single dataset. +#' +#' The \code{as_standard_xdf} and \code{as_composite_xdf} functions are shorthand for \code{as_xdf(*, composite=FALSE)} and \code{as_xdf(*, composite=TRUE)} respectively; they always create either a standard or composite Xdf. You can use this to switch an existing Xdf data source from one type of Xdf to the other. Note that Xdf files in HDFS must always be composite. 
+#' +#' Passing a \code{tbl_xdf} object to an \code{as} function will strip off the tbl information, returning a raw Xdf data source. This can be useful for resetting the beginning of a pipeline. +#' +#' The \code{file} argument gives the name of the Xdf file to create. If not specified, this is taken from the input data source where possible (for Xdf and file data sources, including text). Otherwise, a random name is generated. If no directory is specified, the file is created in the current working directory (if in the native filesystem) or in the user directory (in HDFS). +#' +#' You can use the \code{as} functions with any RevoScaleR data source, or otherwise with any R object that can be turned into a data frame. The resulting Xdf file will be created in the same filesystem as the input data source. If the input does not have a filesystem, for example if it is an in-database table or a data frame, the file is created in the native filesystem. +#' +#' @return +#' For the \code{as} functions, an Xdf data source object pointing to the created file. For the \code{is} functions, a TRUE/FALSE value. +#' +#' @seealso +#' \code{\link{as}}, \code{\link{is}}, \code{\link{inherits}}, +#' \code{\link{rxDataStep}}, \code{\link{rxImport}} +#' +#' @rdname as_xdf #' @export -as_composite_xdf <- function(.data, ...) +as_composite_xdf <- function(...) { as_xdf(.data, ..., composite=TRUE) } +#' @rdname as_xdf #' @export as_standard_xdf <- function(.data, ...) { @@ -12,6 +42,7 @@ as_standard_xdf <- function(.data, ...) } +#' @rdname as_xdf #' @export as_xdf <- function(.data, ...) { @@ -19,6 +50,7 @@ as_xdf <- function(.data, ...) } +#' @rdname as_xdf #' @export as_xdf.RxXdfData <- function(.data, file=NULL, composite=NULL, overwrite=TRUE, ...) { @@ -44,12 +76,13 @@ as_xdf.RxXdfData <- function(.data, file=NULL, composite=NULL, overwrite=TRUE, . return(copy_xdf(.data, file)) out <- modifyXdf(.data, file=file, createCompositeSet=composite) - rxDataStep(.data, out, rowsPerRead=.dxOptions$rowsPerRead, overwrite=TRUE, ...) + rxDataStep(.data, out, rowsPerRead=.dxOptions$rowsPerRead, overwrite=overwrite, ...) } +#' @rdname as_xdf #' @export -as_xdf.RxFileData <- function(.data, file=NULL, composite=in_hdfs(.data), ...) +as_xdf.RxFileData <- function(.data, file=NULL, composite=in_hdfs(.data), overwrite=TRUE, ...) { if(in_hdfs(.data) && !composite) stop("only composite Xdf files supported in HDFS") @@ -59,14 +92,15 @@ as_xdf.RxFileData <- function(.data, file=NULL, composite=in_hdfs(.data), ...) file <- validateXdfFile(file, composite) out <- RxXdfData(file=file, fileSystem=rxGetFileSystem(.data), createCompositeSet=composite) - rxDataStep(.data, out, rowsPerRead=.dxOptions$rowsPerRead, overwrite=TRUE, ...) + rxDataStep(.data, out, rowsPerRead=.dxOptions$rowsPerRead, overwrite=overwrite, ...) } +#' @rdname as_xdf #' @export -as_xdf.RxDataSource <- function(.data, file=NULL, composite=NULL, ...) +as_xdf.RxDataSource <- function(.data, file=NULL, composite=NULL, overwrite=TRUE, ...) { - hdfsDetected <- !is.na(isRemoteHdfsClient(FALSE)) || in_hdfs() + hdfsDetected <- !is.na(isRemoteHdfsClient(FALSE)) || in_hdfs(.data) if(is.null(composite)) composite <- hdfsDetected @@ -78,12 +112,15 @@ as_xdf.RxDataSource <- function(.data, file=NULL, composite=NULL, ...) file <- validateXdfFile(file, composite) out <- RxXdfData(file=file, fileSystem=rxGetFileSystem(.data), createCompositeSet=composite) - rxDataStep(.data, out, rowsPerRead=.dxOptions$rowsPerRead, overwrite=TRUE, ...) 
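As a quick illustration of the conversions documented in the roxygen block above, the following sketch (the file names and the use of the built-in mtcars data are purely illustrative) imports a data frame into a standard Xdf file and then switches it to a composite Xdf:

library(dplyrXdf)

# import a data frame into a standard, single-file Xdf in the current directory
mtc_xdf <- as_standard_xdf(mtcars, file="mtcars.xdf")

# switch it to a composite Xdf: a directory of data/metadata files, so no extension
mtc_comp <- as_xdf(mtc_xdf, file="mtcars", composite=TRUE)

is_xdf(mtc_comp)            # TRUE
is_composite_xdf(mtc_xdf)   # FALSE
is_composite_xdf(mtc_comp)  # TRUE
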
+ if(in_hdfs(out)) + rxDataStep(.data, out, rowsPerRead=.dxOptions$rowsPerRead, overwrite=overwrite, ...) + else local_exec(rxDataStep(.data, out, rowsPerRead=.dxOptions$rowsPerRead, overwrite=overwrite, ...)) } +#' @rdname as_xdf #' @export -as_xdf.default <- function(.data, file=NULL, composite=NULL, ...) +as_xdf.default <- function(.data, file=NULL, composite=NULL, overwrite=TRUE, ...) { hdfsDetected <- !is.na(isRemoteHdfsClient(FALSE)) || in_hdfs() if(is.null(composite)) @@ -96,5 +133,5 @@ as_xdf.default <- function(.data, file=NULL, composite=NULL, ...) .data <- as.data.frame(.data) out <- RxXdfData(file=file, fileSystem=RxNativeFileSystem(), createCompositeSet=composite) - local_exec(rxDataStep(.data, out, rowsPerRead=.dxOptions$rowsPerRead, overwrite=TRUE, ...)) + local_exec(rxDataStep(.data, out, rowsPerRead=.dxOptions$rowsPerRead, overwrite=overwrite, ...)) } diff --git a/R/copy_to_hdfs.R b/R/copy_to_hdfs.R index 662c538..9d3c3bd 100644 --- a/R/copy_to_hdfs.R +++ b/R/copy_to_hdfs.R @@ -1,3 +1,25 @@ +#' Upload a dataset to HDFS +#' +#' @param dest The destination source: an object of class \code{\link{RxHdfsFileSystem}}. +#' @param df A dataset: can be a filename, an Xdf data source object, another RevoScaleR data source, or anything that can be coerced to a data frame. +#' @param path The HDFS directory in which to store the uploaded dataset. Defaults to the user's HDFS home directory. +#' @param overwrite Whether to overwrite any existing file. +#' @param force_composite: Whether to force the uploaded dataset to be a composite Xdf file. See details below. +#' @param ... For \code{copy_to}, further arguments to \code{\link{rxHadoopCommand}}. +#' +#' @details +#' This is the RevoScaleR HDFS method for the dplyr \code{\link[dplyr]{copy_to}} function, for uploading data to a remote database/src. The method should work with any RevoScaleR data source, or with any R object that can be converted into a data frame. If the data is not already in Xdf format, it is first imported into Xdf, and then uploaded. +#' +#' The code will handle both the cases where you are logged into the edge node of a Hadoop/Spark cluster, and if you are a remote client. For the latter case, the uploading is a two-stage process: the data is first transferred to the native filesystem of the edge node, and then copied from the edge node into HDFS. +#' +#' @return +#' An Xdf data source object pointing to the uploaded data. +#' +#' @seealso +#' \code{\link{rxHadoopCopyFromClient}}, \code{\link{rxHadoopCopyFromLocal}}, +#' \code{\link{collect}} and \code{\link{compute}} for downloading data from HDFS +#' @aliases copy_to +#' @rdname copy_to #' @export copy_to.RxHdfsFileSystem <- function(dest, df, path=NULL, overwrite=FALSE, force_composite=TRUE, ...) { @@ -37,6 +59,16 @@ copy_to.RxHdfsFileSystem <- function(dest, df, path=NULL, overwrite=FALSE, force } +#' @details +#' The \code{copy_to_hdfs} function is a simple wrapper that avoids having to create an explicit filesystem object. +#' @rdname copy_to +#' @export +copy_to_hdfs <- function(...) +{ + copy_to(RxHdfsFileSystem(), ...) +} + + hdfsUpload <- function(src, dest, nativeTarget="/tmp", overwrite, isDir, ...) 
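The upload path described above can be sketched as follows; the HDFS directory is hypothetical, and a configured Hadoop/Spark cluster (accessed either from the edge node or from a remote client) is assumed:

library(dplyrXdf)

# copy_to_hdfs() is shorthand for copy_to(RxHdfsFileSystem(), ...):
# the data is imported to Xdf if necessary, then uploaded to the given directory
mtc_hd <- copy_to_hdfs(mtcars, path="/user/me", overwrite=TRUE)

in_hdfs(mtc_hd)   # TRUE: the returned data source points at the uploaded copy
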
{ # based on rxHadoopCopyFromClient diff --git a/R/hdfs_utils.R b/R/hdfs_utils.R index d2f5a51..b128ad2 100644 --- a/R/hdfs_utils.R +++ b/R/hdfs_utils.R @@ -1,3 +1,32 @@ +#' Utilities for HDFS +#' +#' Functions for working with files in HDFS: directory listing; file copy, move and delete; directory create and delete; test for file/directory existence; check if in HDFS; expunge Trash. +#' +#' @param path A HDFS pathname. +#' @param full_path For \code{hdfs_dir}, whether to prepend the directory path to filenames to give a full path. If FALSE, only file names are returned. +#' @param include_dirs For \code{hdfs_dir}, if subdirectory names should be included. Always TRUE for non-recursive listings. +#' @param recursive For \code{hdfs_dir}, if the listing should recurse into subdirectories. +#' @param dirs_only For \code{hdfs_dir} if \emph{only} subdirectory names should be included. +#' @param pattern For \code{hdfs_dir}, an optional \link{regular expression}. Only file names that match will be returned. +#' @param ... For \code{hdfs_dir}, further switches, prefixed by \code{"-"}, to pass to the Hadoop \code{fs -ls} command. For other functions, further arguments to pass to \code{\link{rxHadoopCommand.}} +#' @param convert_backslashes Whether to convert any backslashes found in the input to forward slashes. +#' @param src,dest For \code{hdfs_file_copy} and \code{hdfs_file_move}, the source and destination paths. +#' +#' @details +#' These are utility functions to simplify working with files and directories in HDFS. For the most part, they wrap lower-level functions provided by RevoScaleR, which in turn wrap various Hadoop file system commands. They work with any file that is stored in HDFS, not just Xdf files. +#' +#' The \code{hdfs_dir} function is analogous to \code{dir} for the native filesystem. Like that function, and unlike \code{\link{rxHadoopListFiles}}, it returns a vector of filenames (\code{rxHadoopListFiles} returns a vector of \emph{printed output} from the \code{hadoop fs -ls} command, which is not quite the same thing). Again unlike \code{rxHadoopListFiles}, it does not print anything by default (the \code{print} method takes care of that). +#' +#' @return +#' \code{hdfs_dir} returns a vector of filenames, optionally with the full path attached. +#' +#' @seealso +#' \code{\link{dir}}, \code{link{dir.exists}}, \code{\link{file.exists}}, \code{\link{dir.create}}, +#' \code{\link{file.copy}}, \code{\link{file.rename}}, \code{\link{file.remove}}, \code{\link{unlink}}, +#' \code{\link{rxHadoopListFiles}}, \code{\link{rxHadoopFileExists}}, +#' \code{\link{rxHadoopMakeDir}}, \code{\link{rxHadoopRemoveDir}}, +#' \code{\link{rxHadoopCopy}}, \code{\link{rxHadoopMove}}, \code{\link{rxHadoopRemove}} +#' @rdname hdfs #' @export hdfs_dir <- function(path=".", ..., full_path=FALSE, include_dirs=FALSE, recursive=FALSE, dirs_only=FALSE, pattern=NULL, convert_backslashes=TRUE) @@ -25,6 +54,7 @@ hdfs_dir <- function(path=".", ..., full_path=FALSE, include_dirs=FALSE, recursi output <- output[substr(output, 1, 1) == "d"] #output <- gsub("^[^/]*(/.*)$", "\\1", output) + ## NOTE: regex below will break on filenames with a space output <- substr(output, regexpr("[^ ]+$", output), nchar(output)) if(!full_path && !recursive) @@ -38,6 +68,7 @@ hdfs_dir <- function(path=".", ..., full_path=FALSE, include_dirs=FALSE, recursi } +#' @rdname hdfs #' @export print.dplyrXdf_hdfs_dir <- function(x, ...) { @@ -50,6 +81,12 @@ print.dplyrXdf_hdfs_dir <- function(x, ...) 
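To make the listing semantics concrete, here is a small sketch of hdfs_dir with hypothetical paths; unlike rxHadoopListFiles, each call returns a character vector of names rather than captured console output:

library(dplyrXdf)

# bare file names in an HDFS directory
hdfs_dir("/user/me")

# full paths, recursing into subdirectories
hdfs_dir("/user/me", full_path=TRUE, recursive=TRUE)

# only subdirectory names
hdfs_dir("/user/me", dirs_only=TRUE)

# only *.csv files, passing an extra switch through to hadoop fs -ls
hdfs_dir("/user/me", "-h", pattern="\\.csv$")
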
} +#' @details +#' \code{hdfs_dir_exists} and \code{hdfs_file_exists} test for the existence of a given directory and file, respectively. They are analogous to \code{dir.exists} and \code{file.exists} for the native filesystem. +#' +#' @return +#' \code{hdfs_dir_exists} and \code{hdfs_file_exists} return TRUE or FALSE depending on whether the directory or file exists. +#' @rdname hdfs #' @export hdfs_dir_exists <- function(path, convert_backslashes=TRUE) { @@ -60,6 +97,7 @@ hdfs_dir_exists <- function(path, convert_backslashes=TRUE) # for symmetry with hdfs_dir_exists +#' @rdname hdfs #' @export hdfs_file_exists <- function(path, convert_backslashes=TRUE) { @@ -68,6 +106,12 @@ hdfs_file_exists <- function(path, convert_backslashes=TRUE) } +#' @details +#' \code{hdfs_dir_create} and \code{hdfs_dir_remove} create and remove directories. They are analogous to \code{dir.create} and \code{unlink(recursive=TRUE)} for the native filesystem. +#' +#' @return +#' The other \code{hdfs_*} functions return TRUE or FALSE depending on whether the operation succeeded. +#' @rdname hdfs #' @export hdfs_dir_create <- function(path, ..., convert_backslashes=TRUE) { @@ -76,6 +120,7 @@ hdfs_dir_create <- function(path, ..., convert_backslashes=TRUE) } +#' @rdname hdfs #' @export hdfs_dir_remove <- function(path, ..., convert_backslashes=TRUE) { @@ -84,6 +129,9 @@ hdfs_dir_remove <- function(path, ..., convert_backslashes=TRUE) } +#' @details +#' \code{hdfs_file_copy} and \code{hdfs_file_move} copy and move files. They are analogous to \code{file.copy} and \code{file.rename} for the native filesystem. Unlike \code{\link{rxHadoopCopy}} and \code{\link{rxHadoopMove}}, they are vectorised in both \code{src} and \code{dest}. +#' @rdname hdfs #' @export hdfs_file_copy <- function(src, dest, ..., overwrite=TRUE, convert_backslashes=TRUE) { @@ -95,14 +143,7 @@ hdfs_file_copy <- function(src, dest, ..., overwrite=TRUE, convert_backslashes=T } -#' @export -hdfs_file_remove <- function(path, ..., convert_backslashes=TRUE) -{ - path <- convertBS(path, convert_backslashes) - rxHadoopRemove(path, ...) -} - - +#' @rdname hdfs #' @export hdfs_file_move <- function(src, dest, ..., convert_backslashes=TRUE) { @@ -114,6 +155,20 @@ hdfs_file_move <- function(src, dest, ..., convert_backslashes=TRUE) } +#' @details +#' \code{hdfs_file_remove} deletes files. It is analogous to \code{file.remove} and \code{unlink} for the native filesystem. +#' @rdname hdfs +#' @export +hdfs_file_remove <- function(path, ..., convert_backslashes=TRUE) +{ + path <- convertBS(path, convert_backslashes) + rxHadoopRemove(path, ...) +} + + +#' @details +#' \code{hdfs_expunge} empties the HDFS trash. +#' @rdname hdfs #' @export hdfs_expunge <- function() { @@ -121,6 +176,11 @@ hdfs_expunge <- function() } +#' @param obj For \code{in_hdfs}, An R object, typically a RevoScaleR data source object. +#' +#' @return +#' \code{in_hdfs} returns whether the given object is stored in HDFS. This will be TRUE for an Xdf data source or file data source in HDFS, or a Spark data source. Classes for the latter include \code{RxHiveData}, \code{RxParquetData} and \code{RxOrcData}. If no argument is specified, returns whether the default filesystem is HDFS. +#' @rdname hdfs #' @export in_hdfs <- function(obj=NULL) { @@ -131,6 +191,20 @@ in_hdfs <- function(obj=NULL) } +#' Runs an expression in the local compute context +#' +#' @param expr An expression to execute. Normally something that depends on the compute context, such as \code{rxDataStep}. 
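A brief sketch of the file-management helpers just described, again using hypothetical paths:

library(dplyrXdf)

if(!hdfs_dir_exists("/user/me/scratch"))
    hdfs_dir_create("/user/me/scratch")

# copy and check; hdfs_file_copy is vectorised, so several files can be copied at once
hdfs_file_copy(c("/user/me/a.csv", "/user/me/b.csv"), "/user/me/scratch")
hdfs_file_exists("/user/me/scratch/a.csv")

# tidy up: remove the directory, then empty the HDFS trash
hdfs_dir_remove("/user/me/scratch")
hdfs_expunge()

# with no argument, in_hdfs() reports on the default filesystem
in_hdfs()
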
+#' @param context The compute context in which to execute \code{expr}. Defaults to local. +#' +#' @details +#' This function is useful when you are working with datasets in both the native filesystem and HDFS. The workhorse RevoScaleR function for data transformation, \code{rxDataStep}, will complain if you are in a distributed compute context such as \code{\link{RxHadoopMR}} or \code{\link{RxSpark}}, and you want to process a dataset in the native filesystem. You can wrap your code inside a \code{local_exec} call to switch to the local compute context temporarily, and then switch back when it has finished running. +#' +#' @return +#' The value of \code{expr}. +#' +#' @seealso +#' \code{\link{eval}} +#' @rdname local_exec #' @export local_exec <- function(expr, context="local") { diff --git a/R/is_xdf.R b/R/is_xdf.R new file mode 100644 index 0000000..adf3da3 --- /dev/null +++ b/R/is_xdf.R @@ -0,0 +1,37 @@ +#' Detect and coerce to Xdf data source objects +#' +#' Functions to detect and coerce to Xdf data source objects. +#' +#' @param x An R object. +#' +#' @details +#' The \code{is_xdf} function returns TRUE if \code{x} is an Xdf data source object; ie, it inherits from the \code{RxXdfData} class. This includes both raw Xdf data sources and \code{tbl_xdf} objects as created by dplyrXdf. The \code{is_composite_xdf} function returns TRUE if \code{x} is a \emph{composite} Xdf data source. +#' +#' Detecting whether an object is a composite Xdf can be tricky and \code{is_composite_xdf} goes through a few steps to do this. If \code{x} has a non-NULL \code{createCompositeSet} slot, then that value is returned. Otherwise, it checks whether the \code{file} slot refers to an existing directory, whose name does \emph{not} have an extension (that is, \code{"foo"} qualifies as a valid filename for a composite Xdf, but not \code{"foo.xdf"}). This is necessary because of the semantics of \code{rxDataStep}. +#' +#' To remove any ambiguity, it's recommended that you always explicitly specify the \code{createCompositeSet} argument when creating an Xdf data source object (objects created by dplyrXdf will always do this). +#' +#' @rdname as_xdf +#' @export +is_xdf <- function(x) +{ + inherits(x, "RxXdfData") +} + +#' @rdname as_xdf +#' @export +is_composite_xdf <- function(x) +{ + if(!is_xdf(x)) + return(FALSE) + + composite <- x@createCompositeSet + if(!is.null(composite)) + return(composite) + + # check if this file refers to an existing directory + file <- x@file + if(in_hdfs(x)) + return(tools::file_ext(file) == "" && hdfs_dir_exists(file)) + else return(tools::file_ext(file) == "" && dir.exists(file)) +} diff --git a/R/tbl_df_methods.R b/R/tbl_df_methods.R index e957e4a..0c8f3e5 100644 --- a/R/tbl_df_methods.R +++ b/R/tbl_df_methods.R @@ -10,11 +10,11 @@ NULL #' @details #' These are simple wrappers around \code{\link[RevoScaleR]{rxDataStep}}, with the check on the maximum table size turned off. You should ensure that you have enough memory for your data. #' -#' \code{as.data.frame} converts a data source object (typically an xdf file, but can also be any data source type that \code{rxDataStep} supports) into a data frame. The \code{$} and \code{[[} methods extract a single column from a data source, as a vector. +#' \code{as.data.frame} converts a data source object (typically an Xdf file, but can also be any data source type that \code{rxDataStep} supports) into a data frame. The \code{$} and \code{[[} methods extract a single column from a data source, as a vector. 
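The predicates and local_exec can be sketched as follows, assuming a local Xdf file and a configured Spark compute context (both hypothetical):

library(dplyrXdf)

mtc <- RxXdfData("mtcars.xdf")   # a standard Xdf in the native filesystem
is_xdf(mtc)                      # TRUE
is_composite_xdf(mtc)            # FALSE: a single .xdf file, not a directory

# in a distributed compute context, rxDataStep on a local file would fail;
# local_exec switches to the local context for the duration of the call
rxSetComputeContext(RxSpark())
out <- local_exec(rxDataStep(mtc, outFile="mtcars2.xdf", overwrite=TRUE))
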
#' #' @seealso #' \code{\link[base]{as.data.frame}}, \code{\link[dplyr]{collect}} -#' @aliases collect compute as.data.frame +#' @aliases as.data.frame #' @rdname as.data.frame #' @export as.data.frame.RxFileData <- function(x, maxRowsByCols=NULL, row.names=NULL, optional=TRUE, ...) @@ -26,13 +26,29 @@ as.data.frame.RxFileData <- function(x, maxRowsByCols=NULL, row.names=NULL, opti } -#' @rdname as.data.frame +#' Download a dataset to the local machine +#' +#' @param x An Xdf data source object. +#' @param as_data_frame Should the downloaded data be converted to a data frame, or left as an Xdf file? +#' +#' @details +#' The \code{collect} and \code{compute} functions can be used for two purposes: to download a dataset stored in HDFS to the native filesystem; or to convert a dataset (whether stored in HDFS or not) to a data frame. If \code{x} is an Xdf data source in HDFS, the data is downloaded as a tbl_xdf in the dplyrXdf working directory. +#' +#' The functions differ only in the default value of the \code{as_data_frame} argument. By default \code{collect} will always output a data frame, while \code{compute} will only do so if the source data was \emph{not} downloaded from HDFS. +#' +#' The code will handle both the cases where you are logged into the edge node of a Hadoop/Spark cluster, and if you are a remote client. For the latter case, the downloading is a two-stage process: the data is first transferred from HDFS to the native filesystem of the edge node, and then downloaded from the edge node to the client. +#' +#' @return +#' If \code{as_data_frame} is FALSE, a data frame. Otherwise, a tbl_xdf data source. +#' +#' @seealso +#' \code{\link[dplyr]{compute}} in package dplyr, \code{\link{copy_to}} for uploading to HDFS +#' +#' @aliases collect, compute +#' @rdname compute #' @export -collect.RxFileData <- function(x, as_data_frame=TRUE, ...) +collect.RxXdfData <- function(x, as_data_frame=TRUE, ...) { - if(is.null(as_data_frame)) - as_data_frame <- !in_hdfs(x) - if(in_hdfs(x)) { # copy from HDFS to native filesystem @@ -49,14 +65,10 @@ collect.RxFileData <- function(x, as_data_frame=TRUE, ...) } -# collect and compute differ only in as_data_frame default value -#' @rdname as.data.frame +#' @rdname compute #' @export -compute.RxFileData <- function(x, as_data_frame=NULL, ...) +compute.RxXdfData <- function(x, as_data_frame=!in_hdfs(x), ...) { - if(is.null(as_data_frame)) - as_data_frame <- !in_hdfs(x) - if(in_hdfs(x)) { # copy from HDFS to native filesystem diff --git a/R/workdir_utils.R b/R/workdir_utils.R index b703243..1c0462a 100644 --- a/R/workdir_utils.R +++ b/R/workdir_utils.R @@ -1,4 +1,4 @@ -#' Get and set the xdf tbl directory +#' Get, set and clean the xdf tbl directory #' #' By default, dplyrXdf will save the xdf files it creates in the R temporary directory. This can be a problem if it is in a location with limited disk space. Use \code{set_dplyrxdf_dir} to change the xdf tbl directory, and \code{get_dplyrxdf_dir} to view it. #' @@ -75,35 +75,33 @@ make_dplyrxdf_dir <- function(fileSystem=rxGetFileSystem()) } -#' Delete data files for xdf tbls -#' #' @param fileSystem Filesystem on which to delete the xdf tbls. #' @details -#' This is a utility function to delete the files generated by dplyrXdf. Note that all xdf files in the specified location will be removed! +#' This is a utility function to delete the files generated by dplyrXdf. Note that all files in the specified location will be removed! 
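To illustrate the download behaviour described above (the HDFS path is hypothetical, and a composite Xdf dataset is assumed to be already present there):

library(dplyrXdf)

hd <- RxXdfData("/user/me/mtcars", fileSystem=RxHdfsFileSystem(), createCompositeSet=TRUE)

mtc_df  <- collect(hd)                      # download and return a data frame
mtc_tbl <- compute(hd)                      # download but leave as a tbl_xdf
mtc_df2 <- compute(hd, as_data_frame=TRUE)  # same result as collect()

# once local, the data can also be pulled into memory directly
head(as.data.frame(mtc_tbl))
mtc_tbl$mpg    # a single column, as a vector
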
#' @rdname workdir #' @export clean_dplyrxdf_dir <- function(fileSystem=rxGetFileSystem()) { + fileSystem <- validateFileSystem(fileSystem) + if(inherits(fileSystem, "RxNativeFileSystem")) { path <- get_dplyrxdf_dir("native") files <- dir(path, full.names=TRUE) - # use unlink to allow for the possibility of composite xdfs unlink(files, recursive=TRUE) } else if(inherits(fileSystem, "RxHdfsFileSystem")) { path <- get_dplyrxdf_dir("hdfs") - # rxHadoopFileExists doesn't always exist, duplicate its functionality - pathExists <- (path == .dxOptions$hdfsWorkDir && .dxOptions$hdfsWorkDirCreated) || - (rxHadoopCommand(paste0("fs -test -e '", path, "'"), intern=FALSE) == 0) + + pathExists <- hdfs_dir_exists(path) if(pathExists) { files <- hdfs_dir(path, full_path=TRUE) - files <- substr(files, regexpr("\\s[[:alnum:][:punct:]]*$", files) + 1, nchar(files)) - rxHadoopRemoveDir(files, skipTrash=TRUE, intern=TRUE) + rxHadoopRemoveDir(files, skipTrash=TRUE) } } + invisible(NULL) } diff --git a/R/xdf_utils.R b/R/xdf_utils.R index 95a0d9b..82e804f 100644 --- a/R/xdf_utils.R +++ b/R/xdf_utils.R @@ -1,4 +1,23 @@ -# copy an Xdf file via OS commands; avoid rxDataStep +#' Utility functions for working with Xdf files +#' +#' Copy, move, rename and delete an Xdf file. +#' +#' @param src For \code{copy_xdf}, \code{move_xdf} and \code{rename_xdf}, an Xdf data source object (\emph{not} a filename). +#' @param dest For \code{copy_xdf}, \code{move_xdf} and \code{rename_xdf}, a character string giving the destination file. Note that for \code{rename_xdf}, this should only be a base name, not a full path. +#' @param overwrite Logical; for \code{copy_xdf} and \code{move_xdf}, whether to overwrite any existing file. +#' +#' @details +#' The \code{copy_xdf} function copies the Xdf file given by \code{src} to the location specified by \code{dest}, possibly renaming it as well; \code{move_xdf} moves the file. \code{rename_xdf} does a strict rename (the location of the file is unchanged, only its name). \code{delete_xdf} deletes the file given by \code{xdf}. +#' +#' These are utility functions for working with the files/directories where the data for an Xdf data source is stored. They make use of low-level OS functionality, so should be more efficient that running \code{rxDataStep}. They work with both standard and composite Xdf files, and with data stored both in the native filesystem and in HDFS. +#' +#' @return +#' \code{copy_xdf}, \code{move_xdf} and \code{rename_xdf} return an Xdf data source object pointing to the new file location. \code{delete_xdf} returns TRUE/FALSE depending on whether the delete operation succeeded. 
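A minimal sketch of these file-management functions; the file and directory names are illustrative:

library(dplyrXdf)

mtc <- as_xdf(mtcars, file="mtcars.xdf")

mtc2 <- copy_xdf(mtc, "mtcars_copy.xdf")             # copy, returning a new data source
mtc3 <- rename_xdf(mtc2, "mtcars_backup.xdf")        # strict rename: base name only
mtc4 <- move_xdf(mtc3, "archive/mtcars_backup.xdf")  # move, assuming archive/ already exists

delete_xdf(mtc4)   # TRUE if the delete succeeded
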
+#' +#' @seealso +#' \code{\link{file.copy}}, \code{\link{file.rename}}, \code{\link{unlink}}, +#' \code{\link{rxHadoopCopy}}, \code{\link{rxHadoopMove}}, \code{\link{rxHadoopRemove}}, \code{\link{rxHadoopRemoveDir}} +#' @rdname xdf_utils #' @export copy_xdf <- function(src, dest, overwrite=TRUE) { @@ -6,7 +25,7 @@ copy_xdf <- function(src, dest, overwrite=TRUE) } -# move an Xdf file via OS commands; avoid rxDataStep +#' @rdname xdf_utils #' @export move_xdf <- function(src, dest, overwrite=TRUE) { @@ -14,83 +33,62 @@ move_xdf <- function(src, dest, overwrite=TRUE) } +#' @rdname xdf_utils #' @export -rename_xdf <- function(src, newFile) +rename_xdf <- function(src, dest) { - if(dirname(newFile) == dirname(src@file)) - newFile <- basename(newFile) - else if(basename(newFile) != newFile) + if(dirname(dest) == dirname(src@file)) + dest <- basename(dest) + else if(basename(dest) != dest) stop("to move an Xdf file to a new location, use move_xdf", call.=FALSE) composite <- is_composite_xdf(src) if(in_hdfs(src)) { - newPath <- file.path(dirname(src@file), newFile, fsep="/") - rxHadoopMove(src@file, newPath) + destPath <- file.path(dirname(src@file), dest, fsep="/") + rxHadoopMove(src@file, destPath) if(composite) { # rename all files in data and metadata subdirs pat <- sprintf("^%s", basename(src@file)) - dataFiles <- hdfs_dir(newPath, full_path=TRUE, recursive=TRUE) + dataFiles <- hdfs_dir(destPath, full_path=TRUE, recursive=TRUE) dataDirs <- dirname(dataFiles) - newDataFiles <- file.path(dataDirs, sub(pat, newFile, basename(dataFiles))) + newDataFiles <- file.path(dataDirs, sub(pat, dest, basename(dataFiles))) mapply(rxHadoopMove, dataFiles, newDataFiles) } } else { - newPath <- file.path(dirname(src@file), newFile) - file.rename(src@file, newPath) + destPath <- file.path(dirname(src@file), dest) + file.rename(src@file, destPath) if(composite) { # rename all files in data and metadata subdirs pat <- sprintf("^%s", basename(src@file)) - dataFiles <- dir(newPath, pattern=pat, full.names=TRUE, recursive=TRUE) + dataFiles <- dir(destPath, pattern=pat, full.names=TRUE, recursive=TRUE) dataDirs <- dirname(dataFiles) - newDataFiles <- file.path(dataDirs, sub(pat, newFile, basename(dataFiles))) + newDataFiles <- file.path(dataDirs, sub(pat, dest, basename(dataFiles))) file.rename(dataFiles, newDataFiles) } } - modifyXdf(src, file=newPath) + modifyXdf(src, file=destPath) } +#' @param xdf For \code{delete_xdf}, an Xdf data source object. 
+#' @rdname xdf_utils #' @export delete_xdf <- function(xdf) { + if(!is_xdf(xdf)) + stop("only for deleting Xdf files") if(in_hdfs(xdf)) { if(is_composite_xdf(xdf)) rxHadoopRemoveDir(xdf@file) else rxHadoopRemove(xdf@file) } - else if(inherits(xdf, "RxXdfData")) - unlink(xdf@file, recursive=TRUE) -} - - -#' @export -is_xdf <- function(x) -{ - inherits(x, "RxXdfData") -} - - -#' @export -is_composite_xdf <- function(x) -{ - if(!is_xdf(x)) - return(FALSE) - - composite <- x@createCompositeSet - if(!is.null(composite)) - return(composite) - - # check if this file refers to an existing directory - file <- x@file - if(in_hdfs(x)) - return(tools::file_ext(file) == "" && hdfs_dir_exists(file)) - else return(tools::file_ext(file) == "" && dir.exists(file)) + else unlink(xdf@file, recursive=TRUE) } diff --git a/man/as.data.frame.Rd b/man/as.data.frame.Rd index 6839cb9..6b54065 100644 --- a/man/as.data.frame.Rd +++ b/man/as.data.frame.Rd @@ -2,11 +2,7 @@ % Please edit documentation in R/tbl_df_methods.R \name{as.data.frame.RxFileData} \alias{as.data.frame.RxFileData} -\alias{collect} -\alias{compute} \alias{as.data.frame} -\alias{collect.RxFileData} -\alias{compute.RxFileData} \alias{$.RxFileData} \alias{[[.RxFileData} \alias{pull.RxFileData} @@ -15,10 +11,6 @@ \method{as.data.frame}{RxFileData}(x, maxRowsByCols = NULL, row.names = NULL, optional = TRUE, ...) -\method{collect}{RxFileData}(x, as_data_frame = TRUE, ...) - -\method{compute}{RxFileData}(x, as_data_frame = NULL, ...) - \method{$}{RxFileData}(x, name) \method{[[}{RxFileData}(x, name, maxRowsByCols = NULL, ...) @@ -42,7 +34,7 @@ Convert a data source or tbl to a data frame \details{ These are simple wrappers around \code{\link[RevoScaleR]{rxDataStep}}, with the check on the maximum table size turned off. You should ensure that you have enough memory for your data. -\code{as.data.frame} converts a data source object (typically an xdf file, but can also be any data source type that \code{rxDataStep} supports) into a data frame. The \code{$} and \code{[[} methods extract a single column from a data source, as a vector. +\code{as.data.frame} converts a data source object (typically an Xdf file, but can also be any data source type that \code{rxDataStep} supports) into a data frame. The \code{$} and \code{[[} methods extract a single column from a data source, as a vector. } \seealso{ \code{\link[base]{as.data.frame}}, \code{\link[dplyr]{collect}} diff --git a/man/as_xdf.Rd b/man/as_xdf.Rd new file mode 100644 index 0000000..01189cc --- /dev/null +++ b/man/as_xdf.Rd @@ -0,0 +1,78 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/as_xdf.R, R/is_xdf.R +\name{as_composite_xdf} +\alias{as_composite_xdf} +\alias{as_standard_xdf} +\alias{as_xdf} +\alias{as_xdf.RxXdfData} +\alias{as_xdf.RxFileData} +\alias{as_xdf.RxDataSource} +\alias{as_xdf.default} +\alias{is_xdf} +\alias{is_composite_xdf} +\title{Detect and coerce to Xdf data source objects} +\usage{ +as_composite_xdf(...) + +as_standard_xdf(.data, ...) + +as_xdf(.data, ...) + +\method{as_xdf}{RxXdfData}(.data, file = NULL, composite = NULL, + overwrite = TRUE, ...) + +\method{as_xdf}{RxFileData}(.data, file = NULL, composite = in_hdfs(.data), + overwrite = TRUE, ...) + +\method{as_xdf}{RxDataSource}(.data, file = NULL, composite = NULL, + overwrite = TRUE, ...) + +\method{as_xdf}{default}(.data, file = NULL, composite = NULL, + overwrite = TRUE, ...) 
+ +is_xdf(x) + +is_composite_xdf(x) +} +\arguments{ +\item{...}{Other arguments to pass to \code{\link{rxDataStep}}.} + +\item{.data}{An R object that can be coerced to an Xdf data source. This includes another existing Xdf data source; see details below.} + +\item{file}{The path/filename for the Xdf data file.} + +\item{composite}{Whether to create a composite Xdf.} + +\item{overwrite}{Whether to overwrite any existing file.} + +\item{x}{An R object.} +} +\value{ +For the \code{as} functions, an Xdf data source object pointing to the created file. For the \code{is} functions, a TRUE/FALSE value. +} +\description{ +Functions to detect and coerce to Xdf data source objects. + +Functions to detect and coerce to Xdf data source objects. +} +\details{ +The \code{as_xdf} function takes the object given by \code{.data} and imports its data into an Xdf file, returning a data source pointing to that file. The file can be either a standard or a \emph{composite} Xdf, as given by the \code{composite} argument. A composite Xdf is actually a directory containing data and metadata files; it can be manipulated by the RevoScaleR functions as if it were a single dataset. + +The \code{as_standard_xdf} and \code{as_composite_xdf} functions are shorthand for \code{as_xdf(*, composite=FALSE)} and \code{as_xdf(*, composite=TRUE)} respectively; they always create either a standard or composite Xdf. You can use this to switch an existing Xdf data source from one type of Xdf to the other. Note that Xdf files in HDFS must always be composite. + +Passing a \code{tbl_xdf} object to an \code{as} function will strip off the tbl information, returning a raw Xdf data source. This can be useful for resetting the beginning of a pipeline. + +The \code{file} argument gives the name of the Xdf file to create. If not specified, this is taken from the input data source where possible (for Xdf and file data sources, including text). Otherwise, a random name is generated. If no directory is specified, the file is created in the current working directory (if in the native filesystem) or in the user directory (in HDFS). + +You can use the \code{as} functions with any RevoScaleR data source, or otherwise with any R object that can be turned into a data frame. The resulting Xdf file will be created in the same filesystem as the input data source. If the input does not have a filesystem, for example if it is an in-database table or a data frame, the file is created in the native filesystem. + +The \code{is_xdf} function returns TRUE if \code{x} is an Xdf data source object; ie, it inherits from the \code{RxXdfData} class. This includes both raw Xdf data sources and \code{tbl_xdf} objects as created by dplyrXdf. The \code{is_composite_xdf} function returns TRUE if \code{x} is a \emph{composite} Xdf data source. + +Detecting whether an object is a composite Xdf can be tricky and \code{is_composite_xdf} goes through a few steps to do this. If \code{x} has a non-NULL \code{createCompositeSet} slot, then that value is returned. Otherwise, it checks whether the \code{file} slot refers to an existing directory, whose name does \emph{not} have an extension (that is, \code{"foo"} qualifies as a valid filename for a composite Xdf, but not \code{"foo.xdf"}). This is necessary because of the semantics of \code{rxDataStep}. + +To remove any ambiguity, it's recommended that you always explicitly specify the \code{createCompositeSet} argument when creating an Xdf data source object (objects created by dplyrXdf will always do this). 
+} +\seealso{ +\code{\link{as}}, \code{\link{is}}, \code{\link{inherits}}, +\code{\link{rxDataStep}}, \code{\link{rxImport}} +} diff --git a/man/compute.Rd b/man/compute.Rd new file mode 100644 index 0000000..6611fb9 --- /dev/null +++ b/man/compute.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tbl_df_methods.R +\name{collect.RxXdfData} +\alias{collect.RxXdfData} +\alias{collect,} +\alias{compute} +\alias{compute.RxXdfData} +\title{Download a dataset to the local machine} +\usage{ +\method{collect}{RxXdfData}(x, as_data_frame = TRUE, ...) + +\method{compute}{RxXdfData}(x, as_data_frame = !in_hdfs(x), ...) +} +\arguments{ +\item{x}{An Xdf data source object.} + +\item{as_data_frame}{Should the downloaded data be converted to a data frame, or left as an Xdf file?} +} +\value{ +If \code{as_data_frame} is FALSE, a data frame. Otherwise, a tbl_xdf data source. +} +\description{ +Download a dataset to the local machine +} +\details{ +The \code{collect} and \code{compute} functions can be used for two purposes: to download a dataset stored in HDFS to the native filesystem; or to convert a dataset (whether stored in HDFS or not) to a data frame. If \code{x} is an Xdf data source in HDFS, the data is downloaded as a tbl_xdf in the dplyrXdf working directory. + +The functions differ only in the default value of the \code{as_data_frame} argument. By default \code{collect} will always output a data frame, while \code{compute} will only do so if the source data was \emph{not} downloaded from HDFS. + +The code will handle both the cases where you are logged into the edge node of a Hadoop/Spark cluster, and if you are a remote client. For the latter case, the downloading is a two-stage process: the data is first transferred from HDFS to the native filesystem of the edge node, and then downloaded from the edge node to the client. +} +\seealso{ +\code{\link[dplyr]{compute}} in package dplyr, \code{\link{copy_to}} for uploading to HDFS +} diff --git a/man/copy_to.Rd b/man/copy_to.Rd new file mode 100644 index 0000000..a74fe8f --- /dev/null +++ b/man/copy_to.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/copy_to_hdfs.R +\name{copy_to.RxHdfsFileSystem} +\alias{copy_to.RxHdfsFileSystem} +\alias{copy_to} +\alias{copy_to_hdfs} +\title{Upload a dataset to HDFS} +\usage{ +\method{copy_to}{RxHdfsFileSystem}(dest, df, path = NULL, overwrite = FALSE, + force_composite = TRUE, ...) + +copy_to_hdfs(...) +} +\arguments{ +\item{dest}{The destination source: an object of class \code{\link{RxHdfsFileSystem}}.} + +\item{df}{A dataset: can be a filename, an Xdf data source object, another RevoScaleR data source, or anything that can be coerced to a data frame.} + +\item{path}{The HDFS directory in which to store the uploaded dataset. Defaults to the user's HDFS home directory.} + +\item{overwrite}{Whether to overwrite any existing file.} + +\item{...}{For \code{copy_to}, further arguments to \code{\link{rxHadoopCommand}}.} + +\item{force_composite:}{Whether to force the uploaded dataset to be a composite Xdf file. See details below.} +} +\value{ +An Xdf data source object pointing to the uploaded data. +} +\description{ +Upload a dataset to HDFS +} +\details{ +This is the RevoScaleR HDFS method for the dplyr \code{\link[dplyr]{copy_to}} function, for uploading data to a remote database/src. The method should work with any RevoScaleR data source, or with any R object that can be converted into a data frame. 
If the data is not already in Xdf format, it is first imported into Xdf, and then uploaded. + +The code will handle both the cases where you are logged into the edge node of a Hadoop/Spark cluster, and if you are a remote client. For the latter case, the uploading is a two-stage process: the data is first transferred to the native filesystem of the edge node, and then copied from the edge node into HDFS. + +The \code{copy_to_hdfs} function is a simple wrapper that avoids having to create an explicit filesystem object. +} +\seealso{ +\code{\link{rxHadoopCopyFromClient}}, \code{\link{rxHadoopCopyFromLocal}}, +\code{\link{collect}} and \code{\link{compute}} for downloading data from HDFS +} diff --git a/man/hdfs.Rd b/man/hdfs.Rd new file mode 100644 index 0000000..d16c5a7 --- /dev/null +++ b/man/hdfs.Rd @@ -0,0 +1,95 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hdfs_utils.R +\name{hdfs_dir} +\alias{hdfs_dir} +\alias{print.dplyrXdf_hdfs_dir} +\alias{hdfs_dir_exists} +\alias{hdfs_file_exists} +\alias{hdfs_dir_create} +\alias{hdfs_dir_remove} +\alias{hdfs_file_copy} +\alias{hdfs_file_move} +\alias{hdfs_file_remove} +\alias{hdfs_expunge} +\alias{in_hdfs} +\title{Utilities for HDFS} +\usage{ +hdfs_dir(path = ".", ..., full_path = FALSE, include_dirs = FALSE, + recursive = FALSE, dirs_only = FALSE, pattern = NULL, + convert_backslashes = TRUE) + +\method{print}{dplyrXdf_hdfs_dir}(x, ...) + +hdfs_dir_exists(path, convert_backslashes = TRUE) + +hdfs_file_exists(path, convert_backslashes = TRUE) + +hdfs_dir_create(path, ..., convert_backslashes = TRUE) + +hdfs_dir_remove(path, ..., convert_backslashes = TRUE) + +hdfs_file_copy(src, dest, ..., overwrite = TRUE, convert_backslashes = TRUE) + +hdfs_file_move(src, dest, ..., convert_backslashes = TRUE) + +hdfs_file_remove(path, ..., convert_backslashes = TRUE) + +hdfs_expunge() + +in_hdfs(obj = NULL) +} +\arguments{ +\item{path}{A HDFS pathname.} + +\item{...}{For \code{hdfs_dir}, further switches, prefixed by \code{"-"}, to pass to the Hadoop \code{fs -ls} command. For other functions, further arguments to pass to \code{\link{rxHadoopCommand.}}} + +\item{full_path}{For \code{hdfs_dir}, whether to prepend the directory path to filenames to give a full path. If FALSE, only file names are returned.} + +\item{include_dirs}{For \code{hdfs_dir}, if subdirectory names should be included. Always TRUE for non-recursive listings.} + +\item{recursive}{For \code{hdfs_dir}, if the listing should recurse into subdirectories.} + +\item{dirs_only}{For \code{hdfs_dir} if \emph{only} subdirectory names should be included.} + +\item{pattern}{For \code{hdfs_dir}, an optional \link{regular expression}. Only file names that match will be returned.} + +\item{convert_backslashes}{Whether to convert any backslashes found in the input to forward slashes.} + +\item{src, dest}{For \code{hdfs_file_copy} and \code{hdfs_file_move}, the source and destination paths.} + +\item{obj}{For \code{in_hdfs}, An R object, typically a RevoScaleR data source object.} +} +\value{ +\code{hdfs_dir} returns a vector of filenames, optionally with the full path attached. + +\code{hdfs_dir_exists} and \code{hdfs_file_exists} return TRUE or FALSE depending on whether the directory or file exists. + +The other \code{hdfs_*} functions return TRUE or FALSE depending on whether the operation succeeded. + +\code{in_hdfs} returns whether the given object is stored in HDFS. This will be TRUE for an Xdf data source or file data source in HDFS, or a Spark data source. 
Classes for the latter include \code{RxHiveData}, \code{RxParquetData} and \code{RxOrcData}. If no argument is specified, returns whether the default filesystem is HDFS. +} +\description{ +Functions for working with files in HDFS: directory listing; file copy, move and delete; directory create and delete; test for file/directory existence; check if in HDFS; expunge Trash. +} +\details{ +These are utility functions to simplify working with files and directories in HDFS. For the most part, they wrap lower-level functions provided by RevoScaleR, which in turn wrap various Hadoop file system commands. They work with any file that is stored in HDFS, not just Xdf files. + +The \code{hdfs_dir} function is analogous to \code{dir} for the native filesystem. Like that function, and unlike \code{\link{rxHadoopListFiles}}, it returns a vector of filenames (\code{rxHadoopListFiles} returns a vector of \emph{printed output} from the \code{hadoop fs -ls} command, which is not quite the same thing). Again unlike \code{rxHadoopListFiles}, it does not print anything by default (the \code{print} method takes care of that). + +\code{hdfs_dir_exists} and \code{hdfs_file_exists} test for the existence of a given directory and file, respectively. They are analogous to \code{dir.exists} and \code{file.exists} for the native filesystem. + +\code{hdfs_dir_create} and \code{hdfs_dir_remove} create and remove directories. They are analogous to \code{dir.create} and \code{unlink(recursive=TRUE)} for the native filesystem. + +\code{hdfs_file_copy} and \code{hdfs_file_move} copy and move files. They are analogous to \code{file.copy} and \code{file.rename} for the native filesystem. Unlike \code{\link{rxHadoopCopy}} and \code{\link{rxHadoopMove}}, they are vectorised in both \code{src} and \code{dest}. + +\code{hdfs_file_remove} deletes files. It is analogous to \code{file.remove} and \code{unlink} for the native filesystem. + +\code{hdfs_expunge} empties the HDFS trash. +} +\seealso{ +\code{\link{dir}}, \code{link{dir.exists}}, \code{\link{file.exists}}, \code{\link{dir.create}}, +\code{\link{file.copy}}, \code{\link{file.rename}}, \code{\link{file.remove}}, \code{\link{unlink}}, +\code{\link{rxHadoopListFiles}}, \code{\link{rxHadoopFileExists}}, +\code{\link{rxHadoopMakeDir}}, \code{\link{rxHadoopRemoveDir}}, +\code{\link{rxHadoopCopy}}, \code{\link{rxHadoopMove}}, \code{\link{rxHadoopRemove}} +} diff --git a/man/local_exec.Rd b/man/local_exec.Rd new file mode 100644 index 0000000..2a7b45a --- /dev/null +++ b/man/local_exec.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/hdfs_utils.R +\name{local_exec} +\alias{local_exec} +\title{Runs an expression in the local compute context} +\usage{ +local_exec(expr, context = "local") +} +\arguments{ +\item{expr}{An expression to execute. Normally something that depends on the compute context, such as \code{rxDataStep}.} + +\item{context}{The compute context in which to execute \code{expr}. Defaults to local.} +} +\value{ +The value of \code{expr}. +} +\description{ +Runs an expression in the local compute context +} +\details{ +This function is useful when you are working with datasets in both the native filesystem and HDFS. The workhorse RevoScaleR function for data transformation, \code{rxDataStep}, will complain if you are in a distributed compute context such as \code{\link{RxHadoopMR}} or \code{\link{RxSpark}}, and you want to process a dataset in the native filesystem. 
You can wrap your code inside a \code{local_exec} call to switch to the local compute context temporarily, and then switch back when it has finished running. +} +\seealso{ +\code{\link{eval}} +} diff --git a/man/workdir.Rd b/man/workdir.Rd index 7d793ce..0527f0a 100644 --- a/man/workdir.Rd +++ b/man/workdir.Rd @@ -4,7 +4,7 @@ \alias{set_dplyrxdf_dir} \alias{get_dplyrxdf_dir} \alias{clean_dplyrxdf_dir} -\title{Get and set the xdf tbl directory} +\title{Get, set and clean the xdf tbl directory} \usage{ set_dplyrxdf_dir(path, fileSystem = rxGetFileSystem()) @@ -21,13 +21,11 @@ clean_dplyrxdf_dir(fileSystem = rxGetFileSystem()) } \description{ By default, dplyrXdf will save the xdf files it creates in the R temporary directory. This can be a problem if it is in a location with limited disk space. Use \code{set_dplyrxdf_dir} to change the xdf tbl directory, and \code{get_dplyrxdf_dir} to view it. - -Delete data files for xdf tbls } \details{ If \code{path} is supplied, \code{set_dplyrxdf_dir} creates a new directory (with a unique name) located \emph{under} \code{path}. This ensures that the files managed by dplyrXdf are properly isolated from the rest of the filesystem. -This is a utility function to delete the files generated by dplyrXdf. Note that all xdf files in the specified location will be removed! +This is a utility function to delete the files generated by dplyrXdf. Note that all files in the specified location will be removed! } \seealso{ \code{\link{rxGetFileSystem}}, \code{\link{rxSetFileSystem}} diff --git a/man/xdf_utils.Rd b/man/xdf_utils.Rd new file mode 100644 index 0000000..259c347 --- /dev/null +++ b/man/xdf_utils.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xdf_utils.R +\name{copy_xdf} +\alias{copy_xdf} +\alias{move_xdf} +\alias{rename_xdf} +\alias{delete_xdf} +\title{Utility functions for working with Xdf files} +\usage{ +copy_xdf(src, dest, overwrite = TRUE) + +move_xdf(src, dest, overwrite = TRUE) + +rename_xdf(src, dest) + +delete_xdf(xdf) +} +\arguments{ +\item{src}{For \code{copy_xdf}, \code{move_xdf} and \code{rename_xdf}, an Xdf data source object (\emph{not} a filename).} + +\item{dest}{For \code{copy_xdf}, \code{move_xdf} and \code{rename_xdf}, a character string giving the destination file. Note that for \code{rename_xdf}, this should only be a base name, not a full path.} + +\item{overwrite}{Logical; for \code{copy_xdf} and \code{move_xdf}, whether to overwrite any existing file.} + +\item{xdf}{For \code{delete_xdf}, an Xdf data source object.} +} +\value{ +\code{copy_xdf}, \code{move_xdf} and \code{rename_xdf} return an Xdf data source object pointing to the new file location. \code{delete_xdf} returns TRUE/FALSE depending on whether the delete operation succeeded. +} +\description{ +Copy, move, rename and delete an Xdf file. +} +\details{ +The \code{copy_xdf} function copies the Xdf file given by \code{src} to the location specified by \code{dest}, possibly renaming it as well; \code{move_xdf} moves the file. \code{rename_xdf} does a strict rename (the location of the file is unchanged, only its name). \code{delete_xdf} deletes the file given by \code{xdf}. + +These are utility functions for working with the files/directories where the data for an Xdf data source is stored. They make use of low-level OS functionality, so should be more efficient that running \code{rxDataStep}. They work with both standard and composite Xdf files, and with data stored both in the native filesystem and in HDFS. 
+} +\seealso{ +\code{\link{file.copy}}, \code{\link{file.rename}}, \code{\link{unlink}}, +\code{\link{rxHadoopCopy}}, \code{\link{rxHadoopMove}}, \code{\link{rxHadoopRemove}}, \code{\link{rxHadoopRemoveDir}} +}
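
Finally, a hedged end-to-end sketch tying these pieces together; the working directory, HDFS path, cluster setup and data are all hypothetical, and the usual dplyr verbs are assumed to be available for Xdf data sources:

library(dplyr)
library(dplyrXdf)

# keep dplyrXdf's intermediate tbl files under a location with plenty of space
set_dplyrxdf_dir("/data/scratch")

# upload a dataset, summarise it on the cluster, then bring the result back
mtc_hd <- copy_to_hdfs(mtcars, path="/user/me")

smry <- mtc_hd %>%
    group_by(cyl) %>%
    summarise(avg_mpg=mean(mpg)) %>%
    collect()

# remove the uploaded data and clean out the working directories
delete_xdf(mtc_hd)
clean_dplyrxdf_dir()                     # native filesystem
clean_dplyrxdf_dir(RxHdfsFileSystem())   # HDFS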