From b90c5b8b751533f9414b941f510dc8d9abcc69cf Mon Sep 17 00:00:00 2001 From: Michael Frasco Date: Thu, 20 Jul 2017 15:01:39 -0700 Subject: [PATCH 1/2] clarified documentation for max_hits default parameter and then changed default parameter from NULL to Inf --- r-pkg/R/elasticsearch_parsers.R | 24 +++++++++++------------- r-pkg/man/es_search.Rd | 6 ++++-- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/r-pkg/R/elasticsearch_parsers.R b/r-pkg/R/elasticsearch_parsers.R index ec95c2c..29fcf7b 100644 --- a/r-pkg/R/elasticsearch_parsers.R +++ b/r-pkg/R/elasticsearch_parsers.R @@ -548,7 +548,9 @@ chomp_hits <- function(hits_json = NULL, keep_nested_data_cols = TRUE) { #' @param es_host A string identifying an Elasticsearch host. This should be of the form #' \code{[transfer_protocol][hostname]:[port]}. For example, \code{'http://myindex.thing.com:9200'}. #' @param es_index The name of an Elasticsearch index to be queried. -#' @param max_hits Integer. If specified, \code{es_search} will stop pulling data as soon as it has pulled this many hits. +#' @param max_hits Integer. If specified, \code{es_search} will stop pulling data as soon +#' as it has pulled this many hits. Default is \code{Inf}, meaning that +#' all possible hits will be pulled. #' @param size Number of records per page of results. See \href{https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-from-size.html}{Elasticsearch docs} for more. #' Note that this will be reset to 0 if you submit a \code{query_body} with #' an "aggs" request in it. Also see \code{max_hits}. @@ -617,7 +619,7 @@ es_search <- function(es_host , size = 10000 , query_body = '{}' , scroll = "5m" - , max_hits = NULL + , max_hits = Inf , n_cores = ceiling(parallel::detectCores()/2) , break_on_duplicates = TRUE , ignore_scroll_restriction = FALSE @@ -680,7 +682,9 @@ es_search <- function(es_host # time you expect to pass between requests. See the # \href{https://www.elastic.co/guide/en/Elasticsearch/guide/current/scroll.html}{Elasticsearch scroll/pagination docs} # for more information. -# [param] max_hits Integer. If specified, \code{.fetch_all} will stop pulling data as soon as it passes this threshold. +# [param] max_hits Integer. If specified, \code{es_search} will stop pulling data as soon +#' as it has pulled this many hits. Default is \code{Inf}, meaning that +#' all possible hits will be pulled. # [param] n_cores Number of cores to distribute fetching + processing over. # [param] break_on_duplicates Boolean, defaults to TRUE. \code{.fetch_all} uses the size of the final object it returns # to check whether or not some data were lost during the processing. 
@@ -727,7 +731,7 @@ es_search <- function(es_host , size = 10000 , query_body = '{}' , scroll = "5m" - , max_hits = NULL + , max_hits = Inf , n_cores = ceiling(parallel::detectCores()/2) , break_on_duplicates = TRUE , ignore_scroll_restriction = FALSE @@ -756,7 +760,7 @@ es_search <- function(es_host # requesting more hits than you get is not costless: # - ES allocates a temporary data structure of size # - you end up transmitting more data over the wire than the user wants - if (!is.null(max_hits) && max_hits < size){ + if (max_hits < size) { msg <- paste0(sprintf("You requested a maximum of %s hits", max_hits), sprintf(" and a page size of %s.", size), sprintf(" Resetting size to %s for efficiency.", max_hits)) @@ -767,7 +771,7 @@ es_search <- function(es_host } # Warn if you are gonna give back a few more hits than max_hits - if (!is.null(max_hits) && max_hits %% size != 0){ + if (!is.infinite(max_hits) && max_hits %% size != 0) { msg <- paste0("When max_hits is not an exact multiple of size, it is ", "possible to get a few more than max_hits results back.") futile.logger::flog.warn(msg) @@ -925,7 +929,7 @@ es_search <- function(es_host # - returns the first page + a scroll_id which uniquely identifies the stack # [params] scroll_id - a unique key identifying the search context # out_path - A file path to write temporary output to. Passed in from .fetch_all -# max_hits - max_hits, comes from .fetch_all. If left as NULL in your call to +# max_hits - max_hits, comes from .fetch_all. If left as Inf in your call to # .fetch_all, this param has no influence and you will pull all the data. # otherwise, this is used to limit the result size. # scroll_url - Elasticsearch URL to hit to get the next page of data @@ -947,12 +951,6 @@ es_search <- function(es_host , hits_to_pull ){ - # Deal with case where user tries to say "don't limit me" by setting - # max_hits = NULL explicitly - if (is.null(max_hits)){ - max_hits <- Inf - } - while (hits_pulled < max_hits){ # Grab a page of hits, break if we got back an error diff --git a/r-pkg/man/es_search.Rd b/r-pkg/man/es_search.Rd index 2388a76..491c0e7 100644 --- a/r-pkg/man/es_search.Rd +++ b/r-pkg/man/es_search.Rd @@ -5,7 +5,7 @@ \title{Execute an ES query and get a data.table} \usage{ es_search(es_host, es_index, size = 10000, query_body = "{}", - scroll = "5m", max_hits = NULL, + scroll = "5m", max_hits = Inf, n_cores = ceiling(parallel::detectCores()/2), break_on_duplicates = TRUE, ignore_scroll_restriction = FALSE, intermediates_dir = getwd()) } @@ -29,7 +29,9 @@ time you expect to pass between requests. See the \href{https://www.elastic.co/guide/en/elasticsearch/reference/current/search-request-scroll.html}{Elasticsearch scroll/pagination docs} for more information.} -\item{max_hits}{Integer. If specified, \code{es_search} will stop pulling data as soon as it has pulled this many hits.} +\item{max_hits}{Integer. If specified, \code{es_search} will stop pulling data as soon +as it has pulled this many hits. 
Default is \code{Inf}, meaning that +all possible hits will be pulled.} \item{n_cores}{Number of cores to distribute fetching + processing over.} From 985789e34aaedd90e69010286b02ee764f7db94e Mon Sep 17 00:00:00 2001 From: Michael Frasco Date: Fri, 21 Jul 2017 08:51:15 -0700 Subject: [PATCH 2/2] removed apostrophe in non-roxygen documentation --- r-pkg/R/elasticsearch_parsers.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r-pkg/R/elasticsearch_parsers.R b/r-pkg/R/elasticsearch_parsers.R index 29fcf7b..9c087cd 100644 --- a/r-pkg/R/elasticsearch_parsers.R +++ b/r-pkg/R/elasticsearch_parsers.R @@ -683,8 +683,8 @@ es_search <- function(es_host # \href{https://www.elastic.co/guide/en/Elasticsearch/guide/current/scroll.html}{Elasticsearch scroll/pagination docs} # for more information. # [param] max_hits Integer. If specified, \code{es_search} will stop pulling data as soon -#' as it has pulled this many hits. Default is \code{Inf}, meaning that -#' all possible hits will be pulled. +# as it has pulled this many hits. Default is \code{Inf}, meaning that +# all possible hits will be pulled. # [param] n_cores Number of cores to distribute fetching + processing over. # [param] break_on_duplicates Boolean, defaults to TRUE. \code{.fetch_all} uses the size of the final object it returns # to check whether or not some data were lost during the processing.
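A small usage sketch of the behavior these patches document (it sits outside the diff and relies only on the es_search() signature shown in the es_search.Rd hunk above; the host URL and index name are placeholders, not values from the patch):

# Sketch only -- "http://myindex.thing.com:9200" and "my_index" are placeholder
# values; assumes the package providing es_search() is attached.

# New default: max_hits = Inf, so es_search() keeps paging through the scroll
# context until every matching document has been pulled.
all_hits <- es_search(
    es_host  = "http://myindex.thing.com:9200"
    , es_index = "my_index"
)

# Explicit cap: max_hits (350) is smaller than the default page size (10000),
# so the patched check resets size to 350 and logs a message. If max_hits were
# not an exact multiple of size, a separate warning notes that a few more than
# max_hits results may come back.
some_hits <- es_search(
    es_host  = "http://myindex.thing.com:9200"
    , es_index = "my_index"
    , max_hits = 350
)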