From 586c05d265ef4c51b974a7c356027b347a6167cf Mon Sep 17 00:00:00 2001 From: Michael Frasco Date: Thu, 20 Jul 2017 16:32:58 -0700 Subject: [PATCH 01/13] initial structure for retrieve_mapping --- r-pkg/R/elasticsearch_eda_funs.R | 73 ++++++++++++++++++++++++++++++++ r-pkg/R/elasticsearch_parsers.R | 1 - 2 files changed, 73 insertions(+), 1 deletion(-) diff --git a/r-pkg/R/elasticsearch_eda_funs.R b/r-pkg/R/elasticsearch_eda_funs.R index 5632648..6692513 100644 --- a/r-pkg/R/elasticsearch_eda_funs.R +++ b/r-pkg/R/elasticsearch_eda_funs.R @@ -114,3 +114,76 @@ get_counts <- function(field } +#' @title Retrieve the mapping definitions for an index or index/type in Elasticsearch +#' @name retrieve_mapping +#' @description For one or multiple index or index/type, return a data table with +#' field names and types. +#' @importFrom httr GET, content +#' @importFrom futile.logger flog.fatal +#' @param es_host A string identifying an Elasticsearch host. This should be of +#' the form \code{[transfer_protocol][hostname]:[port]}. For example, +#' \code{'http://myindex.thing.com:9200'}. +#' @param es_index A character vector that contains the names of indices for +#' which to get mappings. Default, is \code{'_all'}, which means +#' get the mapping for all indices. +#' @param es_type A character vector that contains the names of types for which +#' to get mappings. Default is \code{NULL}, which means get the +#' mapping for all types in the chosen indices. +#' @param es_field A character vector that contains the names of fields for which +#' to get mappings, which can be used when the entire mapping is +#' not desired. Default is \code{NULL}, which means get the +#' mapping for all fields in the chosen types. 
+#' @export +#' @return A data table containing the field - definition mapping for the selected +#' indices, types, and fields +retrieve_mapping <- function(es_host + , es_index = '_all' + , es_type = NULL + , es_field = NULL +) { + + # Input checking + url <- .ValidateAndFormatHost(es_host) + + # collapse character vectors into comma separated strings. If any arguments + # are NULL, create an empty string + indices <- paste(es_index, collapse = ',') + types <- paste(es_type, collapse = ',') + fields <- paste(es_field, collapse = ',') + + # build the query + if (nchar(indices) > 0) { + url <- paste(url, indices, '_mapping', sep = '/') + } else { + msg <- paste("retrive_mapping must be passed a valid es_index." + , "You provided", paste(es_index, collapse = ', ') + , 'which resulted in an empty string') + futile.logger::flog.fatal(msg) + stop(msg) + } + + if (nchar(types) > 0) { + url <- paste(url, types, sep = '/') + } + + if (nchar(fields) > 0) { + url <- paste(url, 'field', fields, sep = '/') + } + + # make the query + result <- httr::GET(url = url) + resultContent <- httr::content(result) + + # parse the result into a data table + +} + + + + + + + + + + diff --git a/r-pkg/R/elasticsearch_parsers.R b/r-pkg/R/elasticsearch_parsers.R index 29fcf7b..0211ef8 100644 --- a/r-pkg/R/elasticsearch_parsers.R +++ b/r-pkg/R/elasticsearch_parsers.R @@ -1038,7 +1038,6 @@ es_search <- function(es_host } - # [title] Execute a Search request against an Elasticsearch cluster # [name] .search_request # [description] Given a query string (JSON with valid DSL), execute a request From 49dd58674630e32622df8809923346341ae17820 Mon Sep 17 00:00:00 2001 From: Michael Frasco Date: Fri, 21 Jul 2017 01:27:40 -0700 Subject: [PATCH 02/13] completed initial version of retrieve_mapping. 
--- r-pkg/NAMESPACE | 4 + r-pkg/R/elasticsearch_eda_funs.R | 86 ++++++++++--------- r-pkg/R/uptasticsearch.R | 2 + r-pkg/man/retrieve_mapping.Rd | 35 ++++++++ .../testthat/test-repo_characteristics.R | 2 +- 5 files changed, 88 insertions(+), 41 deletions(-) create mode 100644 r-pkg/man/retrieve_mapping.Rd diff --git a/r-pkg/NAMESPACE b/r-pkg/NAMESPACE index ff99430..a952324 100644 --- a/r-pkg/NAMESPACE +++ b/r-pkg/NAMESPACE @@ -5,6 +5,7 @@ export(chomp_hits) export(es_search) export(get_counts) export(parse_date_time) +export(retrieve_mapping) export(unpack_nested_data) importFrom(data.table,":=") importFrom(data.table,as.data.table) @@ -17,6 +18,7 @@ importFrom(data.table,setnames) importFrom(futile.logger,flog.fatal) importFrom(futile.logger,flog.info) importFrom(futile.logger,flog.warn) +importFrom(httr,GET) importFrom(httr,POST) importFrom(httr,content) importFrom(httr,stop_for_status) @@ -30,4 +32,6 @@ importFrom(purrr,map2) importFrom(purrr,simplify) importFrom(purrr,transpose) importFrom(stringr,str_extract) +importFrom(stringr,str_replace) +importFrom(stringr,str_split_fixed) importFrom(uuid,UUIDgenerate) diff --git a/r-pkg/R/elasticsearch_eda_funs.R b/r-pkg/R/elasticsearch_eda_funs.R index 6692513..837333e 100644 --- a/r-pkg/R/elasticsearch_eda_funs.R +++ b/r-pkg/R/elasticsearch_eda_funs.R @@ -118,28 +118,29 @@ get_counts <- function(field #' @name retrieve_mapping #' @description For one or multiple index or index/type, return a data table with #' field names and types. -#' @importFrom httr GET, content +#' @importFrom data.table := data.table setnames #' @importFrom futile.logger flog.fatal +#' @importFrom httr GET content stop_for_status +#' @importFrom stringr str_split_fixed str_replace #' @param es_host A string identifying an Elasticsearch host. This should be of #' the form \code{[transfer_protocol][hostname]:[port]}. For example, #' \code{'http://myindex.thing.com:9200'}. 
-#' @param es_index A character vector that contains the names of indices for -#' which to get mappings. Default, is \code{'_all'}, which means -#' get the mapping for all indices. -#' @param es_type A character vector that contains the names of types for which -#' to get mappings. Default is \code{NULL}, which means get the -#' mapping for all types in the chosen indices. -#' @param es_field A character vector that contains the names of fields for which -#' to get mappings, which can be used when the entire mapping is -#' not desired. Default is \code{NULL}, which means get the -#' mapping for all fields in the chosen types. +#' @param es_indexes A character vector that contains the names of indexes for +#' which to get mappings. Default, is \code{'_all'}, which means +#' get the mapping for all indexes +#' @param es_types A character vector that contains the names of types for which +#' to get mappings. Default is \code{NULL}, which means get the +#' mapping for all types in the chosen indexes #' @export -#' @return A data table containing the field - definition mapping for the selected -#' indices, types, and fields +#' @return A data.table containing four columns: index, type, field, and datatype +#' @examples \dontrun{ +#' # get the mapping for all types in the ticket_sales index +#' mappingDT <- retrieve_mapping(es_host = "http://es.custdb.mycompany.com:9200" +#' , es_indexes = "ticket_sales") +#' } retrieve_mapping <- function(es_host - , es_index = '_all' - , es_type = NULL - , es_field = NULL + , es_indexes = '_all' + , es_types = NULL ) { # Input checking @@ -147,43 +148,48 @@ retrieve_mapping <- function(es_host # collapse character vectors into comma separated strings. 
If any arguments # are NULL, create an empty string - indices <- paste(es_index, collapse = ',') - types <- paste(es_type, collapse = ',') - fields <- paste(es_field, collapse = ',') + indexes <- paste(es_indexes, collapse = ',') + types <- paste(es_types, collapse = ',') - # build the query - if (nchar(indices) > 0) { - url <- paste(url, indices, '_mapping', sep = '/') + ########################## build the query ################################ + if (nchar(indexes) > 0) { + url <- paste(url, indexes, '_mapping', sep = '/') } else { - msg <- paste("retrive_mapping must be passed a valid es_index." - , "You provided", paste(es_index, collapse = ', ') + msg <- paste("retrive_mapping must be passed a valid es_indexes." + , "You provided", paste(es_indexes, collapse = ', ') , 'which resulted in an empty string') futile.logger::flog.fatal(msg) stop(msg) } + # check if the user specified any types if (nchar(types) > 0) { url <- paste(url, types, sep = '/') } - if (nchar(fields) > 0) { - url <- paste(url, 'field', fields, sep = '/') - } - - # make the query + ########################## make the query ################################ result <- httr::GET(url = url) + httr::stop_for_status(result) resultContent <- httr::content(result) - # parse the result into a data table + ######################### parse the result ############################### + # flatten the list object that is returned from the query + flattened <- unlist(resultContent) + + # the names of the flattened object has the index, type, and field name + # however, it also has extra terms that we can use to split the name + # into three distinct parts + mappingCols <- stringr::str_split_fixed(names(flattened), '\\.(mappings|properties)\\.', n = 3) + mappingDT <- data.table::as.data.table(mappingCols) + data.table::setnames(mappingDT, c('index', 'type', 'field')) + # if the field is a nested object or has multiple indexes, the field name + # have extra terms that we can remove + removeRegEx <- 
'\\.(properties|fields|type)' + mappingDT[, field := stringr::str_replace_all(field, removeRegEx, '')] + + # add the actual data type as a new column in the data table + mappingDT[, datatype := as.character(flattened)] + + return(mappingDT) } - - - - - - - - - - diff --git a/r-pkg/R/uptasticsearch.R b/r-pkg/R/uptasticsearch.R index d8277f0..b80f231 100644 --- a/r-pkg/R/uptasticsearch.R +++ b/r-pkg/R/uptasticsearch.R @@ -8,4 +8,6 @@ utils::globalVariables(c('.' , '.I' , '.id' + , 'field' + , 'datatype' )) \ No newline at end of file diff --git a/r-pkg/man/retrieve_mapping.Rd b/r-pkg/man/retrieve_mapping.Rd new file mode 100644 index 0000000..7a26b21 --- /dev/null +++ b/r-pkg/man/retrieve_mapping.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/elasticsearch_eda_funs.R +\name{retrieve_mapping} +\alias{retrieve_mapping} +\title{Retrieve the mapping definitions for an index or index/type in Elasticsearch} +\usage{ +retrieve_mapping(es_host, es_indexes = "_all", es_types = NULL) +} +\arguments{ +\item{es_host}{A string identifying an Elasticsearch host. This should be of +the form \code{[transfer_protocol][hostname]:[port]}. For example, +\code{'http://myindex.thing.com:9200'}.} + +\item{es_indexes}{A character vector that contains the names of indexes for +which to get mappings. Default, is \code{'_all'}, which means +get the mapping for all indexes} + +\item{es_types}{A character vector that contains the names of types for which +to get mappings. Default is \code{NULL}, which means get the +mapping for all types in the chosen indexes} +} +\value{ +A data.table containing four columns: index, type, field, and datatype +} +\description{ +For one or multiple index or index/type, return a data table with + field names and types. 
+} +\examples{ +\dontrun{ +# get the mapping for all types in the ticket_sales index +mappingDT <- retrieve_mapping(es_host = "http://es.custdb.mycompany.com:9200" + , es_indexes = "ticket_sales") +} +} diff --git a/r-pkg/tests/testthat/test-repo_characteristics.R b/r-pkg/tests/testthat/test-repo_characteristics.R index 6b916bd..631ee23 100644 --- a/r-pkg/tests/testthat/test-repo_characteristics.R +++ b/r-pkg/tests/testthat/test-repo_characteristics.R @@ -35,7 +35,7 @@ test_that('R CMD check should not return any unexpected errors, warnings, or not testthat::skip_on_cran() # Check the package - x <- devtools::check(pkg = '../../../uptasticsearch' + x <- devtools::check(pkg = '../../../r-pkg' , document = TRUE , args = '--no-tests --ignore-vignettes' , quiet = FALSE) From 0cb15d90d22e8b56cd7ca8a4441a7df88baf0a0c Mon Sep 17 00:00:00 2001 From: Michael Frasco Date: Fri, 21 Jul 2017 11:25:42 -0700 Subject: [PATCH 03/13] renamed retrieve_mapping to get_fields, clarified documentation, and removed meta data from resultDT --- r-pkg/NAMESPACE | 3 +- r-pkg/R/elasticsearch_eda_funs.R | 54 +++++++++---------- r-pkg/R/uptasticsearch.R | 1 - .../{retrieve_mapping.Rd => get_fields.Rd} | 22 ++++---- 4 files changed, 35 insertions(+), 45 deletions(-) rename r-pkg/man/{retrieve_mapping.Rd => get_fields.Rd} (50%) diff --git a/r-pkg/NAMESPACE b/r-pkg/NAMESPACE index a952324..6f57094 100644 --- a/r-pkg/NAMESPACE +++ b/r-pkg/NAMESPACE @@ -4,8 +4,8 @@ export(chomp_aggs) export(chomp_hits) export(es_search) export(get_counts) +export(get_fields) export(parse_date_time) -export(retrieve_mapping) export(unpack_nested_data) importFrom(data.table,":=") importFrom(data.table,as.data.table) @@ -31,6 +31,7 @@ importFrom(parallel,stopCluster) importFrom(purrr,map2) importFrom(purrr,simplify) importFrom(purrr,transpose) +importFrom(stringr,str_detect) importFrom(stringr,str_extract) importFrom(stringr,str_replace) importFrom(stringr,str_split_fixed) diff --git a/r-pkg/R/elasticsearch_eda_funs.R 
b/r-pkg/R/elasticsearch_eda_funs.R index 837333e..75ab8c2 100644 --- a/r-pkg/R/elasticsearch_eda_funs.R +++ b/r-pkg/R/elasticsearch_eda_funs.R @@ -114,42 +114,38 @@ get_counts <- function(field } -#' @title Retrieve the mapping definitions for an index or index/type in Elasticsearch -#' @name retrieve_mapping -#' @description For one or multiple index or index/type, return a data table with -#' field names and types. -#' @importFrom data.table := data.table setnames +#' @title Get the names and data types of the indexed fields in an index +#' @name get_fields +#' @description For a given Elasticsearch index, return the mapping from field name +#' to data type for all indexed fields. +#' @importFrom data.table := as.data.table setnames #' @importFrom futile.logger flog.fatal #' @importFrom httr GET content stop_for_status -#' @importFrom stringr str_split_fixed str_replace +#' @importFrom stringr str_detect str_split_fixed str_replace #' @param es_host A string identifying an Elasticsearch host. This should be of #' the form \code{[transfer_protocol][hostname]:[port]}. For example, #' \code{'http://myindex.thing.com:9200'}. #' @param es_indexes A character vector that contains the names of indexes for #' which to get mappings. Default, is \code{'_all'}, which means -#' get the mapping for all indexes -#' @param es_types A character vector that contains the names of types for which -#' to get mappings. Default is \code{NULL}, which means get the -#' mapping for all types in the chosen indexes +#' get the mapping for all indexes. Names of indexes can be +#' treated as regular expressions. 
#' @export #' @return A data.table containing four columns: index, type, field, and datatype #' @examples \dontrun{ -#' # get the mapping for all types in the ticket_sales index +#' # get the mapping for all types in the ticket_sales and customers indexes #' mappingDT <- retrieve_mapping(es_host = "http://es.custdb.mycompany.com:9200" -#' , es_indexes = "ticket_sales") +#' , es_indexes = c("ticket_sales", "indexes")) #' } -retrieve_mapping <- function(es_host - , es_indexes = '_all' - , es_types = NULL +get_fields <- function(es_host + , es_indexes = '_all' ) { # Input checking - url <- .ValidateAndFormatHost(es_host) + url <- uptasticsearch:::.ValidateAndFormatHost(es_host) # collapse character vectors into comma separated strings. If any arguments # are NULL, create an empty string indexes <- paste(es_indexes, collapse = ',') - types <- paste(es_types, collapse = ',') ########################## build the query ################################ if (nchar(indexes) > 0) { @@ -162,11 +158,6 @@ retrieve_mapping <- function(es_host stop(msg) } - # check if the user specified any types - if (nchar(types) > 0) { - url <- paste(url, types, sep = '/') - } - ########################## make the query ################################ result <- httr::GET(url = url) httr::stop_for_status(result) @@ -180,16 +171,19 @@ retrieve_mapping <- function(es_host # however, it also has extra terms that we can use to split the name # into three distinct parts mappingCols <- stringr::str_split_fixed(names(flattened), '\\.(mappings|properties)\\.', n = 3) - mappingDT <- data.table::as.data.table(mappingCols) - data.table::setnames(mappingDT, c('index', 'type', 'field')) - # if the field is a nested object or has multiple indexes, the field name - # have extra terms that we can remove - removeRegEx <- '\\.(properties|fields|type)' - mappingDT[, field := stringr::str_replace_all(field, removeRegEx, '')] + # convert to data table and add the data type column + mappingDT <- 
data.table::data.table(mappingCols, as.character(flattened)) + data.table::setnames(mappingDT, c('index', 'type', 'field', 'datatype')) + + # remove any rows, where the field does not end in ".type" to remove meta info + mappingDT <- mappingDT[stringr::str_detect(field, '\\.type')] - # add the actual data type as a new column in the data table - mappingDT[, datatype := as.character(flattened)] + # mappings in nested objects have sub-fields called properties + # mappings of fields that are indexed in different ways have multiple fields + # we want to remove these terms from the field name + metaRegEx <- '\\.(properties|fields|type)' + mappingDT[, field := stringr::str_replace_all(field, metaRegEx, '')] return(mappingDT) } diff --git a/r-pkg/R/uptasticsearch.R b/r-pkg/R/uptasticsearch.R index b80f231..196d28b 100644 --- a/r-pkg/R/uptasticsearch.R +++ b/r-pkg/R/uptasticsearch.R @@ -9,5 +9,4 @@ utils::globalVariables(c('.' , '.I' , '.id' , 'field' - , 'datatype' )) \ No newline at end of file diff --git a/r-pkg/man/retrieve_mapping.Rd b/r-pkg/man/get_fields.Rd similarity index 50% rename from r-pkg/man/retrieve_mapping.Rd rename to r-pkg/man/get_fields.Rd index 7a26b21..e5ead2a 100644 --- a/r-pkg/man/retrieve_mapping.Rd +++ b/r-pkg/man/get_fields.Rd @@ -1,10 +1,10 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/elasticsearch_eda_funs.R -\name{retrieve_mapping} -\alias{retrieve_mapping} -\title{Retrieve the mapping definitions for an index or index/type in Elasticsearch} +\name{get_fields} +\alias{get_fields} +\title{Get the names and data types of the indexed fields in an index} \usage{ -retrieve_mapping(es_host, es_indexes = "_all", es_types = NULL) +get_fields(es_host, es_indexes = "_all") } \arguments{ \item{es_host}{A string identifying an Elasticsearch host. This should be of @@ -13,23 +13,19 @@ the form \code{[transfer_protocol][hostname]:[port]}. 
For example, \item{es_indexes}{A character vector that contains the names of indexes for which to get mappings. Default, is \code{'_all'}, which means -get the mapping for all indexes} - -\item{es_types}{A character vector that contains the names of types for which -to get mappings. Default is \code{NULL}, which means get the -mapping for all types in the chosen indexes} +get the mapping for all indexes.} } \value{ A data.table containing four columns: index, type, field, and datatype } \description{ -For one or multiple index or index/type, return a data table with - field names and types. +For a given Elasticsearch index, return the mapping from field name + to data type for all indexed fields. } \examples{ \dontrun{ -# get the mapping for all types in the ticket_sales index +# get the mapping for all types in the ticket_sales and customers indexes mappingDT <- retrieve_mapping(es_host = "http://es.custdb.mycompany.com:9200" - , es_indexes = "ticket_sales") + , es_indexes = c("ticket_sales", "indexes")) } } From c699c3a2242e86bce7ba0144466ded843fb09fd1 Mon Sep 17 00:00:00 2001 From: Michael Frasco Date: Fri, 21 Jul 2017 14:11:05 -0700 Subject: [PATCH 04/13] fixed package namespacing and roxygen errors --- r-pkg/R/elasticsearch_eda_funs.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/r-pkg/R/elasticsearch_eda_funs.R b/r-pkg/R/elasticsearch_eda_funs.R index 75ab8c2..257276e 100644 --- a/r-pkg/R/elasticsearch_eda_funs.R +++ b/r-pkg/R/elasticsearch_eda_funs.R @@ -118,10 +118,10 @@ get_counts <- function(field #' @name get_fields #' @description For a given Elasticsearch index, return the mapping from field name #' to data type for all indexed fields. 
-#' @importFrom data.table := as.data.table setnames +#' @importFrom data.table := data.table setnames #' @importFrom futile.logger flog.fatal #' @importFrom httr GET content stop_for_status -#' @importFrom stringr str_detect str_split_fixed str_replace +#' @importFrom stringr str_detect str_split_fixed str_replace_all #' @param es_host A string identifying an Elasticsearch host. This should be of #' the form \code{[transfer_protocol][hostname]:[port]}. For example, #' \code{'http://myindex.thing.com:9200'}. @@ -141,7 +141,7 @@ get_fields <- function(es_host ) { # Input checking - url <- uptasticsearch:::.ValidateAndFormatHost(es_host) + url <- .ValidateAndFormatHost(es_host) # collapse character vectors into comma separated strings. If any arguments # are NULL, create an empty string @@ -151,7 +151,7 @@ get_fields <- function(es_host if (nchar(indexes) > 0) { url <- paste(url, indexes, '_mapping', sep = '/') } else { - msg <- paste("retrive_mapping must be passed a valid es_indexes." + msg <- paste("get_fields must be passed a valid es_indexes." 
, "You provided", paste(es_indexes, collapse = ', ') , 'which resulted in an empty string') futile.logger::flog.fatal(msg) From 4e02a7ce6874900d802ad06b446a6ec9a2f053d3 Mon Sep 17 00:00:00 2001 From: Michael Frasco Date: Fri, 21 Jul 2017 14:19:01 -0700 Subject: [PATCH 05/13] adding documentation --- r-pkg/NAMESPACE | 2 +- r-pkg/man/get_fields.Rd | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/r-pkg/NAMESPACE b/r-pkg/NAMESPACE index 6f57094..2a339a3 100644 --- a/r-pkg/NAMESPACE +++ b/r-pkg/NAMESPACE @@ -33,6 +33,6 @@ importFrom(purrr,simplify) importFrom(purrr,transpose) importFrom(stringr,str_detect) importFrom(stringr,str_extract) -importFrom(stringr,str_replace) +importFrom(stringr,str_replace_all) importFrom(stringr,str_split_fixed) importFrom(uuid,UUIDgenerate) diff --git a/r-pkg/man/get_fields.Rd b/r-pkg/man/get_fields.Rd index e5ead2a..66ae3ef 100644 --- a/r-pkg/man/get_fields.Rd +++ b/r-pkg/man/get_fields.Rd @@ -13,7 +13,8 @@ the form \code{[transfer_protocol][hostname]:[port]}. For example, \item{es_indexes}{A character vector that contains the names of indexes for which to get mappings. Default, is \code{'_all'}, which means -get the mapping for all indexes.} +get the mapping for all indexes. 
Names of indexes can be +treated as regular expressions.} } \value{ A data.table containing four columns: index, type, field, and datatype From 84f847491559d4f7bef3719fdf1487aeee3a9761 Mon Sep 17 00:00:00 2001 From: Michael Frasco Date: Sat, 22 Jul 2017 18:51:02 -0700 Subject: [PATCH 06/13] addressed comments on get_fields and added unit tests --- r-pkg/DESCRIPTION | 1 + r-pkg/NAMESPACE | 1 + r-pkg/R/elasticsearch_eda_funs.R | 35 ++++++++--- r-pkg/R/uptasticsearch.R | 1 + r-pkg/inst/testdata/one_index_mapping.json | 29 +++++++++ r-pkg/inst/testdata/two_index_mapping.json | 52 +++++++++++++++ r-pkg/man/get_fields.Rd | 4 +- .../testthat/test-elasticsearch_eda_funs.R | 63 +++++++++++++++++++ 8 files changed, 175 insertions(+), 11 deletions(-) create mode 100644 r-pkg/inst/testdata/one_index_mapping.json create mode 100644 r-pkg/inst/testdata/two_index_mapping.json create mode 100644 r-pkg/tests/testthat/test-elasticsearch_eda_funs.R diff --git a/r-pkg/DESCRIPTION b/r-pkg/DESCRIPTION index 38965dd..6675d48 100644 --- a/r-pkg/DESCRIPTION +++ b/r-pkg/DESCRIPTION @@ -6,6 +6,7 @@ Authors@R: c( person("James", "Lamb", email = "james.lamb@uptake.com", role = c("aut", "cre")), person("Nick", "Paras", email = "nick.paras@uptake.com", role = c("aut")), person("Austin", "Dickey", email = "austin.dickey@uptake.com", role = c("aut")), + person("Michael", "Frasco", email = "mfrasco6@gmail.com", role = c("ctb")), person("Uptake Technologies Inc.", role = c("cph"))) Maintainer: James Lamb Description: diff --git a/r-pkg/NAMESPACE b/r-pkg/NAMESPACE index 2a339a3..4fb4dba 100644 --- a/r-pkg/NAMESPACE +++ b/r-pkg/NAMESPACE @@ -15,6 +15,7 @@ importFrom(data.table,rbindlist) importFrom(data.table,setcolorder) importFrom(data.table,setkeyv) importFrom(data.table,setnames) +importFrom(data.table,uniqueN) importFrom(futile.logger,flog.fatal) importFrom(futile.logger,flog.info) importFrom(futile.logger,flog.warn) diff --git a/r-pkg/R/elasticsearch_eda_funs.R b/r-pkg/R/elasticsearch_eda_funs.R 
index 257276e..23cd17b 100644 --- a/r-pkg/R/elasticsearch_eda_funs.R +++ b/r-pkg/R/elasticsearch_eda_funs.R @@ -118,10 +118,8 @@ get_counts <- function(field #' @name get_fields #' @description For a given Elasticsearch index, return the mapping from field name #' to data type for all indexed fields. -#' @importFrom data.table := data.table setnames -#' @importFrom futile.logger flog.fatal +#' @importFrom futile.logger flog.fatal flog.info #' @importFrom httr GET content stop_for_status -#' @importFrom stringr str_detect str_split_fixed str_replace_all #' @param es_host A string identifying an Elasticsearch host. This should be of #' the form \code{[transfer_protocol][hostname]:[port]}. For example, #' \code{'http://myindex.thing.com:9200'}. @@ -130,9 +128,9 @@ get_counts <- function(field #' get the mapping for all indexes. Names of indexes can be #' treated as regular expressions. #' @export -#' @return A data.table containing four columns: index, type, field, and datatype +#' @return A data.table containing four columns: index, type, field, and data_type #' @examples \dontrun{ -#' # get the mapping for all types in the ticket_sales and customers indexes +#' # get the mapping for all indexed fields in the ticket_sales and customers indexes #' mappingDT <- retrieve_mapping(es_host = "http://es.custdb.mycompany.com:9200" #' , es_indexes = c("ticket_sales", "indexes")) #' } @@ -159,13 +157,26 @@ get_fields <- function(es_host } ########################## make the query ################################ + futile.logger::flog.info(paste('Getting indexed fields for indexes:', indexes)) + result <- httr::GET(url = url) httr::stop_for_status(result) resultContent <- httr::content(result) + ##################### return the flattened result ######################### + return(.flatten_mapping(mapping = resultContent)) +} + +# [title] Flatten a mapping list of field name to data type into a data table +# [mapping] A list of json that is returned from a request to the mappings 
API +#' @importFrom data.table := data.table setnames uniqueN +#' @importFrom futile.logger flog.info +#' @importFrom stringr str_detect str_split_fixed str_replace_all +.flatten_mapping <- function(mapping) { + ######################### parse the result ############################### # flatten the list object that is returned from the query - flattened <- unlist(resultContent) + flattened <- unlist(mapping) # the names of the flattened object has the index, type, and field name # however, it also has extra terms that we can use to split the name @@ -173,11 +184,12 @@ get_fields <- function(es_host mappingCols <- stringr::str_split_fixed(names(flattened), '\\.(mappings|properties)\\.', n = 3) # convert to data table and add the data type column - mappingDT <- data.table::data.table(mappingCols, as.character(flattened)) - data.table::setnames(mappingDT, c('index', 'type', 'field', 'datatype')) + mappingDT <- data.table::data.table(meta = mappingCols, data_type = as.character(flattened)) + newColNames <- c('index', 'type', 'field', 'data_type') + data.table::setnames(mappingDT, old = names(mappingDT), new = newColNames) # remove any rows, where the field does not end in ".type" to remove meta info - mappingDT <- mappingDT[stringr::str_detect(field, '\\.type')] + mappingDT <- mappingDT[stringr::str_detect(field, '\\.type$')] # mappings in nested objects have sub-fields called properties # mappings of fields that are indexed in different ways have multiple fields @@ -185,5 +197,10 @@ get_fields <- function(es_host metaRegEx <- '\\.(properties|fields|type)' mappingDT[, field := stringr::str_replace_all(field, metaRegEx, '')] + # log some information about this request to the user + numFields <- nrow(mappingDT) + numIndex <- mappingDT[, data.table::uniqueN(index)] + futile.logger::flog.info(paste('Retrieved', numFields, 'fields across', numIndex, 'indexes')) + return(mappingDT) } diff --git a/r-pkg/R/uptasticsearch.R b/r-pkg/R/uptasticsearch.R index 196d28b..b3497dd 
100644 --- a/r-pkg/R/uptasticsearch.R +++ b/r-pkg/R/uptasticsearch.R @@ -9,4 +9,5 @@ utils::globalVariables(c('.' , '.I' , '.id' , 'field' + , 'index' )) \ No newline at end of file diff --git a/r-pkg/inst/testdata/one_index_mapping.json b/r-pkg/inst/testdata/one_index_mapping.json new file mode 100644 index 0000000..d1c2c68 --- /dev/null +++ b/r-pkg/inst/testdata/one_index_mapping.json @@ -0,0 +1,29 @@ +{ + "basketball": { + "mappings": { + "players": { + "properties": { + "team": { + "type": "keyword" + }, + "name": { + "properties": { + "first": { + "type": "text" + }, + "last": { + "type": "text" + } + } + }, + "age": { + "type": "integer" + }, + "position": { + "type": "keyword" + } + } + } + } + } +} \ No newline at end of file diff --git a/r-pkg/inst/testdata/two_index_mapping.json b/r-pkg/inst/testdata/two_index_mapping.json new file mode 100644 index 0000000..e9f85f1 --- /dev/null +++ b/r-pkg/inst/testdata/two_index_mapping.json @@ -0,0 +1,52 @@ +{ + "company": { + "mappings": { + "building": { + "properties": { + "id": { + "type": "long" + }, + "address": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + } + } + } + } + }, + "hotel": { + "mappings": { + "bed_room": { + "properties": { + "num_beds": { + "type": "integer" + }, + "description": { + "type": "text" + } + } + }, + "conference_room": { + "properties": { + "num_people": { + "type": "integer" + }, + "purpose": { + "type": "text", + "fields": { + "keyword": { + "type": "keyword", + "ignore_above": 256 + } + } + } + } + } + } + } +} \ No newline at end of file diff --git a/r-pkg/man/get_fields.Rd b/r-pkg/man/get_fields.Rd index 66ae3ef..4b8b40c 100644 --- a/r-pkg/man/get_fields.Rd +++ b/r-pkg/man/get_fields.Rd @@ -17,7 +17,7 @@ get the mapping for all indexes. 
Names of indexes can be treated as regular expressions.} } \value{ -A data.table containing four columns: index, type, field, and datatype +A data.table containing four columns: index, type, field, and data_type } \description{ For a given Elasticsearch index, return the mapping from field name @@ -25,7 +25,7 @@ For a given Elasticsearch index, return the mapping from field name } \examples{ \dontrun{ -# get the mapping for all types in the ticket_sales and customers indexes +# get the mapping for all indexed fields in the ticket_sales and customers indexes mappingDT <- retrieve_mapping(es_host = "http://es.custdb.mycompany.com:9200" , es_indexes = c("ticket_sales", "indexes")) } diff --git a/r-pkg/tests/testthat/test-elasticsearch_eda_funs.R b/r-pkg/tests/testthat/test-elasticsearch_eda_funs.R new file mode 100644 index 0000000..91abdce --- /dev/null +++ b/r-pkg/tests/testthat/test-elasticsearch_eda_funs.R @@ -0,0 +1,63 @@ +context("Elasticsearch eda functions") + +# Configure logger (suppress all logs in testing) +loggerOptions <- futile.logger::logger.options() +if (!identical(loggerOptions, list())){ + origLogThreshold <- loggerOptions[[1]][['threshold']] +} else { + origLogThreshold <- futile.logger::INFO +} +futile.logger::flog.threshold(0) + +#--- 1. get_counts + + + +#--- 2. 
get_fields + + # Gives an informative error if es_indexes is NULL or an empty string + test_that("get_fields should give an informative error if es_indexes is NULL or an empty string", + { + expect_error(get_fields(es_host = "http://es.custdb.mycompany.com:9200" + , es_indexes = NULL), + regexp = "get_fields must be passed a valid es_indexes") + expect_error(get_fields(es_host = "http://es.custdb.mycompany.com:9200" + , es_indexes = ''), + regexp = "get_fields must be passed a valid es_indexes") + } + ) + + # Works if one index is passed + test_that("get_fields should work if the mapping for one index is provided", + { + test_json <- system.file("testdata", "one_index_mapping.json", package = "uptasticsearch") + mapping <- jsonlite::fromJSON(txt = test_json) + mappingDT <- uptasticsearch:::.flatten_mapping(mapping = mapping) + expected <- data.table::data.table( + index = rep('basketball', 5) + , type = rep('players', 5) + , field = c('team', 'name.first', 'name.last', 'age', 'position') + , data_type = c('keyword', 'text', 'text', 'integer', 'keyword') + ) + expect_identical(mappingDT, expected) + } + ) + + # works if multiple indexes are passed + test_that("get_fields should work if the mapping for multiple indexes are provided", + { + test_json <- system.file("testdata", "two_index_mapping.json", package = "uptasticsearch") + mapping <- jsonlite::fromJSON(txt = test_json) + mappingDT <- uptasticsearch:::.flatten_mapping(mapping = mapping) + expected <- data.table::data.table( + index = c(rep('company', 3), rep('hotel', 5)) + , type = c(rep('building', 3), rep('bed_room', 2), rep('conference_room', 3)) + , field = c('id', 'address', 'address.keyword', 'num_beds', 'description' + , 'num_people', 'purpose', 'purpose.keyword') + , data_type = c('long', 'text', 'keyword', 'integer', 'text', 'integer' + , 'text', 'keyword') + ) + expect_identical(mappingDT, expected) + } + ) + \ No newline at end of file From a00cd2dd8a61578d3893f9e73761e20b57592438 Mon Sep 17 
00:00:00 2001 From: Michael Frasco Date: Sun, 23 Jul 2017 20:33:35 -0700 Subject: [PATCH 07/13] added support for aliases --- r-pkg/NAMESPACE | 2 + r-pkg/R/elasticsearch_eda_funs.R | 76 ++++++++++++++++--- r-pkg/man/get_fields.Rd | 6 +- .../testthat/test-elasticsearch_eda_funs.R | 29 ++++++- 4 files changed, 100 insertions(+), 13 deletions(-) diff --git a/r-pkg/NAMESPACE b/r-pkg/NAMESPACE index 4fb4dba..b741b4a 100644 --- a/r-pkg/NAMESPACE +++ b/r-pkg/NAMESPACE @@ -34,6 +34,8 @@ importFrom(purrr,simplify) importFrom(purrr,transpose) importFrom(stringr,str_detect) importFrom(stringr,str_extract) +importFrom(stringr,str_replace) importFrom(stringr,str_replace_all) +importFrom(stringr,str_split) importFrom(stringr,str_split_fixed) importFrom(uuid,UUIDgenerate) diff --git a/r-pkg/R/elasticsearch_eda_funs.R b/r-pkg/R/elasticsearch_eda_funs.R index 23cd17b..77f95d8 100644 --- a/r-pkg/R/elasticsearch_eda_funs.R +++ b/r-pkg/R/elasticsearch_eda_funs.R @@ -120,6 +120,7 @@ get_counts <- function(field #' to data type for all indexed fields. #' @importFrom futile.logger flog.fatal flog.info #' @importFrom httr GET content stop_for_status +#' @importFrom data.table := uniqueN #' @param es_host A string identifying an Elasticsearch host. This should be of #' the form \code{[transfer_protocol][hostname]:[port]}. For example, #' \code{'http://myindex.thing.com:9200'}. @@ -127,6 +128,9 @@ get_counts <- function(field #' which to get mappings. Default, is \code{'_all'}, which means #' get the mapping for all indexes. Names of indexes can be #' treated as regular expressions. +#' @param use_alias A boolean flag that controls whether the true Elasticsearch +#' index name or the aliased name for an index is returned. +#' Default is \code{TRUE}. 
#' @export #' @return A data.table containing four columns: index, type, field, and data_type #' @examples \dontrun{ @@ -136,6 +140,7 @@ get_counts <- function(field #' } get_fields <- function(es_host , es_indexes = '_all' + , use_alias = TRUE ) { # Input checking @@ -163,14 +168,30 @@ get_fields <- function(es_host httr::stop_for_status(result) resultContent <- httr::content(result) - ##################### return the flattened result ######################### - return(.flatten_mapping(mapping = resultContent)) + ######################### flatten the result ############################## + mappingDT <- .flatten_mapping(mapping = resultContent) + + ##################### get aliases for index names ######################### + if (use_alias) { + aliasDT <- .get_aliases(es_host = es_host) + if (!is.null(aliasDT)) { + lookup <- aliasDT[['alias']] + names(lookup) <- aliasDT[['index']] + mappingDT[index %in% names(lookup), index := lookup[index]] + } + } + + # log some information about this request to the user + numFields <- nrow(mappingDT) + numIndex <- mappingDT[, data.table::uniqueN(index)] + futile.logger::flog.info(paste('Retrieved', numFields, 'fields across', numIndex, 'indexes')) + + return(mappingDT) } # [title] Flatten a mapping list of field name to data type into a data table # [mapping] A list of json that is returned from a request to the mappings API -#' @importFrom data.table := data.table setnames uniqueN -#' @importFrom futile.logger flog.info +#' @importFrom data.table := data.table setnames #' @importFrom stringr str_detect str_split_fixed str_replace_all .flatten_mapping <- function(mapping) { @@ -197,10 +218,47 @@ get_fields <- function(es_host metaRegEx <- '\\.(properties|fields|type)' mappingDT[, field := stringr::str_replace_all(field, metaRegEx, '')] - # log some information about this request to the user - numFields <- nrow(mappingDT) - numIndex <- mappingDT[, data.table::uniqueN(index)] - futile.logger::flog.info(paste('Retrieved', numFields, 
'fields across', numIndex, 'indexes')) - return(mappingDT) } + +# [title] Get a data.table containing names of indexes and aliases +# [es_host] A string identifying an Elasticsearch host. +#' @importFrom httr content GET stop_for_status +.get_aliases <- function(es_host) { + + # construct the url to the alias endpoint + url <- paste0(es_host, '/_cat/aliases') + + # make the request + result <- httr::GET(url = url) + httr::stop_for_status(result) + resultContent <- httr::content(result) + + if (is.null(resultContent)) { + # there are no aliases in this Elasticsearch cluster + return(NULL) + } else { + return(.process_alias(alias_string = resultContent)) + } +} + +# [title] Process the string returned by the GET alias API into a data table +# [alias_string] A string returned by the alias API with index and alias name +#' @importFrom stringr str_replace str_split +#' @importFrom data.table as.data.table setnames +.process_alias <- function(alias_string) { + # remove the new line at the end of the string, if it exists + aliasString <- stringr::str_replace(alias_string, '\n$', '') + + # split each entry, separated by a new line, into a vector in a list + aliases <- stringr::str_split(aliasString, '\n')[[1]] + + # remove white space and only take the first two entries + aliases <- stringr::str_split(aliases, '\\s+') + aliases <- lapply(aliases, function(pair) pair[1:2]) + + # create a data table from the resulting list + aliasDT <- data.table::as.data.table(matrix(unlist(aliases), byrow = TRUE, ncol = 2)) + data.table::setnames(aliasDT, old = colnames(aliasDT), new = c('alias', 'index')) + return(aliasDT) +} diff --git a/r-pkg/man/get_fields.Rd b/r-pkg/man/get_fields.Rd index 4b8b40c..cb36899 100644 --- a/r-pkg/man/get_fields.Rd +++ b/r-pkg/man/get_fields.Rd @@ -4,7 +4,7 @@ \alias{get_fields} \title{Get the names and data types of the indexed fields in an index} \usage{ -get_fields(es_host, es_indexes = "_all") +get_fields(es_host, es_indexes = "_all", use_alias = TRUE) 
} \arguments{ \item{es_host}{A string identifying an Elasticsearch host. This should be of @@ -15,6 +15,10 @@ the form \code{[transfer_protocol][hostname]:[port]}. For example, which to get mappings. Default, is \code{'_all'}, which means get the mapping for all indexes. Names of indexes can be treated as regular expressions.} + +\item{use_alias}{A boolean flag that controls whether the true Elasticsearch +index name or the aliased name for an index is returned. +Default is \code{TRUE}.} } \value{ A data.table containing four columns: index, type, field, and data_type diff --git a/r-pkg/tests/testthat/test-elasticsearch_eda_funs.R b/r-pkg/tests/testthat/test-elasticsearch_eda_funs.R index 91abdce..dd8fdef 100644 --- a/r-pkg/tests/testthat/test-elasticsearch_eda_funs.R +++ b/r-pkg/tests/testthat/test-elasticsearch_eda_funs.R @@ -27,8 +27,10 @@ futile.logger::flog.threshold(0) } ) +#--- 3. .flatten_mapping + # Works if one index is passed - test_that("get_fields should work if the mapping for one index is provided", + test_that(".flatten_mapping should work if the mapping for one index is provided", { test_json <- system.file("testdata", "one_index_mapping.json", package = "uptasticsearch") mapping <- jsonlite::fromJSON(txt = test_json) @@ -44,7 +46,7 @@ futile.logger::flog.threshold(0) ) # works if multiple indexes are passed - test_that("get_fields should work if the mapping for multiple indexes are provided", + test_that(".flatten_mapping should work if the mapping for multiple indexes are provided", { test_json <- system.file("testdata", "two_index_mapping.json", package = "uptasticsearch") mapping <- jsonlite::fromJSON(txt = test_json) @@ -60,4 +62,25 @@ futile.logger::flog.threshold(0) expect_identical(mappingDT, expected) } ) - \ No newline at end of file + +#--- 4. 
.process_alias + + # works if one alias is passed + test_that(".process_alias works if one alias is included", + { + alias_string <- 'dwm shakespeare - - -\n' + aliasDT <- uptasticsearch:::.process_alias(alias_string = alias_string) + expected <- data.table::data.table(alias = 'dwm', index = 'shakespeare') + expect_identical(aliasDT, expected) + } + ) + + # works if multiple aliases are passed + test_that(".process_alias works if one alias is included", + { + alias_string <- 'dwm shakespeare - - -\nmoney bank - - -\n' + aliasDT <- uptasticsearch:::.process_alias(alias_string = alias_string) + expected <- data.table::data.table(alias = c('dwm', 'money'), index = c('shakespeare', 'bank')) + expect_identical(aliasDT, expected) + } + ) From 38b2d16e110825050c9d500439a47c5294c4617b Mon Sep 17 00:00:00 2001 From: Michael Frasco Date: Sun, 23 Jul 2017 22:39:00 -0700 Subject: [PATCH 08/13] removed use_alias argument --- r-pkg/R/elasticsearch_eda_funs.R | 20 +++++++------------- r-pkg/man/get_fields.Rd | 6 +----- 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/r-pkg/R/elasticsearch_eda_funs.R b/r-pkg/R/elasticsearch_eda_funs.R index 77f95d8..ed05aa8 100644 --- a/r-pkg/R/elasticsearch_eda_funs.R +++ b/r-pkg/R/elasticsearch_eda_funs.R @@ -128,9 +128,6 @@ get_counts <- function(field #' which to get mappings. Default, is \code{'_all'}, which means #' get the mapping for all indexes. Names of indexes can be #' treated as regular expressions. -#' @param use_alias A boolean flag that controls whether the true Elasticsearch -#' index name or the aliased name for an index is returned. -#' Default is \code{TRUE}. 
#' @export #' @return A data.table containing four columns: index, type, field, and data_type #' @examples \dontrun{ @@ -140,7 +137,6 @@ get_counts <- function(field #' } get_fields <- function(es_host , es_indexes = '_all' - , use_alias = TRUE ) { # Input checking @@ -166,19 +162,17 @@ get_fields <- function(es_host result <- httr::GET(url = url) httr::stop_for_status(result) - resultContent <- httr::content(result) + resultContent <- httr::content(result, as = 'parsed') ######################### flatten the result ############################## mappingDT <- .flatten_mapping(mapping = resultContent) ##################### get aliases for index names ######################### - if (use_alias) { - aliasDT <- .get_aliases(es_host = es_host) - if (!is.null(aliasDT)) { - lookup <- aliasDT[['alias']] - names(lookup) <- aliasDT[['index']] - mappingDT[index %in% names(lookup), index := lookup[index]] - } + aliasDT <- .get_aliases(es_host = es_host) + if (!is.null(aliasDT)) { + lookup <- aliasDT[['alias']] + names(lookup) <- aliasDT[['index']] + mappingDT[index %in% names(lookup), index := lookup[index]] } # log some information about this request to the user @@ -232,7 +226,7 @@ get_fields <- function(es_host # make the request result <- httr::GET(url = url) httr::stop_for_status(result) - resultContent <- httr::content(result) + resultContent <- httr::content(result, as = 'text') if (is.null(resultContent)) { # there are no aliases in this Elasticsearch cluster diff --git a/r-pkg/man/get_fields.Rd b/r-pkg/man/get_fields.Rd index cb36899..4b8b40c 100644 --- a/r-pkg/man/get_fields.Rd +++ b/r-pkg/man/get_fields.Rd @@ -4,7 +4,7 @@ \alias{get_fields} \title{Get the names and data types of the indexed fields in an index} \usage{ -get_fields(es_host, es_indexes = "_all", use_alias = TRUE) +get_fields(es_host, es_indexes = "_all") } \arguments{ \item{es_host}{A string identifying an Elasticsearch host. 
This should be of @@ -15,10 +15,6 @@ the form \code{[transfer_protocol][hostname]:[port]}. For example, which to get mappings. Default, is \code{'_all'}, which means get the mapping for all indexes. Names of indexes can be treated as regular expressions.} - -\item{use_alias}{A boolean flag that controls whether the true Elasticsearch -index name or the aliased name for an index is returned. -Default is \code{TRUE}.} } \value{ A data.table containing four columns: index, type, field, and data_type From 91b4c56d9ac186431ad875396d2fca49ba8007f0 Mon Sep 17 00:00:00 2001 From: Michael Frasco Date: Tue, 25 Jul 2017 11:54:13 -0700 Subject: [PATCH 09/13] added mocked test for get_fields --- .../testthat/test-elasticsearch_eda_funs.R | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/r-pkg/tests/testthat/test-elasticsearch_eda_funs.R b/r-pkg/tests/testthat/test-elasticsearch_eda_funs.R index dd8fdef..74b2958 100644 --- a/r-pkg/tests/testthat/test-elasticsearch_eda_funs.R +++ b/r-pkg/tests/testthat/test-elasticsearch_eda_funs.R @@ -27,6 +27,36 @@ futile.logger::flog.threshold(0) } ) + # works as expected when mocked + test_that('get_fields works as expected when mocked', + { + test_json <- system.file("testdata", "two_index_mapping.json", package = "uptasticsearch") + aliasDT <- data.table::data.table(alias = c('alias1', 'alias2') + , index = c('company', 'otherIndex')) + testthat::with_mock( + `httr::stop_for_status` = function(...) {return(NULL)}, + `httr::GET` = function(...) {return(NULL)}, + `httr::content` = function(...) {return(jsonlite::fromJSON(txt = test_json))}, + `uptasticsearch::.get_aliases` = function(...) 
{return(aliasDT)}, + { + outDT <- get_fields(es_host = 'http://db.mycompany.com:9200' + , es_indexes = c('company', 'hotel')) + data.table::setkey(outDT, NULL) + expected <- data.table::data.table( + index = c(rep('alias1', 3), rep('hotel', 5)) + , type = c(rep('building', 3), rep('bed_room', 2), rep('conference_room', 3)) + , field = c('id', 'address', 'address.keyword', 'num_beds', 'description' + , 'num_people', 'purpose', 'purpose.keyword') + , data_type = c('long', 'text', 'keyword', 'integer', 'text', 'integer' + , 'text', 'keyword') + ) + expect_identical(outDT, expected) + } + ) + } + ) + + #--- 3. .flatten_mapping # Works if one index is passed From d25b6c646b17803d3ef6619fdad28b9889e654e0 Mon Sep 17 00:00:00 2001 From: Michael Frasco Date: Wed, 26 Jul 2017 19:38:41 -0700 Subject: [PATCH 10/13] changed indexes to indices --- r-pkg/R/elasticsearch_eda_funs.R | 26 +++++++++---------- r-pkg/man/get_fields.Rd | 10 +++---- .../testthat/test-elasticsearch_eda_funs.R | 18 ++++++------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/r-pkg/R/elasticsearch_eda_funs.R b/r-pkg/R/elasticsearch_eda_funs.R index ed05aa8..00061c0 100644 --- a/r-pkg/R/elasticsearch_eda_funs.R +++ b/r-pkg/R/elasticsearch_eda_funs.R @@ -124,19 +124,19 @@ get_counts <- function(field #' @param es_host A string identifying an Elasticsearch host. This should be of #' the form \code{[transfer_protocol][hostname]:[port]}. For example, #' \code{'http://myindex.thing.com:9200'}. -#' @param es_indexes A character vector that contains the names of indexes for +#' @param es_indices A character vector that contains the names of indices for #' which to get mappings. Default, is \code{'_all'}, which means -#' get the mapping for all indexes. Names of indexes can be +#' get the mapping for all indices Names of indices can be #' treated as regular expressions. 
#' @export #' @return A data.table containing four columns: index, type, field, and data_type #' @examples \dontrun{ -#' # get the mapping for all indexed fields in the ticket_sales and customers indexes +#' # get the mapping for all indexed fields in the ticket_sales and customers indices #' mappingDT <- retrieve_mapping(es_host = "http://es.custdb.mycompany.com:9200" -#' , es_indexes = c("ticket_sales", "indexes")) +#' , es_indices = c("ticket_sales", "customers")) #' } get_fields <- function(es_host - , es_indexes = '_all' + , es_indices = '_all' ) { # Input checking @@ -144,21 +144,21 @@ get_fields <- function(es_host # collapse character vectors into comma separated strings. If any arguments # are NULL, create an empty string - indexes <- paste(es_indexes, collapse = ',') + indices <- paste(es_indices, collapse = ',') ########################## build the query ################################ - if (nchar(indexes) > 0) { - url <- paste(url, indexes, '_mapping', sep = '/') + if (nchar(indices) > 0) { + url <- paste(url, indices, '_mapping', sep = '/') } else { - msg <- paste("get_fields must be passed a valid es_indexes." - , "You provided", paste(es_indexes, collapse = ', ') + msg <- paste("get_fields must be passed a valid es_indices." 
+ , "You provided", paste(es_indices, collapse = ', ') , 'which resulted in an empty string') futile.logger::flog.fatal(msg) stop(msg) } ########################## make the query ################################ - futile.logger::flog.info(paste('Getting indexed fields for indexes:', indexes)) + futile.logger::flog.info(paste('Getting indexed fields for indices:', indices)) result <- httr::GET(url = url) httr::stop_for_status(result) @@ -178,7 +178,7 @@ get_fields <- function(es_host # log some information about this request to the user numFields <- nrow(mappingDT) numIndex <- mappingDT[, data.table::uniqueN(index)] - futile.logger::flog.info(paste('Retrieved', numFields, 'fields across', numIndex, 'indexes')) + futile.logger::flog.info(paste('Retrieved', numFields, 'fields across', numIndex, 'indices')) return(mappingDT) } @@ -215,7 +215,7 @@ get_fields <- function(es_host return(mappingDT) } -# [title] Get a data.table containing names of indexes and aliases +# [title] Get a data.table containing names of indices and aliases # [es_host] A string identifying an Elasticsearch host. #' @importFrom httr content GET stop_for_status .get_aliases <- function(es_host) { diff --git a/r-pkg/man/get_fields.Rd b/r-pkg/man/get_fields.Rd index 4b8b40c..3d32dd1 100644 --- a/r-pkg/man/get_fields.Rd +++ b/r-pkg/man/get_fields.Rd @@ -4,16 +4,16 @@ \alias{get_fields} \title{Get the names and data types of the indexed fields in an index} \usage{ -get_fields(es_host, es_indexes = "_all") +get_fields(es_host, es_indices = "_all") } \arguments{ \item{es_host}{A string identifying an Elasticsearch host. This should be of the form \code{[transfer_protocol][hostname]:[port]}. For example, \code{'http://myindex.thing.com:9200'}.} -\item{es_indexes}{A character vector that contains the names of indexes for +\item{es_indices}{A character vector that contains the names of indices for which to get mappings. Default, is \code{'_all'}, which means -get the mapping for all indexes. 
Names of indexes can be +get the mapping for all indices Names of indices can be treated as regular expressions.} } \value{ @@ -25,8 +25,8 @@ For a given Elasticsearch index, return the mapping from field name } \examples{ \dontrun{ -# get the mapping for all indexed fields in the ticket_sales and customers indexes +# get the mapping for all indexed fields in the ticket_sales and customers indices mappingDT <- retrieve_mapping(es_host = "http://es.custdb.mycompany.com:9200" - , es_indexes = c("ticket_sales", "indexes")) + , es_indices = c("ticket_sales", "customers")) } } diff --git a/r-pkg/tests/testthat/test-elasticsearch_eda_funs.R b/r-pkg/tests/testthat/test-elasticsearch_eda_funs.R index 74b2958..186e8b3 100644 --- a/r-pkg/tests/testthat/test-elasticsearch_eda_funs.R +++ b/r-pkg/tests/testthat/test-elasticsearch_eda_funs.R @@ -15,15 +15,15 @@ futile.logger::flog.threshold(0) #--- 2. get_fields - # Gives an informative error if es_indexes is NULL or an empty string - test_that("get_fields should give an informative error if es_indexes is NULL or an empty string", + # Gives an informative error if es_indices is NULL or an empty string + test_that("get_fields should give an informative error if es_indices is NULL or an empty string", { expect_error(get_fields(es_host = "http://es.custdb.mycompany.com:9200" - , es_indexes = NULL), - regexp = "get_fields must be passed a valid es_indexes") + , es_indices = NULL), + regexp = "get_fields must be passed a valid es_indices") expect_error(get_fields(es_host = "http://es.custdb.mycompany.com:9200" - , es_indexes = ''), - regexp = "get_fields must be passed a valid es_indexes") + , es_indices = ''), + regexp = "get_fields must be passed a valid es_indices") } ) @@ -40,7 +40,7 @@ futile.logger::flog.threshold(0) `uptasticsearch::.get_aliases` = function(...) 
{return(aliasDT)}, { outDT <- get_fields(es_host = 'http://db.mycompany.com:9200' - , es_indexes = c('company', 'hotel')) + , es_indices = c('company', 'hotel')) data.table::setkey(outDT, NULL) expected <- data.table::data.table( index = c(rep('alias1', 3), rep('hotel', 5)) @@ -75,8 +75,8 @@ futile.logger::flog.threshold(0) } ) - # works if multiple indexes are passed - test_that(".flatten_mapping should work if the mapping for multiple indexes are provided", + # works if multiple indices are passed + test_that(".flatten_mapping should work if the mapping for multiple indices are provided", { test_json <- system.file("testdata", "two_index_mapping.json", package = "uptasticsearch") mapping <- jsonlite::fromJSON(txt = test_json) From 944d5e1285d22a0465697d62fbc00002500923e1 Mon Sep 17 00:00:00 2001 From: Michael Frasco Date: Fri, 28 Jul 2017 07:39:57 -0700 Subject: [PATCH 11/13] fixed typo in roxygen documentation --- r-pkg/R/elasticsearch_eda_funs.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/r-pkg/R/elasticsearch_eda_funs.R b/r-pkg/R/elasticsearch_eda_funs.R index 00061c0..a587e87 100644 --- a/r-pkg/R/elasticsearch_eda_funs.R +++ b/r-pkg/R/elasticsearch_eda_funs.R @@ -125,8 +125,8 @@ get_counts <- function(field #' the form \code{[transfer_protocol][hostname]:[port]}. For example, #' \code{'http://myindex.thing.com:9200'}. #' @param es_indices A character vector that contains the names of indices for -#' which to get mappings. Default, is \code{'_all'}, which means -#' get the mapping for all indices Names of indices can be +#' which to get mappings. Default is \code{'_all'}, which means +#' get the mapping for all indices. Names of indices can be #' treated as regular expressions. 
#' @export #' @return A data.table containing four columns: index, type, field, and data_type From 055982d51b6a815406a8425e51666726971c1059 Mon Sep 17 00:00:00 2001 From: Michael Frasco Date: Fri, 28 Jul 2017 14:24:18 -0700 Subject: [PATCH 12/13] simplified process_alias to use read.table instead of string parsing --- r-pkg/NAMESPACE | 3 +-- r-pkg/R/elasticsearch_eda_funs.R | 21 ++++++--------------- r-pkg/R/uptasticsearch.R | 2 ++ r-pkg/man/get_fields.Rd | 4 ++-- 4 files changed, 11 insertions(+), 19 deletions(-) diff --git a/r-pkg/NAMESPACE b/r-pkg/NAMESPACE index b741b4a..2b7aab3 100644 --- a/r-pkg/NAMESPACE +++ b/r-pkg/NAMESPACE @@ -34,8 +34,7 @@ importFrom(purrr,simplify) importFrom(purrr,transpose) importFrom(stringr,str_detect) importFrom(stringr,str_extract) -importFrom(stringr,str_replace) importFrom(stringr,str_replace_all) -importFrom(stringr,str_split) importFrom(stringr,str_split_fixed) +importFrom(utils,read.table) importFrom(uuid,UUIDgenerate) diff --git a/r-pkg/R/elasticsearch_eda_funs.R b/r-pkg/R/elasticsearch_eda_funs.R index a587e87..50556da 100644 --- a/r-pkg/R/elasticsearch_eda_funs.R +++ b/r-pkg/R/elasticsearch_eda_funs.R @@ -238,21 +238,12 @@ get_fields <- function(es_host # [title] Process the string returned by the GET alias API into a data table # [alias_string] A string returned by the alias API with index and alias name -#' @importFrom stringr str_replace str_split -#' @importFrom data.table as.data.table setnames +#' @importFrom data.table data.table +#' @importFrom utils read.table .process_alias <- function(alias_string) { - # remove the new line at the end of the string, if it exists - aliasString <- stringr::str_replace(alias_string, '\n$', '') + # process the string provided by the /_cat/aliases API into a data.frame and then a data.table + aliasDT <- data.table::data.table(utils::read.table(text = alias_string, stringsAsFactors = FALSE)) - # split each entry, separated by a new line, into a vector in a list - aliases <- 
stringr::str_split(aliasString, '\n')[[1]] - - # remove white space and only take the first two entries - aliases <- stringr::str_split(aliases, '\\s+') - aliases <- lapply(aliases, function(pair) pair[1:2]) - - # create a data table from the resulting list - aliasDT <- data.table::as.data.table(matrix(unlist(aliases), byrow = TRUE, ncol = 2)) - data.table::setnames(aliasDT, old = colnames(aliasDT), new = c('alias', 'index')) - return(aliasDT) + # return only the first two columns + return(aliasDT[, .(alias = V1, index = V2)]) } diff --git a/r-pkg/R/uptasticsearch.R b/r-pkg/R/uptasticsearch.R index b3497dd..75c9262 100644 --- a/r-pkg/R/uptasticsearch.R +++ b/r-pkg/R/uptasticsearch.R @@ -10,4 +10,6 @@ utils::globalVariables(c('.' , '.id' , 'field' , 'index' + , 'V1' + , 'V2' )) \ No newline at end of file diff --git a/r-pkg/man/get_fields.Rd b/r-pkg/man/get_fields.Rd index 3d32dd1..fe558aa 100644 --- a/r-pkg/man/get_fields.Rd +++ b/r-pkg/man/get_fields.Rd @@ -12,8 +12,8 @@ the form \code{[transfer_protocol][hostname]:[port]}. For example, \code{'http://myindex.thing.com:9200'}.} \item{es_indices}{A character vector that contains the names of indices for -which to get mappings. Default, is \code{'_all'}, which means -get the mapping for all indices Names of indices can be +which to get mappings. Default is \code{'_all'}, which means +get the mapping for all indices. Names of indices can be treated as regular expressions.} } \value{ From e392bfaca3bb11540903a9f1cd24810b8c0363af Mon Sep 17 00:00:00 2001 From: Michael Frasco Date: Fri, 28 Jul 2017 14:26:42 -0700 Subject: [PATCH 13/13] adding utils to the import section in DESCRIPTION --- r-pkg/DESCRIPTION | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/r-pkg/DESCRIPTION b/r-pkg/DESCRIPTION index 6675d48..c98d5fe 100644 --- a/r-pkg/DESCRIPTION +++ b/r-pkg/DESCRIPTION @@ -26,7 +26,8 @@ Imports: jsonlite, purrr, stringr, - uuid + uuid, + utils Suggests: knitr, testthat,