Skip to content

Commit

Permalink
Merge pull request #23 from mfrasco/feature/retrieve_mapping
Browse files Browse the repository at this point in the history
Feature/retrieve mapping
  • Loading branch information
jameslamb authored Aug 8, 2017
2 parents 50934eb + 79fb751 commit 68293e1
Show file tree
Hide file tree
Showing 10 changed files with 377 additions and 3 deletions.
4 changes: 3 additions & 1 deletion r-pkg/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ Authors@R: c(
person("James", "Lamb", email = "[email protected]", role = c("aut", "cre")),
person("Nick", "Paras", email = "[email protected]", role = c("aut")),
person("Austin", "Dickey", email = "[email protected]", role = c("aut")),
person("Michael", "Frasco", email = "[email protected]", role = c("ctb")),
person("Weiwen", "Gu", email = "[email protected]", role = c("ctb")),
person("Uptake Technologies Inc.", role = c("cph")))
Maintainer: James Lamb <[email protected]>
Expand All @@ -26,7 +27,8 @@ Imports:
jsonlite,
purrr,
stringr,
uuid
uuid,
utils
Suggests:
knitr,
testthat,
Expand Down
7 changes: 7 additions & 0 deletions r-pkg/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ export(chomp_aggs)
export(chomp_hits)
export(es_search)
export(get_counts)
export(get_fields)
export(parse_date_time)
export(unpack_nested_data)
importFrom(data.table,":=")
Expand All @@ -14,9 +15,11 @@ importFrom(data.table,rbindlist)
importFrom(data.table,setcolorder)
importFrom(data.table,setkeyv)
importFrom(data.table,setnames)
importFrom(data.table,uniqueN)
importFrom(futile.logger,flog.fatal)
importFrom(futile.logger,flog.info)
importFrom(futile.logger,flog.warn)
importFrom(httr,GET)
importFrom(httr,POST)
importFrom(httr,content)
importFrom(httr,stop_for_status)
Expand All @@ -29,5 +32,9 @@ importFrom(parallel,stopCluster)
importFrom(purrr,map2)
importFrom(purrr,simplify)
importFrom(purrr,transpose)
importFrom(stringr,str_detect)
importFrom(stringr,str_extract)
importFrom(stringr,str_replace_all)
importFrom(stringr,str_split_fixed)
importFrom(utils,read.table)
importFrom(uuid,UUIDgenerate)
133 changes: 133 additions & 0 deletions r-pkg/R/elasticsearch_eda_funs.R
Original file line number Diff line number Diff line change
Expand Up @@ -114,3 +114,136 @@ get_counts <- function(field

}

#' @title Get the names and data types of the indexed fields in an index
#' @name get_fields
#' @description For a given Elasticsearch index, return the mapping from field name
#' to data type for all indexed fields.
#' @importFrom futile.logger flog.fatal flog.info
#' @importFrom httr GET content stop_for_status
#' @importFrom data.table := uniqueN
#' @param es_host A string identifying an Elasticsearch host. This should be of
#' the form \code{[transfer_protocol][hostname]:[port]}. For example,
#' \code{'http://myindex.thing.com:9200'}.
#' @param es_indices A character vector that contains the names of indices for
#' which to get mappings. Default is \code{'_all'}, which means
#' get the mapping for all indices. Names of indices can be
#' treated as regular expressions.
#' @export
#' @return A data.table containing four columns: index, type, field, and data_type
#' @examples \dontrun{
#' # get the mapping for all indexed fields in the ticket_sales and customers indices
#' mappingDT <- retrieve_mapping(es_host = "http://es.custdb.mycompany.com:9200"
#' , es_indices = c("ticket_sales", "customers"))
#' }
get_fields <- function(es_host
, es_indices = '_all'
) {

# Input checking
url <- .ValidateAndFormatHost(es_host)

# collapse character vectors into comma separated strings. If any arguments
# are NULL, create an empty string
indices <- paste(es_indices, collapse = ',')

########################## build the query ################################
if (nchar(indices) > 0) {
url <- paste(url, indices, '_mapping', sep = '/')
} else {
msg <- paste("get_fields must be passed a valid es_indices."
, "You provided", paste(es_indices, collapse = ', ')
, 'which resulted in an empty string')
futile.logger::flog.fatal(msg)
stop(msg)
}

########################## make the query ################################
futile.logger::flog.info(paste('Getting indexed fields for indices:', indices))

result <- httr::GET(url = url)
httr::stop_for_status(result)
resultContent <- httr::content(result, as = 'parsed')

######################### flatten the result ##############################
mappingDT <- .flatten_mapping(mapping = resultContent)

##################### get aliases for index names #########################
aliasDT <- .get_aliases(es_host = es_host)
if (!is.null(aliasDT)) {
lookup <- aliasDT[['alias']]
names(lookup) <- aliasDT[['index']]
mappingDT[index %in% names(lookup), index := lookup[index]]
}

# log some information about this request to the user
numFields <- nrow(mappingDT)
numIndex <- mappingDT[, data.table::uniqueN(index)]
futile.logger::flog.info(paste('Retrieved', numFields, 'fields across', numIndex, 'indices'))

return(mappingDT)
}

# [title] Flatten a mapping list of field name to data type into a data table
# [mapping] A list of json that is returned from a request to the mappings API
#' @importFrom data.table := data.table setnames
#' @importFrom stringr str_detect str_split_fixed str_replace_all
.flatten_mapping <- function(mapping) {

######################### parse the result ###############################
# flatten the list object that is returned from the query
flattened <- unlist(mapping)

# the names of the flattened object has the index, type, and field name
# however, it also has extra terms that we can use to split the name
# into three distinct parts
mappingCols <- stringr::str_split_fixed(names(flattened), '\\.(mappings|properties)\\.', n = 3)

# convert to data table and add the data type column
mappingDT <- data.table::data.table(meta = mappingCols, data_type = as.character(flattened))
newColNames <- c('index', 'type', 'field', 'data_type')
data.table::setnames(mappingDT, old = names(mappingDT), new = newColNames)

# remove any rows, where the field does not end in ".type" to remove meta info
mappingDT <- mappingDT[stringr::str_detect(field, '\\.type$')]

# mappings in nested objects have sub-fields called properties
# mappings of fields that are indexed in different ways have multiple fields
# we want to remove these terms from the field name
metaRegEx <- '\\.(properties|fields|type)'
mappingDT[, field := stringr::str_replace_all(field, metaRegEx, '')]

return(mappingDT)
}

# [title] Get a data.table containing names of indices and aliases
# [es_host] A string identifying an Elasticsearch host.
#' @importFrom httr content GET stop_for_status
.get_aliases <- function(es_host) {

# construct the url to the alias endpoint
url <- paste0(es_host, '/_cat/aliases')

# make the request
result <- httr::GET(url = url)
httr::stop_for_status(result)
resultContent <- httr::content(result, as = 'text')

if (is.null(resultContent)) {
# there are no aliases in this Elasticsearch cluster
return(NULL)
} else {
return(.process_alias(alias_string = resultContent))
}
}

# [title] Process the string returned by the GET alias API into a data table
# [alias_string] A string returned by the alias API with index and alias name
#' @importFrom data.table data.table
#' @importFrom utils read.table
.process_alias <- function(alias_string) {
# process the string provided by the /_cat/aliases API into a data.frame and then a data.table
aliasDT <- data.table::data.table(utils::read.table(text = alias_string, stringsAsFactors = FALSE))

# return only the first two columns
return(aliasDT[, .(alias = V1, index = V2)])
}
1 change: 0 additions & 1 deletion r-pkg/R/elasticsearch_parsers.R
Original file line number Diff line number Diff line change
Expand Up @@ -1048,7 +1048,6 @@ es_search <- function(es_host

}


# [title] Execute a Search request against an Elasticsearch cluster
# [name] .search_request
# [description] Given a query string (JSON with valid DSL), execute a request
Expand Down
4 changes: 4 additions & 0 deletions r-pkg/R/uptasticsearch.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,8 @@
utils::globalVariables(c('.'
, '.I'
, '.id'
, 'field'
, 'index'
, 'V1'
, 'V2'
))
29 changes: 29 additions & 0 deletions r-pkg/inst/testdata/one_index_mapping.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
{
"basketball": {
"mappings": {
"players": {
"properties": {
"team": {
"type": "keyword"
},
"name": {
"properties": {
"first": {
"type": "text"
},
"last": {
"type": "text"
}
}
},
"age": {
"type": "integer"
},
"position": {
"type": "keyword"
}
}
}
}
}
}
52 changes: 52 additions & 0 deletions r-pkg/inst/testdata/two_index_mapping.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
{
"company": {
"mappings": {
"building": {
"properties": {
"id": {
"type": "long"
},
"address": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
},
"hotel": {
"mappings": {
"bed_room": {
"properties": {
"num_beds": {
"type": "integer"
},
"description": {
"type": "text"
}
}
},
"conference_room": {
"properties": {
"num_people": {
"type": "integer"
},
"purpose": {
"type": "text",
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
}
}
32 changes: 32 additions & 0 deletions r-pkg/man/get_fields.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 68293e1

Please sign in to comment.