Skip to content

Commit

Permalink
Use jsonlite::fromJSON instead of rjson::fromJSON
Browse files Browse the repository at this point in the history
The former knows how to turn tabular data into data frames so I no longer
have to do it myself. It's also slightly more efficient.

Thanks to Marcel Ramos for the suggestion.
  • Loading branch information
hpages committed Mar 29, 2024
1 parent aabc641 commit f82b560
Show file tree
Hide file tree
Showing 5 changed files with 98 additions and 64 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,12 @@ biocViews: Infrastructure, GenomeAssembly, Annotation, GenomeAnnotation,
DataImport
URL: https://bioconductor.org/packages/UCSC.utils
BugReports: https://github.com/Bioconductor/UCSC.utils/issues
Version: 0.99.3
Version: 0.99.4
License: Artistic-2.0
Encoding: UTF-8
Authors@R: person("Hervé", "Pagès", role=c("aut", "cre"),
email="[email protected]")
Imports: methods, stats, httr, rjson, S4Vectors
Imports: methods, stats, httr, jsonlite, S4Vectors
Suggests: DBI, RMariaDB, GenomeInfoDb, testthat, knitr, rmarkdown, BiocStyle
VignetteBuilder: knitr
Collate: 00utils.R
Expand Down
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import(methods)
importFrom(stats, setNames)
importFrom(httr, GET, content, user_agent, stop_for_status)
importFrom(rjson, fromJSON)
importFrom(jsonlite, fromJSON)

importFrom(S4Vectors, wmsg, isTRUEorFALSE, isSingleNumber,
isSingleString, isSingleStringOrNA,
Expand Down
76 changes: 75 additions & 1 deletion R/00utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,84 @@ load_package_gracefully <- function(package, ...)
"\n\n BiocManager::install(\"", package, "\")")
}

lossless_num_to_int <- function(x)

### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### make_data_frame_from_list_of_rows()
###

### Coerce numeric vector 'x' to integer, but only when the coercion loses
### no information. If converting to integer and back to numeric does not
### reproduce 'x' exactly (fractional values, values that do not fit in an
### integer, 'x' already of type integer, etc.) then 'x' is returned as-is.
.lossless_num_to_int <- function(x)
{
    stopifnot(is.numeric(x))
    ## Out-of-range values trigger a coercion warning and become NA. We
    ## don't care about the warning because the round trip check below
    ## rejects the conversion in that case.
    as_int <- suppressWarnings(as.integer(x))
    if (!identical(as.numeric(as_int), x))
        return(x)
    as_int
}

### Looks like tabular data in JSON is usually row-oriented (i.e. one list
### element per row, each row itself being represented by a named list of
### length-1 atomic vectors), instead of column-oriented. At least that's
### how UCSC's /getData/track endpoint returns their track data.
### Soooo inefficient!
### Turns an unnamed list of rows into a data frame with one row per list
### element. The column names and the number of columns are taken from the
### first row, and all rows must have that same length. Rows are assumed to
### carry the same fields in the same order (this is NOT checked beyond the
### length check). Numeric columns are converted to integer when this can
### be done without loss.
### An empty list produces a warning and a 0x0 data frame.
make_data_frame_from_list_of_rows <- function(list_of_rows)
{
    stopifnot(is.list(list_of_rows), is.null(names(list_of_rows)))
    if (length(list_of_rows) == 0L) {
        ## Happens for example with
        ##   fetch_UCSC_track_data("eboVir3", "unipAliSwissprot").
        warning(wmsg("track is empty ==> returning a 0x0 data frame"))
        return(data.frame())
    }

    ## The list of rows must be transposed into a list of columns.
    ## Approaches tried so far:
    ##   1. Extract each column by name with sapply() over the rows.
    ##   2. do.call(rbind, list_of_rows) to obtain a list matrix with one
    ##      row per record, then unlist each matrix column. About 3x
    ##      faster than 1.
    ##   3. (Used below.) Flatten all the rows into a single list, reshape
    ##      it as a list matrix with one row per field, then unlist each
    ##      matrix row. Slightly faster than 2.
    ## Approaches 2 and 3 both rely on the fields being in the same order
    ## in all the rows.
    first_row <- list_of_rows[[1L]]
    ans_ncol <- length(first_row)
    ans_colnames <- names(first_row)
    stopifnot(all(lengths(list_of_rows) == ans_ncol))

    ## 'm' is a matrix of type list with 'ans_ncol' rows and
    ## 'length(list_of_rows)' cols.
    cells <- unlist(list_of_rows, recursive=FALSE, use.names=FALSE)
    m <- matrix(cells, nrow=ans_ncol)

    ## Collapses the i-th row of 'm' into an atomic vector.
    extract_col <- function(i) {
        col <- unlist(m[i, ], recursive=FALSE, use.names=FALSE)
        if (is.numeric(col)) .lossless_num_to_int(col) else col
    }
    col_idx <- setNames(seq_len(ans_ncol), ans_colnames)
    list_of_cols <- lapply(col_idx, extract_col)

    as.data.frame(list_of_cols, check.names=FALSE)
}

15 changes: 8 additions & 7 deletions R/REST_API.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,18 @@
## will silently return an NA (see ?httr::content). This happens for
## example with the following query:
## query <- list(genome="eboVir3", track="iedbBcell")
## .API_query("getData/track", query=query)
## .API_query("getData/track", query=query)
## This query returns a response with bytes 233 (\xe9) and 246 (\xf6)
## in response$content. These bytes cause the call to content() below
## to silently return an NA.
text <- content(response, as="text", encoding="UTF-8")
stopifnot(is.character(text), length(text) == 1L)
if (is.na(text)) {
text <- content(response, as="text", encoding="Windows-1252")
stopifnot(isSingleString(text))
## Try to decode the response body as UTF-8 first. If content() returns
## NA, fall back to Windows-1252.
json_string <- content(response, as="text", encoding="UTF-8")
stopifnot(is.character(json_string), length(json_string) == 1L)
if (is.na(json_string)) {
    ## Note that 'as' must remain "text" here: it selects the kind of
    ## output returned by content(), it is not the name of the variable
    ## that receives the result.
    json_string <- content(response, as="text",
                           encoding="Windows-1252")
    stopifnot(isSingleString(json_string))
}
parsed_json <- fromJSON(text)
parsed_json <- jsonlite::fromJSON(json_string)
## Sanity checks.
stopifnot(is.list(parsed_json), !is.null(names(parsed_json)))
parsed_json
Expand Down
65 changes: 12 additions & 53 deletions R/fetch_UCSC_track_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,52 +7,12 @@
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .extract_table_data_from_parsed_json()
###
### WARNING: For JSON parsed with rjson::fromJSON()!
###
### Lots of massaging and sanity checks to deal with the messiness of UCSC's
### JSON!
### JSON.
###

### Quite shockingly, the list-based representation of a table as returned
### by the /getData/track endpoint is row-oriented (i.e. one list element
### per row) instead of column-oriented! Not a very efficient way to
### represent a table in JSON :-/
### Turns an unnamed list of rows into a data frame with one row per list
### element. Numeric columns are converted to integer when this can be done
### without loss. An empty list produces a warning and a 0x0 data frame.
.make_data_frame_from_list_of_rows <- function(list_of_rows)
{
    stopifnot(is.list(list_of_rows), is.null(names(list_of_rows)))
    if (length(list_of_rows) == 0L) {
        ## Happens for example with
        ##   fetch_UCSC_track_data("eboVir3", "unipAliSwissprot").
        warning(wmsg("track is empty ==> returning a 0x0 data frame"))
        return(data.frame())
    }

    ## The list of rows must be transposed into a list of columns.
    ## Binding the rows together with rbind() and unlisting each column
    ## of the resulting list matrix is about 3x faster than extracting
    ## each column by name with sapply() over the rows.
    row_matrix <- do.call(rbind, list_of_rows)  # a matrix of type list
    ans_colnames <- colnames(row_matrix)

    ## Collapses the column named 'colname' into an atomic vector.
    extract_col <- function(colname) {
        col <- unlist(row_matrix[ , colname], recursive=FALSE,
                      use.names=FALSE)
        if (is.numeric(col)) lossless_num_to_int(col) else col
    }
    list_of_cols <- lapply(setNames(ans_colnames, ans_colnames),
                           extract_col)

    as.data.frame(list_of_cols, check.names=FALSE)
}

.extract_table_data_from_parsed_json <- function(parsed_json, primary_table)
{
stopifnot(is.list(parsed_json), !is.null(names(parsed_json)))
Expand All @@ -65,7 +25,7 @@
stopifnot(is.list(table_data))
if (is.null(names(table_data))) {
## One single table.
ans <- .make_data_frame_from_list_of_rows(table_data)
ans <- make_data_frame_from_list_of_rows(table_data)
} else {
## 'table_data' is a named list with the chromosome names on it. Each
## list element in 'table_data' is itself a list that represents a
Expand All @@ -78,7 +38,7 @@
warning(wmsg("track is empty ==> returning a 0x0 data frame"))
return(data.frame())
}
dfs <- lapply(table_data[idx], .make_data_frame_from_list_of_rows)
dfs <- lapply(table_data[idx], make_data_frame_from_list_of_rows)
ans <- do.call(rbind, unname(dfs))
}
stopifnot(nrow(ans) == parsed_json[["itemsReturned"]])
Expand All @@ -89,22 +49,21 @@
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### .extract_table_data_from_parsed_json2()
###
### WARNING: For JSON parsed with jsonlite::fromJSON()!
###
### A slightly simpler alternative to .extract_table_data_from_parsed_json()
### that takes advantage of the built-in data massaging capabilities of
### jsonlite::fromJSON().
### NOT a drop-in replacement for .extract_table_data_from_parsed_json() as
### it first requires switching from rjson::fromJSON to jsonlite::fromJSON
### in internal helper .parse_json() defined in R/REST_API.R.
### As Marcel pointed out, unlike the former the latter recognizes JSON
### lists that represent data tables and automatically turns them into data
### lists that represent tabular data and automatically turns them into data
### frames. See https://github.com/Bioconductor/Contributions/issues/3343
###
### Notes:
### - The code below still needs to perform a little bit of data massaging
### and sanity checks to deal with the messiness of UCSC's JSON!
### - There's no significant difference in performance between the
### jsonlite::fromJSON + .extract_table_data_from_parsed_json2 and the
### rjson::fromJSON + .extract_table_data_from_parsed_json solutions.
### Note that the code below still needs to perform a little bit of data
### massaging and sanity checks to deal with the messiness of UCSC's JSON!
###

### The table data is either put all together in a single data frame or split
### into one data frame per chromosome.
Expand Down Expand Up @@ -154,6 +113,6 @@ fetch_UCSC_track_data <- function(genome, primary_table, api.url=UCSC.api.url())
if (!(isSingleString(primary_table) && nzchar(primary_table)))
stop(wmsg("'primary_table' must be a single (non-empty) string"))
parsed_json <- API_get_track_data(genome, primary_table, api.url=api.url)
.extract_table_data_from_parsed_json(parsed_json, primary_table)
.extract_table_data_from_parsed_json2(parsed_json, primary_table)
}

0 comments on commit f82b560

Please sign in to comment.