Use jsonlite::fromJSON instead of rjson::fromJSON

The former knows how to turn tabular data into data frames so I no longer have to do it myself. It's also slightly more efficient. Thanks to Marcel Ramos for the suggestion.
Bioconductor · Mar 29, 2024 · f82b560 · f82b560
1 parent aabc641
commit f82b560
Show file tree

Hide file tree

Showing 5 changed files with 98 additions and 64 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -10,12 +10,12 @@ biocViews: Infrastructure, GenomeAssembly, Annotation, GenomeAnnotation,
 	DataImport
 URL: https://bioconductor.org/packages/UCSC.utils
 BugReports: https://github.com/Bioconductor/UCSC.utils/issues
-Version: 0.99.3
+Version: 0.99.4
 License: Artistic-2.0
 Encoding: UTF-8
 Authors@R: person("Hervé", "Pagès", role=c("aut", "cre"),
 	          email="[email protected]")
-Imports: methods, stats, httr, rjson, S4Vectors
+Imports: methods, stats, httr, jsonlite, S4Vectors
 Suggests: DBI, RMariaDB, GenomeInfoDb, testthat, knitr, rmarkdown, BiocStyle
 VignetteBuilder: knitr
 Collate: 00utils.R

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,7 +1,7 @@
 import(methods)
 importFrom(stats, setNames)
 importFrom(httr, GET, content, user_agent, stop_for_status)
-importFrom(rjson, fromJSON)
+importFrom(jsonlite, fromJSON)
 
 importFrom(S4Vectors, wmsg, isTRUEorFALSE, isSingleNumber,
                       isSingleString, isSingleStringOrNA,

diff --git a/R/00utils.R b/R/00utils.R
@@ -9,10 +9,84 @@ load_package_gracefully <- function(package, ...)
              "\n\n    BiocManager::install(\"", package, "\")")
 }
 
-lossless_num_to_int <- function(x)
+
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### make_data_frame_from_list_of_rows()
+###
+
+.lossless_num_to_int <- function(x)
 {
     stopifnot(is.numeric(x))
     y <- suppressWarnings(as.integer(x))
     if (identical(as.numeric(y), x)) y else x
 }
 
+### Looks like tabular data in JSON is usually row-oriented (i.e. one list
+### element per row, each row itself being represented by a named list of
+### length-1 atomic vectors), instead of column-oriented. At least that's
+### how UCSC's /getData/track endpoint returns their track data.
+### Soooo inefficient!
+make_data_frame_from_list_of_rows <- function(list_of_rows)
+{
+    ## Turn a row-oriented table into a data frame.
+    ##
+    ## 'list_of_rows' must be an unnamed list with one element per row,
+    ## each row being a named list of cells (atomic vectors of length 1,
+    ## or NULL/zero-length for a missing value, which becomes NA).
+    ## Numeric columns that can be converted to integer without loss are
+    ## returned as integer columns. An empty 'list_of_rows' produces a 0x0
+    ## data frame and a warning. An error is raised if the rows don't all
+    ## have the same length or if a cell has a length > 1.
+
+    stopifnot(is.list(list_of_rows), is.null(names(list_of_rows)))
+    if (length(list_of_rows) == 0L) {
+        ## Happens for example with
+        ## fetch_UCSC_track_data("eboVir3", "unipAliSwissprot").
+        warning(wmsg("track is empty ==> returning a 0x0 data frame"))
+        return(data.frame())
+    }
+
+    ## Turn list of rows into list of columns (transposition).
+    ##
+    ## Implementation history (speedups as reported by the author):
+    ## - 1st: one sapply() over 'list_of_rows' per column, extracting
+    ##   'row[[colname]]' from each row.
+    ## - 2nd: 'do.call(rbind, list_of_rows)' to obtain a matrix of type
+    ##   list with 'length(list_of_rows)' rows, then unlist() each of its
+    ##   columns. About 3x faster than the 1st.
+    ## - 3rd (below): unlist() all the rows into one flat list, then
+    ##   reshape it with matrix(). Even slightly faster than the 2nd.
+    ## The 2nd and 3rd implementations assume that all list elements in
+    ## 'list_of_rows' are ordered the same i.e. have the same names in the
+    ## same order. Only their lengths are checked.
+
+    ans_nrow <- length(list_of_rows)
+    ans_ncol <- length(list_of_rows[[1L]])
+    ans_colnames <- names(list_of_rows[[1L]])
+    stopifnot(all(lengths(list_of_rows) == ans_ncol))
+    ## Turn 'list_of_rows' into a matrix of type list with
+    ## 'ans_ncol' rows and 'length(list_of_rows)' cols.
+    unlisted <- unlist(list_of_rows, recursive=FALSE, use.names=FALSE)
+    m <- matrix(unlisted, nrow=ans_ncol)
+    list_of_cols <- lapply(setNames(seq_len(ans_ncol), ans_colnames),
+        function(i) {
+            cells <- m[i, ]
+            ## unlist() drops NULL cells (e.g. JSON null), which would
+            ## shorten the column and let as.data.frame() recycle it
+            ## into the wrong rows. Replace them with NAs first.
+            cells[lengths(cells) == 0L] <- list(NA)
+            col <- unlist(cells, recursive=FALSE, use.names=FALSE)
+            ## A longer column means that some cells have a length > 1.
+            if (length(col) != ans_nrow)
+                stop(wmsg("cells in column ", i, " of the table ",
+                          "are not all of length 0 or 1"))
+            if (is.numeric(col))
+                col <- .lossless_num_to_int(col)
+            col
+        }
+    )
+
+    ## 'check.names=FALSE' keeps the column names as found in the rows.
+    as.data.frame(list_of_cols, check.names=FALSE)
+}
+
diff --git a/R/REST_API.R b/R/REST_API.R
@@ -13,17 +13,18 @@
     ## will silently return an NA (see ?httr::content). This happens for
     ## example with the following query:
     ##   query <- list(genome="eboVir3", track="iedbBcell")
-    ##  .API_query("getData/track", query=query)
+    ##   .API_query("getData/track", query=query)
     ## This query returns a response with bytes 233 (\xe9) and 246 (\xf6)
     ## in response$content. These bytes cause the call to content() below
     ## to silently return an NA.
-    text <- content(response, as="text", encoding="UTF-8")
-    stopifnot(is.character(text), length(text) == 1L)
-    if (is.na(text)) {
-        text <- content(response, as="text", encoding="Windows-1252")
-        stopifnot(isSingleString(text))
+    json_string <- content(response, as="text", encoding="UTF-8")
+    stopifnot(is.character(json_string), length(json_string) == 1L)
+    if (is.na(json_string)) {
+        ## Second attempt: decode the same bytes as Windows-1252.
+        json_string <- content(response, as="text", encoding="Windows-1252")
+        stopifnot(isSingleString(json_string))
     }
-    parsed_json <- fromJSON(text)
+    parsed_json <- jsonlite::fromJSON(json_string)
     ## Sanity checks.
     stopifnot(is.list(parsed_json), !is.null(names(parsed_json)))
     parsed_json

diff --git a/R/fetch_UCSC_track_data.R b/R/fetch_UCSC_track_data.R
@@ -7,52 +7,12 @@
 ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 ### .extract_table_data_from_parsed_json()
 ###
+### WARNING: For JSON parsed with rjson::fromJSON()!
+###
 ### Lots of massaging and sanity checks to deal with the messiness of UCSC's
-### JSON!
+### JSON.
 ###
 
-### Quite chockingly, the list-based representation of a table as returned
-### by the /getData/track endpoint is row-oriented (i.e. one list element
-### per row) instead of column-oriented! Not a very efficient way to
-### represent a table in JSON :-/
-.make_data_frame_from_list_of_rows <- function(list_of_rows)
-{
-    stopifnot(is.list(list_of_rows), is.null(names(list_of_rows)))
-    if (length(list_of_rows) == 0L) {
-        ## Happens for example with
-        ## fetch_UCSC_track_data("eboVir3", "unipAliSwissprot").
-        warning(wmsg("track is empty ==> returning a 0x0 data frame"))
-        return(data.frame())
-    }
-
-    ## Turn list of rows into list of columns (transposition).
-
-    #ans_colnames <- names(list_of_rows[[1L]])
-    #list_of_cols <- lapply(setNames(ans_colnames, ans_colnames),
-    #    function(colname) {
-    #        col <- sapply(list_of_rows, function(row) row[[colname]],
-    #                      USE.NAMES=FALSE)
-    #        if (is.numeric(col))
-    #            col <- lossless_num_to_int(col)
-    #        col
-    #    }
-    #)
-
-    ### About 3x faster than the above!
-    m <- do.call(rbind, list_of_rows)  # a matrix of type list
-    ans_colnames <- colnames(m)
-    list_of_cols <- lapply(setNames(ans_colnames, ans_colnames),
-        function(colname) {
-            col <- unlist(m[ , colname], recursive=FALSE, use.names=FALSE)
-            if (is.numeric(col))
-                col <- lossless_num_to_int(col)
-            col
-        }
-    )
-
-    as.data.frame(list_of_cols, check.names=FALSE)
-}
-
 .extract_table_data_from_parsed_json <- function(parsed_json, primary_table)
 {
     stopifnot(is.list(parsed_json), !is.null(names(parsed_json)))
@@ -65,7 +25,7 @@
     stopifnot(is.list(table_data))
     if (is.null(names(table_data))) {
         ## One single table.
-        ans <- .make_data_frame_from_list_of_rows(table_data)
+        ans <- make_data_frame_from_list_of_rows(table_data)
     } else {
         ## 'table_data' is a named list with the chromosome names on it. Each
         ## list element in 'table_data' is itself a list that represents a
@@ -78,7 +38,7 @@
             warning(wmsg("track is empty ==> returning a 0x0 data frame"))
             return(data.frame())
         }
-        dfs <- lapply(table_data[idx], .make_data_frame_from_list_of_rows)
+        dfs <- lapply(table_data[idx], make_data_frame_from_list_of_rows)
         ans <- do.call(rbind, unname(dfs))
     }
     stopifnot(nrow(ans) == parsed_json[["itemsReturned"]])
@@ -89,22 +49,21 @@
 ### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 ### .extract_table_data_from_parsed_json2()
 ###
+### WARNING: For JSON parsed with jsonlite::fromJSON()!
+###
 ### A slightly simpler alternative to .extract_table_data_from_parsed_json()
 ### that takes advantage of the built-in data massaging capabilities of
 ### jsonlite::fromJSON().
 ### NOT a drop-in replacement for .extract_table_data_from_parsed_json() as
 ### it first requires switching from rjson::fromJSON to jsonlite::fromJSON
 ### in internal helper .parse_json() defined in R/REST_API.R.
 ### As Marcel pointed out, unlike the former the latter recognizes JSON
-### lists that represent data tables and automatically turns them into data
+### lists that represent tabular data and automatically turns them into data
 ### frames. See https://github.com/Bioconductor/Contributions/issues/3343
 ###
-### Notes:
-### - The code below still needs to perform a little bit of data massaging
-###   and sanity checks to deal with the messiness of UCSC's JSON!
-### - There's no significant difference in performance between the
-###   jsonlite::fromJSON + .extract_table_data_from_parsed_json2 and the
-###   rjson::fromJSON + .extract_table_data_from_parsed_json solutions.
+### Note that the code below still needs to perform a little bit of data
+### massaging and sanity checks to deal with the messiness of UCSC's JSON!
+###
 
 ### The table data is either put all together in a single data frame or split
 ### into one data frame per chromosome.
@@ -154,6 +113,6 @@ fetch_UCSC_track_data <- function(genome, primary_table, api.url=UCSC.api.url())
     if (!(isSingleString(primary_table) && nzchar(primary_table)))
         stop(wmsg("'primary_table' must be a single (non-empty) string"))
     parsed_json <- API_get_track_data(genome, primary_table, api.url=api.url)
-    .extract_table_data_from_parsed_json(parsed_json, primary_table)
+    .extract_table_data_from_parsed_json2(parsed_json, primary_table)
 }