From 95c7183f389888893b00a4cd2943cf152b05421a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Herv=C3=A9=20Pag=C3=A8s?= Date: Fri, 22 Mar 2024 11:44:37 -0700 Subject: [PATCH] Set user agent to "Bioconductor UCSC.utils" when querying UCSC REST API --- DESCRIPTION | 11 ++++- NAMESPACE | 2 +- R/REST_API.R | 115 ++++++++++++++++++++++++++++++++++++++++++++++ R/UCSC.api.url.R | 116 ----------------------------------------------- 4 files changed, 126 insertions(+), 118 deletions(-) create mode 100644 R/REST_API.R diff --git a/DESCRIPTION b/DESCRIPTION index 5d7b8ef..844f629 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -10,7 +10,7 @@ biocViews: Infrastructure, GenomeAssembly, Annotation, GenomeAnnotation, DataImport URL: https://bioconductor.org/packages/UCSC.utils BugReports: https://github.com/Bioconductor/UCSC.utils/issues -Version: 0.99.1 +Version: 0.99.2 License: Artistic-2.0 Encoding: UTF-8 Authors@R: person("Hervé", "Pagès", role=c("aut", "cre"), @@ -18,3 +18,12 @@ Authors@R: person("Hervé", "Pagès", role=c("aut", "cre"), Imports: methods, stats, httr, rjson, S4Vectors Suggests: DBI, RMariaDB, GenomeInfoDb, testthat, knitr, rmarkdown, BiocStyle VignetteBuilder: knitr +Collate: 00utils.R + UCSC.api.url.R + REST_API.R + list_UCSC_genomes.R + get_UCSC_chrom_sizes.R + list_UCSC_tracks.R + fetch_UCSC_track_data.R + UCSC_dbselect.R + zzz.R diff --git a/NAMESPACE b/NAMESPACE index c37da34..df5bf64 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,6 @@ import(methods) importFrom(stats, setNames) -importFrom(httr, GET, content) +importFrom(httr, GET, content, user_agent) importFrom(rjson, fromJSON) importFrom(S4Vectors, wmsg, isTRUEorFALSE, isSingleNumber, diff --git a/R/REST_API.R b/R/REST_API.R new file mode 100644 index 0000000..7d0670e --- /dev/null +++ b/R/REST_API.R @@ -0,0 +1,115 @@ +### ========================================================================= +### Thin R wrappers to UCSC REST API endpoints +### ------------------------------------------------------------------------- +### +### Nothing in this file is exported. +### + + +.API_query <- function(endpoint, query=list(), api.url=UCSC.api.url()) +{ + stopifnot(isSingleString(endpoint), nzchar(endpoint), + is.list(query), + isSingleString(api.url), nzchar(api.url)) + if (length(query) != 0L) + stopifnot(!is.null(names(query))) + url <- paste0(api.url, "/", endpoint) + GET(url, user_agent("Bioconductor UCSC.utils"), query=query) +} + +.parse_json <- function(response) +{ + parsed_json <- fromJSON(content(response, as="text", encoding="UTF-8")) + ## Sanity checks. + stopifnot(is.list(parsed_json), !is.null(names(parsed_json))) + parsed_json +} + + +### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +### We only support the following endpoints at the moment: +### - /list/ucscGenomes +### - /list/chromosomes +### - /list/tracks +### - /getData/track +### See https://genome.ucsc.edu/goldenPath/help/api.html#Endpoint for the +### full list of endpoints. +### +### All the functions below return parsed JSON. +### + +### Endpoint /list/ucscGenomes +API_list_genomes <- function(api.url=UCSC.api.url()) +{ + endpoint <- "list/ucscGenomes" + response <- .API_query(endpoint, api.url=api.url) + if (response[["status_code"]] != 200L) + stop(wmsg("failed to get list of UCSC genomes from ", api.url)) + + ans <- .parse_json(response)[["ucscGenomes"]] + ## Sanity check. + stopifnot(is.list(ans)) + ans +} + +### Endpoint /list/chromosomes +API_list_chromosomes <- function(genome, api.url=UCSC.api.url()) +{ + stopifnot(isSingleString(genome), nzchar(genome)) + + endpoint <- "list/chromosomes" + query <- list(genome=genome) + response <- .API_query(endpoint, query=query, api.url=api.url) + if (response[["status_code"]] != 200L) + stop(wmsg(genome, ": unknown UCSC genome ", + "(or ", api.url, " is down?)")) + + ans <- .parse_json(response) + ## Sanity check. + stopifnot(identical(ans[["genome"]], genome)) + ans +} + +### Endpoint /list/tracks +API_list_tracks <- function(genome, api.url=UCSC.api.url()) +{ + stopifnot(isSingleString(genome), nzchar(genome)) + + endpoint <- "list/tracks" + query <- list(genome=genome) + response <- .API_query(endpoint, query=query, api.url=api.url) + if (response[["status_code"]] != 200L) + stop(wmsg(genome, ": unknown UCSC genome ", + "(or ", api.url, " is down?)")) + + ans <- .parse_json(response)[[genome]] + ## Sanity check. + stopifnot(is.list(ans)) + ans +} + +### Endpoint /getData/track +### Note that the endpoint expects the supplied 'track' argument to be the +### name of the track's primary table rather than the track's name. +### E.g. "catLiftOffGenesV1" rather than "CAT/Liftoff Genes". +API_get_track_data <- function(genome, primary_table, api.url=UCSC.api.url()) +{ + stopifnot(isSingleString(genome), nzchar(genome), + isSingleString(primary_table), nzchar(primary_table)) + + endpoint <- "getData/track" + query <- list(genome=genome, track=primary_table) + response <- .API_query(endpoint, query=query, api.url=api.url) + if (response[["status_code"]] != 200L) + stop(wmsg(genome, "/", primary_table, ": ", + "unknown UCSC genome/primary_table ", + "(or ", api.url, " is down?)")) + + ans <- .parse_json(response) + ## Sanity checks. + stopifnot(identical(ans[["genome"]], genome)) + if (!is.null(ans[["track"]])) + stopifnot(identical(ans[["track"]], primary_table)) + ans +} + diff --git a/R/UCSC.api.url.R b/R/UCSC.api.url.R index 5fcee49..9d47b87 100644 --- a/R/UCSC.api.url.R +++ b/R/UCSC.api.url.R @@ -4,7 +4,6 @@ ### -### Exported. UCSC.api.url <- function(new_url=NULL) { ans <- getOption("UCSC.api.url") @@ -18,118 +17,3 @@ UCSC.api.url <- function(new_url=NULL) invisible(ans) # return old URL invisibly } - -### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -### Helpers for the thin endpoint wrappers defined below -### - -.API_query <- function(endpoint, query=list(), api.url=UCSC.api.url()) -{ - stopifnot(isSingleString(endpoint), nzchar(endpoint), - is.list(query), - isSingleString(api.url), nzchar(api.url)) - if (length(query) != 0L) - stopifnot(!is.null(names(query))) - url <- paste0(api.url, "/", endpoint) - GET(url, query=query) -} - -.parse_json <- function(response) -{ - parsed_json <- fromJSON(content(response, as="text", encoding="UTF-8")) - ## Sanity checks. - stopifnot(is.list(parsed_json), !is.null(names(parsed_json))) - parsed_json -} - - -### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -### Thin wrappers to UCSC REST API endpoints -### -### At the moment, we only support: -### - /list/ucscGenomes -### - /list/chromosomes -### - /list/tracks -### - /getData/track -### See https://genome.ucsc.edu/goldenPath/help/api.html#Endpoint for the -### full list of endpoint functions. -### -### All the functions below return parsed JSON. -### None of them is exported. -### - -### Endpoint /list/ucscGenomes -API_list_genomes <- function(api.url=UCSC.api.url()) -{ - endpoint <- "list/ucscGenomes" - response <- .API_query(endpoint, api.url=api.url) - if (response[["status_code"]] != 200L) - stop(wmsg("failed to get list of UCSC genomes from ", api.url)) - - ans <- .parse_json(response)[["ucscGenomes"]] - ## Sanity check. - stopifnot(is.list(ans)) - ans -} - -### Endpoint /list/chromosomes -API_list_chromosomes <- function(genome, api.url=UCSC.api.url()) -{ - stopifnot(isSingleString(genome), nzchar(genome)) - - endpoint <- "list/chromosomes" - query <- list(genome=genome) - response <- .API_query(endpoint, query=query, api.url=api.url) - if (response[["status_code"]] != 200L) - stop(wmsg(genome, ": unknown UCSC genome ", - "(or ", api.url, " is down?)")) - - ans <- .parse_json(response) - ## Sanity check. - stopifnot(identical(ans[["genome"]], genome)) - ans -} - -### Endpoint /list/tracks -API_list_tracks <- function(genome, api.url=UCSC.api.url()) -{ - stopifnot(isSingleString(genome), nzchar(genome)) - - endpoint <- "list/tracks" - query <- list(genome=genome) - response <- .API_query(endpoint, query=query, api.url=api.url) - if (response[["status_code"]] != 200L) - stop(wmsg(genome, ": unknown UCSC genome ", - "(or ", api.url, " is down?)")) - - ans <- .parse_json(response)[[genome]] - ## Sanity check. - stopifnot(is.list(ans)) - ans -} - -### Endpoint /getData/track -### Note that the endpoint expects the supplied 'track' argument to be the -### name of the track's primary table rather than the track's name. -### E.g. "catLiftOffGenesV1" rather than "CAT/Liftoff Genes". -API_get_track_data <- function(genome, primary_table, api.url=UCSC.api.url()) -{ - stopifnot(isSingleString(genome), nzchar(genome), - isSingleString(primary_table), nzchar(primary_table)) - - endpoint <- "getData/track" - query <- list(genome=genome, track=primary_table) - response <- .API_query(endpoint, query=query, api.url=api.url) - if (response[["status_code"]] != 200L) - stop(wmsg(genome, "/", primary_table, ": ", - "unknown UCSC genome/primary_table ", - "(or ", api.url, " is down?)")) - - ans <- .parse_json(response) - ## Sanity checks. - stopifnot(identical(ans[["genome"]], genome)) - if (!is.null(ans[["track"]])) - stopifnot(identical(ans[["track"]], primary_table)) - ans -} -