From 2845b8d63a235306c17b8858235f04a7765d1eab Mon Sep 17 00:00:00 2001 From: Anh Nguyet Vu Date: Tue, 13 Feb 2024 14:00:11 -0700 Subject: [PATCH] Update formatting of PMID --- R/add_publication_from_pubmed.R | 75 +++++++++++++++++---------------- 1 file changed, 38 insertions(+), 37 deletions(-) diff --git a/R/add_publication_from_pubmed.R b/R/add_publication_from_pubmed.R index 0b6d2397..5f80cc49 100644 --- a/R/add_publication_from_pubmed.R +++ b/R/add_publication_from_pubmed.R @@ -5,11 +5,11 @@ .add_publication_from_pubmed <- function(batch = 0L, cache = batch) { # implement logging for batch? pmids <- new_data <- NULL counter <- 0L - function(pmid, study_id, disease_focus, manifestation, + function(pmid, study_id, disease_focus, manifestation, publication_table_id, study_table_id, dry_run = T) { - + .check_login() - + counter <<- counter + 1L # cat("current record:", counter) # make verbose? # Query only for data needed, i.e. PMID to check non-dup; result can be cached @@ -17,18 +17,18 @@ pmids <- table_query(publication_table_id, "pmid") %>% unlist(use.names = F) if(cache) pmids <<- pmids } - + if(pmid %in% pmids) { message(glue::glue("PMID:{pmid} already exists in destination table!")) # Possible that PMID needs to link to other study IDs } else { - record <- from_pubmed(pmid) + record <- from_pubmed(pmid) if(!length(record)) return() - + study_id_set <- glue::glue_collapse(glue::single_quote(study_id), sep = ", ") study <- .syn$tableQuery(glue::glue("SELECT studyId, studyName, fundingAgency FROM {study_table_id} WHERE studyId IN ({study_id_set})"))$asDataFrame() record <- cbind(record, diseaseFocus = I(list(disease_focus)), manifestation = I(list(manifestation)), studyId = I(list(study$studyId)), studyName = I(list(study$studyName)), fundingAgency = I(list(study$fundingAgency))) - + # If batch mode, rbind and defer table schemafication until all records processed if(batch) { new_data <<- rbind(new_data, record) @@ -47,20 +47,20 @@ } #' Add a publication to the publication table -#' +#' #' Requires that the publication be in PubMed to auto-derive metadata such as authors, title, etc. -#' In contrast, `disease_focus` and `manifestation` need to be supplemented by the curator. -#' The `study_id` is used to get consistent `studyName` and `fundingAgency` from study table without manual input. -#' +#' In contrast, `disease_focus` and `manifestation` need to be supplemented by the curator. +#' The `study_id` is used to get consistent `studyName` and `fundingAgency` from study table without manual input. +#' #' @param pmid PubMed ID (*not* PMCID) of the publication to be added. -#' @param study_id Synapse id(s) of the study that are associated with the publication. +#' @param study_id Synapse id(s) of the study that are associated with the publication. #' @param disease_focus The disease focus(s) that are associated with the publication. #' @param manifestation The manifestation(s) that are associated with the publication. #' @param publication_table_id Synapse id of the portal publication table. Must have write access. #' @param study_table_id Synapse id of the portal study table. Need read access. #' @param dry_run Default = TRUE. Skips upload to table and instead prints formatted publication metadata. #' @return If dry_run == T, returns publication metadata to be added. -#' @examples +#' @examples #' \dontrun{ #' add_publication_from_pubmed( #' pmid = "33574490", @@ -74,63 +74,64 @@ add_publication_from_pubmed <- .add_publication_from_pubmed() #' Get publication metadata from PubMed -#' +#' #' @param pmid PubMed id. #' @return If PMID found, return meta as table w/ `title` `journal` `author` `year` `pmid` `doi`. #' @export from_pubmed <- function(pmid) { - - res <- easyPubMed::get_pubmed_ids(pmid) - if(res$Count == 0) { + + res <- easyPubMed::get_pubmed_ids(pmid) + if(res$Count == 0) { message(glue::glue("Nothing found for PMID:{pmid}")) - return() # Return NULL early if no records found + return() # Return NULL early if no records found } - p <- easyPubMed::fetch_pubmed_data(res, format = "xml", retmax = 1) %>% + p <- easyPubMed::fetch_pubmed_data(res, format = "xml", retmax = 1) %>% `[[`(1) %>% xml2::read_xml() %>% xml2::as_list() - - authors <- mapply(function(author) paste(author$ForeName, author$LastName), - p$PubmedArticleSet$PubmedArticle$MedlineCitation$Article$AuthorList) + + authors <- mapply(function(author) paste(author$ForeName, author$LastName), + p$PubmedArticleSet$PubmedArticle$MedlineCitation$Article$AuthorList) journal <- tools::toTitleCase(p$PubmedArticleSet$PubmedArticle$MedlineCitation$Article$Journal$Title[[1]]) title <- glue::glue_collapse(unlist(p$PubmedArticleSet$PubmedArticle$MedlineCitation$Article$ArticleTitle)) doi <- paste0("https://www.doi.org/", p$PubmedArticleSet$PubmedArticle$MedlineCitation$Article$ELocationID[[1]]) - year <- p$PubmedArticleSet$PubmedArticle$MedlineCitation$Article$Journal$JournalIssue$PubDate$Year[[1]] + year <- p$PubmedArticleSet$PubmedArticle$MedlineCitation$Article$Journal$JournalIssue$PubDate$Year[[1]] if(is.null(year)) { # when not available resort to ArticleDate with note (often doesn't matter) year <- p$PubmedArticleSet$PubmedArticle$MedlineCitation$Article$ArticleDate$Year[[1]] message(glue::glue("Note: Using article year for PMID:{pmid} because of missing journal meta. Review and modify if needed.")) } - + record <- data.frame(title = title, journal = journal, author = I(list(authors)), - year = year, pmid = pmid, doi = doi) + year = year, pmid = glue::glue("PMID:{pmid}"), doi = doi) return(record) } #' Add a batch of publications from spreadsheet -#' +#' #' @inheritParams add_publication_from_pubmed -#' @param file Spreadsheet (.csv/.tsv) with pubs to add should have `pmid`, `studyId`, `diseaseFocus`, `manifestation`. +#' @param file Spreadsheet (.csv/.tsv) with pubs to add should have `pmid`, `studyId`, `diseaseFocus`, `manifestation`. #' `pmid` is one per row and unique, rest can be `list_sep` vals. #' @param list_sep Delimiter character used to separate list columns. #' @import data.table #' @export -add_publications_from_file <- function(file, - publication_table_id, study_table_id, +add_publications_from_file <- function(file, + publication_table_id, + study_table_id, list_sep = "|", dry_run = TRUE) { - + pubs <- fread(file, colClasses = "character") n <- nrow(pubs) for(col in c("studyId", "diseaseFocus", "manifestation")) { - pubs[[col]] <- strsplit(pubs[[col]], split = list_sep, fixed = TRUE) + pubs[[col]] <- strsplit(pubs[[col]], split = list_sep, fixed = TRUE) } - + add_pub <- .add_publication_from_pubmed(batch = n) for(i in 1:n) { - new_pubs <- add_pub(pmid = pubs$pmid[i], - study_id = pubs$studyId[[i]], - disease_focus = pubs$diseaseFocus[[i]], - manifestation = pubs$manifestation[[i]], + new_pubs <- add_pub(pmid = pubs$pmid[i], + study_id = pubs$studyId[[i]], + disease_focus = pubs$diseaseFocus[[i]], + manifestation = pubs$manifestation[[i]], publication_table_id = publication_table_id, - study_table_id = study_table_id, + study_table_id = study_table_id, dry_run = dry_run) } new_pubs