From 2845b8d63a235306c17b8858235f04a7765d1eab Mon Sep 17 00:00:00 2001
From: Anh Nguyet Vu <anngvu@gmail.com>
Date: Tue, 13 Feb 2024 14:00:11 -0700
Subject: [PATCH] Prefix pmid with "PMID:" in from_pubmed() and strip trailing whitespace

---
 R/add_publication_from_pubmed.R | 75 +++++++++++++++++----------------
 1 file changed, 38 insertions(+), 37 deletions(-)

diff --git a/R/add_publication_from_pubmed.R b/R/add_publication_from_pubmed.R
index 0b6d2397..5f80cc49 100644
--- a/R/add_publication_from_pubmed.R
+++ b/R/add_publication_from_pubmed.R
@@ -5,11 +5,11 @@
 .add_publication_from_pubmed <- function(batch = 0L, cache = batch) { # implement logging for batch?
   pmids <- new_data <- NULL
   counter <- 0L
-  function(pmid, study_id, disease_focus, manifestation, 
+  function(pmid, study_id, disease_focus, manifestation,
            publication_table_id, study_table_id, dry_run = T) {
-    
+
     .check_login()
-    
+
     counter <<- counter + 1L
     # cat("current record:", counter) # make verbose?
     # Query only for data needed, i.e. PMID to check non-dup; result can be cached
@@ -17,18 +17,18 @@
       pmids <- table_query(publication_table_id, "pmid") %>% unlist(use.names = F)
       if(cache) pmids <<- pmids
     }
-    
+
     if(pmid %in% pmids) {
       message(glue::glue("PMID:{pmid} already exists in destination table!")) # Possible that PMID needs to link to other study IDs
     } else {
-      record <- from_pubmed(pmid) 
+      record <- from_pubmed(pmid)
       if(!length(record)) return()
-      
+
       study_id_set <- glue::glue_collapse(glue::single_quote(study_id), sep = ", ")
       study <- .syn$tableQuery(glue::glue("SELECT studyId, studyName, fundingAgency FROM {study_table_id} WHERE studyId IN ({study_id_set})"))$asDataFrame()
       record <- cbind(record, diseaseFocus = I(list(disease_focus)), manifestation = I(list(manifestation)),
                       studyId = I(list(study$studyId)), studyName = I(list(study$studyName)), fundingAgency = I(list(study$fundingAgency)))
-      
+
       # If batch mode, rbind and defer table schemafication until all records processed
       if(batch) {
         new_data <<- rbind(new_data, record)
@@ -47,20 +47,20 @@
 }
 
 #' Add a publication to the publication table
-#' 
+#'
 #' Requires that the publication be in PubMed to auto-derive metadata such as authors, title, etc.
-#' In contrast, `disease_focus` and `manifestation` need to be supplemented by the curator. 
-#' The `study_id` is used to get consistent `studyName` and `fundingAgency` from study table without manual input. 
-#' 
+#' In contrast, `disease_focus` and `manifestation` need to be supplemented by the curator.
+#' The `study_id` is used to get consistent `studyName` and `fundingAgency` from study table without manual input.
+#'
 #' @param pmid PubMed ID (*not* PMCID) of the publication to be added.
-#' @param study_id Synapse id(s) of the study that are associated with the publication. 
+#' @param study_id Synapse id(s) of the study that are associated with the publication.
 #' @param disease_focus The disease focus(s) that are associated with the publication.
 #' @param manifestation The manifestation(s) that are associated with the publication.
 #' @param publication_table_id Synapse id of the portal publication table. Must have write access.
 #' @param study_table_id Synapse id of the portal study table. Need read access.
 #' @param dry_run Default = TRUE. Skips upload to table and instead prints formatted publication metadata.
 #' @return If dry_run == T, returns publication metadata to be added.
-#' @examples 
+#' @examples
 #' \dontrun{
 #' add_publication_from_pubmed(
 #'                pmid = "33574490",
@@ -74,63 +74,64 @@
 add_publication_from_pubmed <- .add_publication_from_pubmed()
 
 #' Get publication metadata from PubMed
-#' 
+#'
 #' @param pmid PubMed id.
 #' @return If PMID found, return meta as table w/ `title` `journal` `author` `year` `pmid` `doi`.
 #' @export
 from_pubmed <- function(pmid) {
-  
-  res <- easyPubMed::get_pubmed_ids(pmid) 
-  if(res$Count == 0) { 
+
+  res <- easyPubMed::get_pubmed_ids(pmid)
+  if(res$Count == 0) {
     message(glue::glue("Nothing found for PMID:{pmid}"))
-    return() # Return NULL early if no records found 
+    return() # Return NULL early if no records found
   }
-  p <- easyPubMed::fetch_pubmed_data(res, format = "xml", retmax = 1) %>% 
+  p <- easyPubMed::fetch_pubmed_data(res, format = "xml", retmax = 1) %>%
     `[[`(1) %>% xml2::read_xml() %>% xml2::as_list()
-  
-  authors <- mapply(function(author) paste(author$ForeName, author$LastName), 
-                    p$PubmedArticleSet$PubmedArticle$MedlineCitation$Article$AuthorList) 
+
+  authors <- mapply(function(author) paste(author$ForeName, author$LastName),
+                    p$PubmedArticleSet$PubmedArticle$MedlineCitation$Article$AuthorList)
   journal <- tools::toTitleCase(p$PubmedArticleSet$PubmedArticle$MedlineCitation$Article$Journal$Title[[1]])
   title <- glue::glue_collapse(unlist(p$PubmedArticleSet$PubmedArticle$MedlineCitation$Article$ArticleTitle))
   doi <- paste0("https://www.doi.org/", p$PubmedArticleSet$PubmedArticle$MedlineCitation$Article$ELocationID[[1]])
-  year <- p$PubmedArticleSet$PubmedArticle$MedlineCitation$Article$Journal$JournalIssue$PubDate$Year[[1]] 
+  year <- p$PubmedArticleSet$PubmedArticle$MedlineCitation$Article$Journal$JournalIssue$PubDate$Year[[1]]
   if(is.null(year)) { # when not available resort to ArticleDate with note (often doesn't matter)
    year <- p$PubmedArticleSet$PubmedArticle$MedlineCitation$Article$ArticleDate$Year[[1]]
    message(glue::glue("Note: Using article year for PMID:{pmid} because of missing journal meta. Review and modify if needed."))
   }
-  
+
   record <- data.frame(title = title, journal = journal, author = I(list(authors)),
-                       year = year, pmid = pmid, doi = doi)
+                       year = year, pmid = glue::glue("PMID:{pmid}"), doi = doi)
   return(record)
 }
 
 
 #' Add a batch of publications from spreadsheet
-#' 
+#'
 #' @inheritParams add_publication_from_pubmed
-#' @param file Spreadsheet (.csv/.tsv) with pubs to add should have `pmid`, `studyId`, `diseaseFocus`, `manifestation`. 
+#' @param file Spreadsheet (.csv/.tsv) of publications to add; it should have the columns `pmid`, `studyId`, `diseaseFocus`, `manifestation`.
 #' `pmid` is one per row and unique, rest can be `list_sep` vals.
 #' @param list_sep Delimiter character used to separate list columns.
 #' @import data.table
 #' @export
-add_publications_from_file <- function(file, 
-                                       publication_table_id, study_table_id, 
+add_publications_from_file <- function(file,
+                                       publication_table_id,
+                                       study_table_id,
                                        list_sep = "|", dry_run = TRUE) {
-  
+
   pubs <- fread(file, colClasses = "character")
   n <- nrow(pubs)
   for(col in c("studyId", "diseaseFocus", "manifestation")) {
-    pubs[[col]] <- strsplit(pubs[[col]], split = list_sep, fixed = TRUE) 
+    pubs[[col]] <- strsplit(pubs[[col]], split = list_sep, fixed = TRUE)
   }
-  
+
   add_pub <- .add_publication_from_pubmed(batch = n)
   for(i in 1:n) {
-    new_pubs <- add_pub(pmid = pubs$pmid[i], 
-                        study_id = pubs$studyId[[i]], 
-                        disease_focus = pubs$diseaseFocus[[i]], 
-                        manifestation = pubs$manifestation[[i]], 
+    new_pubs <- add_pub(pmid = pubs$pmid[i],
+                        study_id = pubs$studyId[[i]],
+                        disease_focus = pubs$diseaseFocus[[i]],
+                        manifestation = pubs$manifestation[[i]],
                         publication_table_id = publication_table_id,
-                        study_table_id = study_table_id, 
+                        study_table_id = study_table_id,
                         dry_run = dry_run)
   }
   new_pubs