Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update formatting of PMID #167

Merged
merged 1 commit into from
Feb 14, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 38 additions & 37 deletions R/add_publication_from_pubmed.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,30 +5,30 @@
.add_publication_from_pubmed <- function(batch = 0L, cache = batch) { # implement logging for batch?
pmids <- new_data <- NULL
counter <- 0L
function(pmid, study_id, disease_focus, manifestation,
function(pmid, study_id, disease_focus, manifestation,
publication_table_id, study_table_id, dry_run = T) {

.check_login()

counter <<- counter + 1L
# cat("current record:", counter) # make verbose?
# Query only for data needed, i.e. PMID to check non-dup; result can be cached
if(is.null(pmids)) {
pmids <- table_query(publication_table_id, "pmid") %>% unlist(use.names = F)
if(cache) pmids <<- pmids
}

if(pmid %in% pmids) {
message(glue::glue("PMID:{pmid} already exists in destination table!")) # Possible that PMID needs to link to other study IDs
} else {
record <- from_pubmed(pmid)
record <- from_pubmed(pmid)
if(!length(record)) return()

study_id_set <- glue::glue_collapse(glue::single_quote(study_id), sep = ", ")
study <- .syn$tableQuery(glue::glue("SELECT studyId, studyName, fundingAgency FROM {study_table_id} WHERE studyId IN ({study_id_set})"))$asDataFrame()
record <- cbind(record, diseaseFocus = I(list(disease_focus)), manifestation = I(list(manifestation)),
studyId = I(list(study$studyId)), studyName = I(list(study$studyName)), fundingAgency = I(list(study$fundingAgency)))

# If batch mode, rbind and defer table schemafication until all records processed
if(batch) {
new_data <<- rbind(new_data, record)
Expand All @@ -47,20 +47,20 @@
}

#' Add a publication to the publication table
#'
#'
#' Requires that the publication be in PubMed to auto-derive metadata such as authors, title, etc.
#' In contrast, `disease_focus` and `manifestation` need to be supplemented by the curator.
#' The `study_id` is used to get consistent `studyName` and `fundingAgency` from study table without manual input.
#'
#' In contrast, `disease_focus` and `manifestation` need to be supplemented by the curator.
#' The `study_id` is used to get consistent `studyName` and `fundingAgency` from study table without manual input.
#'
#' @param pmid PubMed ID (*not* PMCID) of the publication to be added.
#' @param study_id Synapse id(s) of the study that are associated with the publication.
#' @param study_id Synapse id(s) of the study that are associated with the publication.
#' @param disease_focus The disease focus(es) associated with the publication.
#' @param manifestation The manifestation(s) that are associated with the publication.
#' @param publication_table_id Synapse id of the portal publication table. Must have write access.
#' @param study_table_id Synapse id of the portal study table. Need read access.
#' @param dry_run Default = TRUE. Skips upload to table and instead prints formatted publication metadata.
#' @return If `dry_run = TRUE`, returns the publication metadata that would be added.
#' @examples
#' @examples
#' \dontrun{
#' add_publication_from_pubmed(
#' pmid = "33574490",
Expand All @@ -74,63 +74,64 @@
add_publication_from_pubmed <- .add_publication_from_pubmed()

#' Get publication metadata from PubMed
#'
#' Looks up one PubMed record through `easyPubMed`, parses the returned XML
#' with `xml2`, and extracts the fields used for the publication table.
#'
#' @param pmid PubMed id.
#' @return If the PMID is found, a one-row `data.frame` with columns `title`,
#'   `journal`, `author` (a list column holding the author names), `year`,
#'   `pmid` (formatted as `PMID:<id>`) and `doi`. `year` and `doi` are `NA`
#'   when the record does not provide them. Returns `NULL` (with a message)
#'   when nothing is found for `pmid`.
#' @export
from_pubmed <- function(pmid) {

  res <- easyPubMed::get_pubmed_ids(pmid)
  if (res$Count == 0) {
    message(glue::glue("Nothing found for PMID:{pmid}"))
    return() # Return NULL early if no records found
  }

  p <- easyPubMed::fetch_pubmed_data(res, format = "xml", retmax = 1) %>%
    `[[`(1) %>% xml2::read_xml() %>% xml2::as_list()
  article <- p$PubmedArticleSet$PubmedArticle$MedlineCitation$Article

  # Build one display name per author entry. Group authors carry only a
  # CollectiveName, so fall back to it instead of producing an empty result
  # (which would make the overall result ragged).
  format_author <- function(author) {
    full_name <- paste(c(unlist(author$ForeName), unlist(author$LastName)),
                       collapse = " ")
    if (!nzchar(full_name)) {
      full_name <- paste(unlist(author$CollectiveName), collapse = " ")
    }
    full_name
  }
  authors <- vapply(article$AuthorList, format_author, character(1))

  journal <- tools::toTitleCase(article$Journal$Title[[1]])
  title <- glue::glue_collapse(unlist(article$ArticleTitle))

  # xml2::as_list() keeps XML attributes as R attributes, so the id type can
  # be checked; an ELocationID may be a publisher "pii" rather than a DOI.
  pick_doi <- function(nodes, attr_name) {
    for (node in nodes) {
      if (identical(attr(node, attr_name), "doi")) {
        return(paste(unlist(node), collapse = ""))
      }
    }
    NULL
  }
  doi_id <- pick_doi(article[names(article) == "ELocationID"], "EIdType")
  if (is.null(doi_id)) {
    # Fall back to the ArticleIdList in the PubmedData section
    doi_id <- pick_doi(p$PubmedArticleSet$PubmedArticle$PubmedData$ArticleIdList,
                       "IdType")
  }
  doi <- if (is.null(doi_id)) {
    NA_character_
  } else {
    paste0("https://www.doi.org/", doi_id)
  }

  year <- article$Journal$JournalIssue$PubDate$Year[[1]]
  if (is.null(year)) { # when not available resort to ArticleDate with note (often doesn't matter)
    year <- article$ArticleDate$Year[[1]]
    message(glue::glue("Note: Using article year for PMID:{pmid} because of missing journal meta. Review and modify if needed."))
  }
  # data.frame() silently drops NULL columns; keep the column with an explicit NA
  if (is.null(year)) {
    year <- NA_character_
  }

  record <- data.frame(title = title, journal = journal, author = I(list(authors)),
                       year = year, pmid = glue::glue("PMID:{pmid}"), doi = doi)
  return(record)
}


#' Add a batch of publications from spreadsheet
#'
#'
#' @inheritParams add_publication_from_pubmed
#' @param file Spreadsheet (.csv/.tsv) with pubs to add should have `pmid`, `studyId`, `diseaseFocus`, `manifestation`.
#' @param file Spreadsheet (.csv/.tsv) with pubs to add should have `pmid`, `studyId`, `diseaseFocus`, `manifestation`.
#' `pmid` is one per row and unique, rest can be `list_sep` vals.
#' @param list_sep Delimiter character used to separate list columns.
#' @import data.table
#' @export
add_publications_from_file <- function(file,
publication_table_id, study_table_id,
add_publications_from_file <- function(file,
publication_table_id,
study_table_id,
list_sep = "|", dry_run = TRUE) {

pubs <- fread(file, colClasses = "character")
n <- nrow(pubs)
for(col in c("studyId", "diseaseFocus", "manifestation")) {
pubs[[col]] <- strsplit(pubs[[col]], split = list_sep, fixed = TRUE)
pubs[[col]] <- strsplit(pubs[[col]], split = list_sep, fixed = TRUE)
}

add_pub <- .add_publication_from_pubmed(batch = n)
for(i in 1:n) {
new_pubs <- add_pub(pmid = pubs$pmid[i],
study_id = pubs$studyId[[i]],
disease_focus = pubs$diseaseFocus[[i]],
manifestation = pubs$manifestation[[i]],
new_pubs <- add_pub(pmid = pubs$pmid[i],
study_id = pubs$studyId[[i]],
disease_focus = pubs$diseaseFocus[[i]],
manifestation = pubs$manifestation[[i]],
publication_table_id = publication_table_id,
study_table_id = study_table_id,
study_table_id = study_table_id,
dry_run = dry_run)
}
new_pubs
Expand Down
Loading