diff --git a/DESCRIPTION b/DESCRIPTION index c8be4c7e..ec3968af 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: poldis Type: Package Title: Analyse Political Texts -Version: 0.1.1 -Date: 2024-07-17 +Version: 0.1.2 +Date: 2024-08-08 Authors@R: c(person(given = "Henrique", family = "Sposito", @@ -46,8 +46,9 @@ Suggests: pdftools, fmsb, ggplot2, - tm -RoxygenNote: 7.3.1 + tm, + cli +RoxygenNote: 7.3.2 Encoding: UTF-8 LazyData: True Depends: diff --git a/NEWS.md b/NEWS.md index ffd1ddfa..86bb76c6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,16 @@ +# poldis 0.1.2 + +2024-08-08 + +## Package + +- Updated documentation for `get_urgency()` to include more details about urgency scores. + +## Functions + +- Updated `get_urgency()` and `gather_topics()` to open the urgency or topics codebooks when no arguments are declared. +- Updated urgency scores in `get_urgency()` according to survey results. + # poldis 0.1.1 2024-07-17 diff --git a/R/priorities.R b/R/priorities.R index 3cfaf98f..8ec73992 100644 --- a/R/priorities.R +++ b/R/priorities.R @@ -13,6 +13,8 @@ #' @importFrom dplyr mutate distinct %>% #' @return A data frame with syntax information by sentences and #' a variable identifying which of these sentences are priorities. +#' @examples +#' #select_priorities(US_News_Conferences_1960_1980[1:2,3]) #' @export select_priorities <- function(.data, na.rm = TRUE) { tags <- sentence <- lemmas <- priorities <- NULL diff --git a/R/text_tools.R b/R/text_tools.R index 4c7b8e3e..60740b5c 100644 --- a/R/text_tools.R +++ b/R/text_tools.R @@ -7,6 +7,8 @@ #' @import spacyr #' @return A data frame of individual names and the number of times they appear. #' @details The function relies on named entity recognition from NLP models. +#' @examples +#' #extract_names(US_News_Conferences_1960_1980[20, 3]) #' @export extract_names <- function(v) { ent_type <- text <- s <- NULL @@ -39,6 +41,9 @@ extract_names <- function(v) { #' @importFrom stringdist stringsimmatrix #' @return A data frame of locations and the number of times they appear. #' @details The function relies on geographical entity detection from NLP models. +#' @examples +#' #extract_locations(c("This is the United States", "This is Sao Paulo", +#' #"I was in Rio de Janeiro and Sao Paulo, then back to the United States")) #' @export extract_locations <- function(v) { v <- stringi::stri_trans_general(v, id = "Latin-ASCII") @@ -83,6 +88,8 @@ extract_title <- function(v) { #' Wrapper function for `messydates::as_messydates`. #' @param v Text vector. #' @return A vector of the dates in text. +#' @examples +#' #extract_date("Today is the twenty six of February of two thousand and twenty four") #' @export extract_date <- function(v) { thisRequires("messydates") @@ -213,6 +220,8 @@ extract_context <- function(match, v, level = "sentences", n = 1) { #' in selecting a method, please see `?quanteda.textstats::textstat_simil()`. #' @importFrom dplyr group_by summarise select %>% #' @return A matrix of similarity scores between texts. +#' @examples +#' #extract_text_similarities(US_News_Conferences_1960_1980[1:2,3]) #' @export extract_text_similarities <- function(v, comparison = "similarities", method) { thisRequires("quanteda.textstats") @@ -309,6 +318,9 @@ read_pdf <- function(path) { #' @importFrom dplyr group_by summarise ungroup %>% #' @importFrom stringr str_squish #' @return A data frame with syntax information by words or sentences in text.
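As context for the `extract_text_similarities()` example added above: the wrapper defers to `quanteda.textstats::textstat_simil()`. A minimal sketch of that underlying call, with invented example texts rather than package code:

library(quanteda)
library(quanteda.textstats)
texts <- c("We will act on climate change now.",
           "Climate change requires action soon.")
d <- dfm(tokens(texts))  # document-feature matrix
textstat_simil(d, margin = "documents", method = "cosine")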
+#' @examples +#' #annotate_text(US_News_Conferences_1960_1980[1:2, 3]) +#' #annotate_text(US_News_Conferences_1960_1980[1:2, 3], level = "sentence") #' @export annotate_text <- function(v, level = "words") { doc_id <- sentence_id <- token_id <- token <- pos <- tag <- lemma <- entity <- NULL diff --git a/R/topic.R b/R/topic.R index 21be3e31..6a0f4766 100644 --- a/R/topic.R +++ b/R/topic.R @@ -4,12 +4,15 @@ #' `select_priorities()`, or text vector. #' For data frames, function will search for "text" variable. #' For priorities data frame function will search for "priorities" variable. +#' If missing, opens the webpage containing the political topics codebook. #' @param dictionary The dictionary of 20 major political topics from the #' Comparative Agendas Project (Jones et al., 2023) is used by default. #' Users can also declare a custom dictionary as a vector or a list. #' If users declare a vector, each element is treated as an independent topic. #' If users declare a list of subjects and related terms, function understands #' names as topics and words as terms. +#' For more information on how the CAP topics were adapted, please run +#' `gather_topics()` to access the political topics codebook. #' @import dplyr #' @importFrom tidyr unite #' @return A list of topics present in each text separated by commas. @@ -21,10 +24,15 @@ #' gather_topics(US_News_Conferences_1960_1980[1:5, 3], #' dictionary = list("military" = c("military", "gun", "war"), #' "development" = c("development", "interest rate", "banks"))) +#' #summary(gather_topics(US_News_Conferences_1960_1980[1:5, 3])) +#' #plot(gather_topics(US_News_Conferences_1960_1980[1:5, 3], +#' # dictionary = c("military", "development"))) #' } #' @export gather_topics <- function(.data, dictionary = "CAP") { Words <- topics <- NULL + # tries to open topic codebook if no argument is declared + if (missing(.data)) open_codebook(codebook = "topic") # get text variable if (inherits(.data, "priorities")) { text <- stats::na.omit(.clean_token(getElement(.data, "priorities"))) @@ -81,6 +89,13 @@ #' “Keyword-Assisted Topic Models.” #' _American Journal of Political Science_, 68(2): 730-750. #' \doi{10.1111/ajps.12779} +#' @examples +#' #gather_related_terms(US_News_Conferences_1960_1980[1:5, 3], dictionary = "CAP") +#' #gather_related_terms(US_News_Conferences_1960_1980[1:5, 3], +#' # dictionary = c("military", "development")) +#' #gather_related_terms(US_News_Conferences_1960_1980[1:5, 3], +#' # dictionary = list("military" = c("military", "gun", "war"), +#' # "development" = c("development", "interest rate", "banks"))) #' @export gather_related_terms <- function(.data, dictionary) { Words <- NULL diff --git a/R/urgency.R b/R/urgency.R index 158ad02f..df62543f 100644 --- a/R/urgency.R +++ b/R/urgency.R @@ -4,20 +4,60 @@ #' `select_priorities()`, or text vector. #' For data frames, function will search for "text" variable. #' For priorities data frame function will search for "priorities" variable. +#' If missing, opens the webpage containing the urgency codebook. #' @param normalize Would you like urgency scores to be normalized? #' By default, urgency scores are normalized by "tokens", #' the number of words in each text observation. #' Users can also declare "none", for no normalization. +#' Since dictionaries for each dimension of urgency have a slightly different +#' number of words, scores for each dimension are adjusted +#' by the number of words in each dictionary by default.
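To make the "tokens" normalization described in the `normalize` parameter concrete, a toy illustration with invented numbers (not the package internals): the four dimension scores of a sentence are summed and divided by its word count.

# invented dimension scores for one 12-word sentence
scores <- c(Frequency = 0.4, Timing = 1.0, Intensity = 0.8, Commitment = 0.6)
n_tokens <- 12
sum(scores) / n_tokens  # normalized urgency score, about 0.23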
+#' @details +#' Urgency in political discourses is an expression of how necessary and/or +#' how soon an action should be undertaken or completed. +#' This is measured along four dimensions, +#' two related to necessity and two related to timing. +#' The first two dimensions, degree of intensity and degree of commitment, +#' relate to the necessity of taking the action, while the next two dimensions, +#' frequency of action and timing of action, +#' relate to the timing in which action is taken. +#' Our dictionary includes terms in each of these dimensions. +#' The terms included in each of these dimensions of urgency were first adapted +#' from established lexicon dictionaries and later complemented by other terms +#' inductively found in texts during pre-testing. +#' The words in the dictionaries for each dimension are scored on a scale +#' between 0 and 1, with 1 being the maximum value obtainable and contributing +#' the most to the urgency score of the sentence. +#' Urgency terms were validated and adjusted through an online survey +#' with 206 participants that took place between July and August of 2024. +#' The survey collected responses anonymously but included basic demographic +#' information about participants, such as English proficiency and education levels. +#' The survey results were recorded as counts of the number of participants +#' who said a certain randomly selected urgency-related word was more urgent +#' than another randomly selected urgency-related word. +#' To analyze the survey results, we employed Bradley-Terry models for +#' paired comparisons using the BradleyTerry2 R package (Turner and Firth, 2012). +#' This allowed us to obtain a ranking of the words for each dimension of urgency. +#' The rankings were then used to adjust and validate urgency word scores +#' in the dictionaries. +#' For more information on the dimensions, scores, or the survey on urgency, +#' please run `get_urgency()` to access the urgency codebook. #' @return A scored data frame for each dimension of urgency.
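The survey analysis described in the details above can be outlined with {BradleyTerry2} as follows. The words and counts here are invented for illustration; only the modelling pattern reflects the description (pairwise "more urgent" judgements fit with `BTm()`, abilities ranked with `BTabilities()`):

library(BradleyTerry2)
words <- c("immediately", "soon", "eventually")  # hypothetical urgency terms
# wins1 = participants judging word1 more urgent than word2 (invented counts)
pairs <- data.frame(
  word1 = factor(c("immediately", "immediately", "soon"), levels = words),
  word2 = factor(c("soon", "eventually", "eventually"), levels = words),
  wins1 = c(15, 19, 12),
  wins2 = c(5, 1, 8))
fit <- BTm(outcome = cbind(wins1, wins2), player1 = word1, player2 = word2,
           data = pairs, id = "word")
sort(BTabilities(fit)[, "ability"], decreasing = TRUE)  # higher = more urgent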
#' @import dplyr #' @examples #' \donttest{ #' get_urgency(US_News_Conferences_1960_1980[1:10, 3]) #' get_urgency(US_News_Conferences_1960_1980[1:10,]) +#' #get_urgency(select_priorities(US_News_Conferences_1960_1980[1:2, 3])) +#' #summary(get_urgency(US_News_Conferences_1960_1980[1:10, 3])) +#' #plot(get_urgency(US_News_Conferences_1960_1980[1:10, 3])) +#' #get_urgency() #' } #' @export get_urgency <- function(.data, normalize = "tokens") { Frequency <- Timing <- Commitment <- Intensity <- Urgency <- text_clean <- NULL + # tries to open urgency codebook if no argument is declared + if (missing(.data)) open_codebook(codebook = "urgency") # get text variable if (inherits(.data, "priorities")) { text_clean <- getElement(.data, "priorities") @@ -26,10 +66,10 @@ get_urgency <- function(.data, normalize = "tokens") { } else text_clean <- .data # assign urgency dimensions out <- data.frame("text_clean" = .clean_token(text_clean)) %>% - dplyr::mutate(Frequency = .assign_frequencies(text_clean), - Timing = .assign_timing(text_clean), - Intensity = .assign_intensity(text_clean), - Commitment = .assign_commitment(text_clean)) + dplyr::mutate(Frequency = .assign_frequencies(text_clean)*1.2, #61 terms + Timing = .assign_timing(text_clean)*1.3, #41 terms + Intensity = .assign_intensity(text_clean), #98 terms + Commitment = .assign_commitment(text_clean)*1.1) #85 terms if (normalize == "tokens") { out <- out %>% dplyr::mutate(Urgency = (Frequency + Timing + Intensity + Commitment)/ @@ -44,54 +84,54 @@ get_urgency <- function(.data, normalize = "tokens") { } .assign_frequencies <- function(v) { - frequency <- score_frequency <- NULL - freq_words <- urgency_word_scores[,1:2] %>% + frequency <- score_frequency_scaled <- NULL + freq_words <- urgency_word_scores[,c(5,8)] %>% tidyr::drop_na() %>% dplyr::mutate(frequency = textstem::lemmatize_words(frequency)) %>% - dplyr::distinct() %>% # 61 terms - dplyr::group_by(score_frequency) %>% + dplyr::distinct() %>% + dplyr::group_by(score_frequency_scaled) %>% summarise(terms = paste0(frequency, collapse = "|")) rowSums(do.call("cbind", lapply(seq_len(nrow(freq_words)), function(i) stringr::str_count(as.character(v), paste0("\\b", freq_words$terms[i], "\\b"))* - freq_words$score_frequency[i]))) + freq_words$score_frequency_scaled[i]))) } .assign_timing <- function(v) { - timing <- score_timing <- NULL - timing_words <- urgency_word_scores[,3:4] %>% + timing <- score_timing_scaled <- NULL + timing_words <- urgency_word_scores[,c(1,4)] %>% tidyr::drop_na() %>% dplyr::mutate(timing = textstem::lemmatize_words(timing)) %>% dplyr::distinct() %>% # 41 terms - dplyr::group_by(score_timing) %>% + dplyr::group_by(score_timing_scaled) %>% summarise(terms = paste0(timing, collapse = "|")) rowSums(do.call("cbind", lapply(seq_len(nrow(timing_words)), function(i) stringr::str_count(as.character(v), paste0("\\b", timing_words$terms[i], "\\b"))* - timing_words$score_timing[i]))) + timing_words$score_timing_scaled[i]))) } .assign_intensity <- function(v) { - intensity <- score_intensity <- NULL - intensity_words <- urgency_word_scores[,5:6] %>% + intensity <- score_intensity_scaled <- NULL + intensity_words <- urgency_word_scores[,c(13, 16)] %>% tidyr::drop_na() %>% dplyr::mutate(intensity = textstem::lemmatize_words(intensity)) %>% - dplyr::distinct() %>% # 103 terms - dplyr::group_by(score_intensity) %>% + dplyr::distinct() %>% + dplyr::group_by(score_intensity_scaled) %>% summarise(terms = paste0(intensity, collapse = "|")) rowSums(do.call("cbind", 
lapply(seq_len(nrow(intensity_words)), function(i) stringr::str_count(as.character(v), paste0("\\b", intensity_words$terms[i], "\\b"))* - intensity_words$score_intensity[i]))) + intensity_words$score_intensity_scaled[i]))) } .assign_commitment <- function(v) { - commitment <- score_commitment <- NULL - commitment_words <- urgency_word_scores[,7:8] %>% + commitment <- score_commitment_scaled <- NULL + commitment_words <- urgency_word_scores[,c(9,12)] %>% tidyr::drop_na() %>% dplyr::mutate(commitment = textstem::lemmatize_words(commitment)) %>% - dplyr::distinct() %>% # 78 terms - dplyr::group_by(score_commitment) %>% + dplyr::distinct() %>% + dplyr::group_by(score_commitment_scaled) %>% summarise(terms = paste0(commitment, collapse = "|")) rowSums(do.call("cbind", lapply(seq_len(nrow(commitment_words)), function(i) stringr::str_count(as.character(v), paste0("\\b", commitment_words$terms[i], "\\b"))* - commitment_words$score_commitment[i]))) + commitment_words$score_commitment_scaled[i]))) } diff --git a/R/utils.R b/R/utils.R index b0599f36..f8d0f514 100644 --- a/R/utils.R +++ b/R/utils.R @@ -13,3 +13,30 @@ thisRequires <- function(pkgname){ .clean_token <- function(v) { textstem::lemmatize_words(stringr::str_squish(tolower(v))) } + +# Helper function to open the urgency or topics codebook. +open_codebook <- function(codebook = "urgency") { + thisRequires("cli") + url <- "https://github.com/henriquesposito/poldis/inst" + if (codebook == "urgency") { + tryCatch({ + utils::browseURL(paste0(url, "/urgency_codebok.pdf"), + browser = getOption("browser"), encodeIfNeeded = FALSE) + }, error = function(e) { + message(paste0("Unable to open the codebook. Please visit: ", + cli::style_hyperlink(paste0(url, "/urgency_codebok.pdf"), + paste0(url, "/urgency_codebok.pdf")))) + }) + } else if (codebook == "topic") { + tryCatch({ + utils::browseURL(paste0(url, "/topic.pdf"), browser = getOption("browser"), encodeIfNeeded = FALSE) + }, error = function(e) { + message(paste0("Unable to open the codebook. Please visit: ", + cli::style_hyperlink(paste0(url, "/topic.pdf"), + paste0(url, "/topic.pdf")))) + }) + } else { + message(paste0("Codebook not found. To see the available codebooks, please visit: ", + cli::style_hyperlink(url, url))) + } +} diff --git a/man/annotate_text.Rd b/man/annotate_text.Rd index ef8c108c..4ba1d1b0 100644 --- a/man/annotate_text.Rd +++ b/man/annotate_text.Rd @@ -19,3 +19,7 @@ A data frame with syntax information by words or sentences in text. \description{ This function relies on `\{spacyr\}` NLP parsing to annotate texts. } +\examples{ +#annotate_text(US_News_Conferences_1960_1980[1:2, 3]) +#annotate_text(US_News_Conferences_1960_1980[1:2, 3], level = "sentence") +} diff --git a/man/extract_date.Rd b/man/extract_date.Rd index f985f130..5c231f00 100644 --- a/man/extract_date.Rd +++ b/man/extract_date.Rd @@ -15,3 +15,6 @@ A vector of the dates in text. \description{ Wrapper function for `messydates::as_messydates`. } +\examples{ +#extract_date("Today is the twenty six of February of two thousand and twenty four") +} diff --git a/man/extract_locations.Rd b/man/extract_locations.Rd index 3b204ba0..d19bcc82 100644 --- a/man/extract_locations.Rd +++ b/man/extract_locations.Rd @@ -18,3 +18,7 @@ Extract locations from strings \details{ The function relies on geographical entity detection from NLP models.
} +\examples{ +#extract_locations(c("This is the United States", "This is Sao Paulo", +#"I was in Rio de Janeiro and Sao Paulo, then back to the United States")) +} diff --git a/man/extract_names.Rd b/man/extract_names.Rd index 2d6080cd..4aad9fe1 100644 --- a/man/extract_names.Rd +++ b/man/extract_names.Rd @@ -18,3 +18,6 @@ Extract a list of possible names of individuals in texts \details{ The function relies on named entity recognition from NLP models. } +\examples{ +#extract_names(US_News_Conferences_1960_1980[20, 3]) +} diff --git a/man/extract_text_similarities.Rd b/man/extract_text_similarities.Rd index 82a55eca..00579e9a 100644 --- a/man/extract_text_similarities.Rd +++ b/man/extract_text_similarities.Rd @@ -30,3 +30,6 @@ A matrix of similarity scores between texts. \description{ Extract similarities and differences in texts/segments } +\examples{ +#extract_text_similarities(US_News_Conferences_1960_1980[1:2,3]) +} diff --git a/man/gather_related_terms.Rd b/man/gather_related_terms.Rd index c4a39f09..038d22c7 100644 --- a/man/gather_related_terms.Rd +++ b/man/gather_related_terms.Rd @@ -30,6 +30,14 @@ This function relies on keyword-assisted topic models implemented in the `\{keyATM\}` package to find related words based on the topics provided and texts in which they appear. } +\examples{ +#gather_related_terms(US_News_Conferences_1960_1980[1:5, 3], dictionary = "CAP") +#gather_related_terms(US_News_Conferences_1960_1980[1:5, 3], +# dictionary = c("military", "development")) +#gather_related_terms(US_News_Conferences_1960_1980[1:5, 3], +# dictionary = list("military" = c("military", "gun", "war"), +# "development" = c("development", "interest rate", "banks"))) +} \references{ Eshima S, Imai K, and Sasaki T. 2024. “Keyword-Assisted Topic Models.” diff --git a/man/gather_topics.Rd b/man/gather_topics.Rd index f357925d..3eac8177 100644 --- a/man/gather_topics.Rd +++ b/man/gather_topics.Rd @@ -10,14 +10,17 @@ gather_topics(.data, dictionary = "CAP") \item{.data}{A data frame, priorities data frame coded using `select_priorities()`, or text vector. For data frames, function will search for "text" variable. -For priorities data frame function will search for "priorities" variable.} +For priorities data frame function will search for "priorities" variable. +If missing, opens the webpage containing the political topics codebook.} \item{dictionary}{The dictionary of 20 major political topics from the Comparative Agendas Project (Jones et al., 2023) is used by default. Users can also declare a custom dictionary as a vector or a list. If users declare a vector, each element is treated as an independent topic. If users declare a list of subjects and related terms, function understands -names as topic and words as terms.} +names as topics and words as terms. +For more information on how the CAP topics were adapted, please run +`gather_topics()` to access the political topics codebook.} } \value{ A list of topics present in each text separated by commas.
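For intuition about how a named list dictionary behaves in `gather_topics()`, a rough sketch of boundary-based term matching; this is a simplification with invented texts, not the package's implementation:

library(stringr)
dictionary <- list(military = c("military", "gun", "war"),
                   development = c("development", "interest rate", "banks"))
texts <- c("The war requires more military spending.",
           "Banks lowered the interest rate to support development.")
# a text gets a topic when any of the topic's terms matches on word boundaries
topics <- lapply(texts, function(t)
  names(dictionary)[sapply(dictionary, function(terms)
    any(str_detect(tolower(t), paste0("\\b", terms, "\\b"))))])
sapply(topics, paste, collapse = ", ")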
@@ -33,5 +36,8 @@ gather_topics(US_News_Conferences_1960_1980[1:5, 3], dictionary = list("military" = c("military", "gun", "war"), "development" = c("development", "interest rate", "banks"))) +#summary(gather_topics(US_News_Conferences_1960_1980[1:5, 3])) +#plot(gather_topics(US_News_Conferences_1960_1980[1:5, 3], +# dictionary = c("military", "development"))) } } diff --git a/man/get_urgency.Rd b/man/get_urgency.Rd index 3686b06d..48926a47 100644 --- a/man/get_urgency.Rd +++ b/man/get_urgency.Rd @@ -10,12 +10,16 @@ get_urgency(.data, normalize = "tokens") \item{.data}{A data frame, priorities data frame coded using `select_priorities()`, or text vector. For data frames, function will search for "text" variable. -For priorities data frame function will search for "priorities" variable.} +For priorities data frame function will search for "priorities" variable. +If missing, opens the webpage containing the urgency codebook.} \item{normalize}{Would you like urgency scores to be normalized? By default, urgency scores are normalized by "tokens", the number of words in each text observation. -Users can also declare "none", for no normalization.} +Users can also declare "none", for no normalization. +Since dictionaries for each dimension of urgency have a slightly different +number of words, scores for each dimension are adjusted +by the number of words in each dictionary by default.} } \value{ A scored data frame for each dimension of urgency. @@ -23,9 +27,44 @@ A scored data frame for each dimension of urgency. \description{ Urgency Analysis } +\details{ +Urgency in political discourses is an expression of how necessary and/or +how soon an action should be undertaken or completed. +This is measured along four dimensions, +two related to necessity and two related to timing. +The first two dimensions, degree of intensity and degree of commitment, +relate to the necessity of taking the action, while the next two dimensions, +frequency of action and timing of action, +relate to the timing in which action is taken. +Our dictionary includes terms in each of these dimensions. +The terms included in each of these dimensions of urgency were first adapted +from established lexicon dictionaries and later complemented by other terms +inductively found in texts during pre-testing. +The words in the dictionaries for each dimension are scored on a scale +between 0 and 1, with 1 being the maximum value obtainable and contributing +the most to the urgency score of the sentence. +Urgency terms were validated and adjusted through an online survey +with 206 participants that took place between July and August of 2024. +The survey collected responses anonymously but included basic demographic +information about participants, such as English proficiency and education levels. +The survey results were recorded as counts of the number of participants +who said a certain randomly selected urgency-related word was more urgent +than another randomly selected urgency-related word. +To analyze the survey results, we employed Bradley-Terry models for +paired comparisons using the BradleyTerry2 R package (Turner and Firth, 2012). +This allowed us to obtain a ranking of the words for each dimension of urgency. +The rankings were then used to adjust and validate urgency word scores +in the dictionaries. +For more information on the dimensions, scores, or the survey on urgency, +please run `get_urgency()` to access the urgency codebook.
+} \examples{ \donttest{ get_urgency(US_News_Conferences_1960_1980[1:10, 3]) get_urgency(US_News_Conferences_1960_1980[1:10,]) +#get_urgency(select_priorities(US_News_Conferences_1960_1980[1:2, 3])) +#summary(get_urgency(US_News_Conferences_1960_1980[1:10, 3])) +#plot(get_urgency(US_News_Conferences_1960_1980[1:10, 3])) +#get_urgency() } } diff --git a/man/select_priorities.Rd b/man/select_priorities.Rd index 0fd7ec6f..0374f58a 100644 --- a/man/select_priorities.Rd +++ b/man/select_priorities.Rd @@ -24,3 +24,6 @@ a variable identifying which of these sentences are priorities. Political priorities are statements in which actors express their intent or commitment to take political action in the future. } +\examples{ +#select_priorities(US_News_Conferences_1960_1980[1:2,3]) +} diff --git a/tests/testthat/test_urgency.R b/tests/testthat/test_urgency.R index ea93ea05..3fc0030f 100644 --- a/tests/testthat/test_urgency.R +++ b/tests/testthat/test_urgency.R @@ -6,8 +6,9 @@ text <- c("We must implement these measures to limit our carbon emissions now.", urgency <- get_urgency(text) test_that("Urgency is scored properly", { - expect_true(urgency$Frequency[5] > 0.01) - expect_true(urgency$Timing[1] > 0.01) + expect_true(urgency$Frequency[5] > 1) + expect_true(urgency$Timing[1] > 1) expect_true(urgency$Intensity[4] > 1) - expect_true(all(urgency$Commitment == c(1, 0.5, 1, 1, 1))) + expect_true(urgency$Commitment[2] < urgency$Commitment[3]) + expect_true(all(order(urgency$Urgency) == c(3, 2, 4, 5, 1))) })
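The rescaled expectations in the updated test above reflect the new scaled word scores and dimension weights. As a closing illustration, a toy version of the scoring pattern the `.assign_*()` helpers use; the terms, scores, and weight below are invented:

library(stringr)
texts <- c("We must act now.", "We may act eventually.")
dict <- data.frame(term = c("must", "now", "may", "eventually"),
                   score = c(1.0, 0.9, 0.3, 0.1))  # invented 0-1 scores
# count word-boundary matches per term, weight by its score, sum per text
raw <- rowSums(sapply(seq_len(nrow(dict)), function(i)
  str_count(tolower(texts), paste0("\\b", dict$term[i], "\\b")) * dict$score[i]))
weighted <- raw * 1.2  # illustrative dimension weight, cf. the 1.1-1.3 factors
weighted / str_count(texts, "\\S+")  # normalize by token count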