
Commit

Merge branch 'develop' of https://github.com/henriquesposito/poldis into develop
jaeltan committed Aug 8, 2024
2 parents a927e49 + 4bdf38c commit 8dccc36
Showing 17 changed files with 218 additions and 34 deletions.
9 changes: 5 additions & 4 deletions DESCRIPTION
@@ -1,8 +1,8 @@
Package: poldis
Type: Package
Title: Analyse Political Texts
Version: 0.1.1
Date: 2024-07-17
Version: 0.1.2
Date: 2024-08-08
Authors@R:
c(person(given = "Henrique",
family = "Sposito",
@@ -46,8 +46,9 @@ Suggests:
pdftools,
fmsb,
ggplot2,
tm
RoxygenNote: 7.3.1
tm,
cli
RoxygenNote: 7.3.2
Encoding: UTF-8
LazyData: True
Depends:
13 changes: 13 additions & 0 deletions NEWS.md
@@ -1,3 +1,16 @@
# poldis 0.1.2

2024-08-08

## Package

- Updated documentation for `get_urgency()` to include more details about urgency scores

## Functions

- Updated `get_urgency()` and `gather_topics()` to open codebooks for urgency or topic when no arguments are declared
- Updated urgency scores in `get_urgency()` according to survey results

# poldis 0.1.1

2024-07-17
2 changes: 2 additions & 0 deletions R/priorities.R
@@ -13,6 +13,8 @@
#' @importFrom dplyr mutate distinct %>%
#' @return A data frame with syntax information by sentences and
#' a variable identifying which of these sentences are priorities.
#' @examples
#' #select_priorities(US_News_Conferences_1960_1980[1:2,3])
#' @export
select_priorities <- function(.data, na.rm = TRUE) {

tags <- sentence <- lemmas <- priorities <- NULL
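The `select_priorities()` example added above is commented out, presumably because the function depends on `spacyr` and a downloaded spaCy language model that may be unavailable at check time. A minimal sketch of how it might be run interactively, assuming the bundled `US_News_Conferences_1960_1980` data and a working spaCy backend:

```r
# Sketch only: assumes spacyr and an English spaCy model are installed
library(poldis)
if (requireNamespace("spacyr", quietly = TRUE)) {
  # Column 3 of the bundled data holds the speech texts
  prios <- select_priorities(US_News_Conferences_1960_1980[1:2, 3])
  head(prios$priorities)  # sentences identified as stated priorities
}
```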
12 changes: 12 additions & 0 deletions R/text_tools.R
@@ -7,6 +7,8 @@
#' @import spacyr
#' @return A data frame of individual names and the number of times they appear.
#' @details The function relies on named entity recognition from NLP models.
#' @examples
#' #extract_names(US_News_Conferences_1960_1980[20, 3])
#' @export
extract_names <- function(v) {
ent_type <- text <- s <- NULL
@@ -39,6 +41,9 @@ extract_names <- function(v) {
#' @importFrom stringdist stringsimmatrix
#' @return A data frame of locations and the number of times they appear.
#' @details The function relies on geographical entity detection from NLP models.
#' @examples
#' #extract_locations(c("This is the United States", "This is Sao Paulo",
#' #"I was in Rio de Janeiro and Sao Paulo, then back to the United States"))
#' @export
extract_locations <- function(v) {
v <- stringi::stri_trans_general(v, id = "Latin-ASCII")
@@ -83,6 +88,8 @@ extract_title <- function(v) {
#' Wrapper function for `messydates::as_messydates`.
#' @param v Text vector.
#' @return A vector of the dates in text.
#' @examples
#' #extract_date("Today is the twenty six of February of two thousand and twenty four")
#' @export
extract_date <- function(v) {
thisRequires("messydates")
@@ -213,6 +220,8 @@ extract_context <- function(match, v, level = "sentences", n = 1) {
#' in selecting a method, please see `?quanteda.textstats::textstat_simil()`.
#' @importFrom dplyr group_by summarise select %>%
#' @return A matrix of similarity scores between texts.
#' @examples
#' #extract_text_similarities(US_News_Conferences_1960_1980[1:2,3])
#' @export
extract_text_similarities <- function(v, comparison = "similarities", method) {
thisRequires("quanteda.textstats")
@@ -309,6 +318,9 @@ read_pdf <- function(path) {
#' @importFrom dplyr group_by summarise ungroup %>%
#' @importFrom stringr str_squish
#' @return A data frame with syntax information by words or sentences in text.
#' @examples
#' #annotate_text(US_News_Conferences_1960_1980[1:2, 3])
#' #annotate_text(US_News_Conferences_1960_1980[1:2, 3], level = "sentence")
#' @export
annotate_text <- function(v, level = "words") {
doc_id <- sentence_id <- token_id <- token <- pos <- tag <- lemma <- entity <- NULL
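The examples added in this file are likewise commented out, since most of these helpers depend on NLP models or suggested packages. A hedged sketch of running two of them, assuming `messydates` is installed and the spaCy backend is available for the entity-based extractors:

```r
library(poldis)

# extract_date() wraps messydates::as_messydates and needs no NLP model
if (requireNamespace("messydates", quietly = TRUE)) {
  extract_date("Today is the twenty six of February of two thousand and twenty four")
}

# extract_locations() relies on geographical entity detection from NLP models
if (requireNamespace("spacyr", quietly = TRUE)) {
  extract_locations(c("This is the United States", "This is Sao Paulo",
                      "I was in Rio de Janeiro and Sao Paulo, then back to the United States"))
}
```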
15 changes: 15 additions & 0 deletions R/topic.R
@@ -4,12 +4,15 @@
#' `select_priorities()`, or text vector.
#' For data frames, the function will search for a "text" variable.
#' For priorities data frames, the function will search for a "priorities" variable.
#' If missing, opens the webpage containing the political topics codebook.
#' @param dictionary The dictionary of 20 major political topics from the
#' Comparative Agendas Project (Jones et al., 2023) is used by default.
#' Users can also declare a custom dictionary as a vector or a list.
#' If users declare a vector, each element is treated as an independent topic.
#' If users declare a list of subjects and related terms, the function treats
#' names as topics and words as terms.
#' For more information on how the CAP topics were adapted, please run
#' `gather_topics()` to access the political topics codebook.
#' @import dplyr
#' @importFrom tidyr unite
#' @return A list of the topics present in each text, separated by commas.
@@ -21,10 +24,15 @@
#' gather_topics(US_News_Conferences_1960_1980[1:5, 3],
#' dictionary = list("military" = c("military", "gun", "war"),
#' "development" = c("development", "interest rate", "banks")))
#' #summary(gather_topics(US_News_Conferences_1960_1980[1:5, 3]))
#' #plot(gather_topics(US_News_Conferences_1960_1980[1:5, 3],
#' # dictionary = c("military", "development")))
#' }
#' @export
gather_topics <- function(.data, dictionary = "CAP") {
Words <- topics <- NULL
# tries to open topic codebook if no argument is declared
if (missing(.data)) open_codebook(codebook = "topic")
# get text variable
if (inherits(.data, "priorities")) {
text <- stats::na.omit(.clean_token(getElement(.data, "priorities")))
@@ -81,6 +89,13 @@ gather_topics <- function(.data, dictionary = "CAP") {
#' “Keyword-Assisted Topic Models.”
#' _American Journal of Political Science_, 68(2): 730-750.
#' \doi{10.1111/ajps.12779}
#' @examples
#' #gather_related_terms(US_News_Conferences_1960_1980[1:5, 3], dictionary = "CAP")
#' #gather_related_terms(US_News_Conferences_1960_1980[1:5, 3],
#' # dictionary = c("military", "development"))
#' #gather_related_terms(US_News_Conferences_1960_1980[1:5, 3],
#' # dictionary = list("military" = c("military", "gun", "war"),
#' # "development" = c("development", "interest rate", "banks")))
#' @export
gather_related_terms <- function(.data, dictionary) {
Words <- NULL
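Since the updated documentation distinguishes vector dictionaries from named-list dictionaries, a short sketch contrasting the two forms (mirroring the roxygen examples above) may help:

```r
library(poldis)

# Vector dictionary: each element is treated as an independent topic
gather_topics(US_News_Conferences_1960_1980[1:5, 3],
              dictionary = c("military", "development"))

# Named-list dictionary: names are the topics, elements are the matched terms
gather_topics(US_News_Conferences_1960_1980[1:5, 3],
              dictionary = list("military" = c("military", "gun", "war"),
                                "development" = c("development", "interest rate", "banks")))

# With no arguments, the function now opens the political topics codebook
# gather_topics()
```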
86 changes: 63 additions & 23 deletions R/urgency.R
@@ -4,20 +4,60 @@
#' `select_priorities()`, or text vector.
#' For data frames, the function will search for a "text" variable.
#' For priorities data frames, the function will search for a "priorities" variable.
#' If missing, opens the webpage containing the urgency codebook.
#' @param normalize Would you like urgency scores to be normalized?
#' By default, urgency scores are normalized by "tokens",
#' the number of words in the text observation.
#' Users can also declare "none", for no normalization.
#' Since the dictionaries for each dimension of urgency have slightly different
#' numbers of words, scores in each dimension are adjusted
#' by the number of words in each dictionary by default.
#' @details
#' Urgency in political discourses is an expression of how necessary and/or
#' how soon an action should be undertaken or completed.
#' This is measured along four dimensions,
#' two related to necessity and two related to timing.
#' The first two dimensions, degree of intensity and degree of commitment,
#' relate to the necessity of taking the action, while the next two dimensions,
#' frequency of action and timing of action,
#' relate to the timing in which action is taken.
#' Our dictionary includes terms in each of these dimensions.
#' The terms included in each of these dimensions of urgency were first adapted
#' from established lexicon dictionaries and later complemented by other terms
#' inductively found in texts during pre-testing.
#' The words in the dictionaries for each dimension are scored on a scale
#' between 0 and 1, with 1 being the maximum value obtainable and contributing
#' the most to the urgency score of the sentence.
#' Urgency terms were validated and adjusted through an online survey
#' of 206 participants conducted between July and August of 2024.
#' The survey collected responses anonymously but included basic demographic
#' information about participants, such as English proficiency and education levels.
#' The survey results were recorded as counts of the number of participants
#' who judged one randomly selected urgency-related word to be more urgent
#' than another randomly selected urgency-related word.
#' To analyze the survey results, we employed Bradley-Terry models for
#' paired comparisons using the BradleyTerry2 R package (Turner and Firth, 2012).
#' This allowed us to obtain a ranking of the words for each dimension of urgency.
#' The rankings were then used to adjust and validate the urgency word scores
#' in the dictionaries.
#' For more information on the dimensions, scores, or the survey on urgency,
#' please run `get_urgency()` to access the urgency codebook.
#' @return A scored data frame for each dimension of urgency.
#' @import dplyr
#' @examples
#' \donttest{
#' get_urgency(US_News_Conferences_1960_1980[1:10, 3])
#' get_urgency(US_News_Conferences_1960_1980[1:10,])
#' #get_urgency(select_priorities(US_News_Conferences_1960_1980[1:2, 3]))
#' #summary(get_urgency(US_News_Conferences_1960_1980[1:10, 3]))
#' #plot(get_urgency(US_News_Conferences_1960_1980[1:10, 3]))
#' #get_urgency()
#' }
#' @export
get_urgency <- function(.data, normalize = "tokens") {
Frequency <- Timing <- Commitment <- Intensity <- Urgency <- text_clean <- NULL
# tries to open urgency codebook if no argument is declared
if (missing(.data)) open_codebook(codebook = "urgency")
# get text variable
if (inherits(.data, "priorities")) {
text_clean <- getElement(.data, "priorities")
@@ -26,10 +66,10 @@ get_urgency <- function(.data, normalize = "tokens") {
} else text_clean <- .data
# assign urgency dimensions
out <- data.frame("text_clean" = .clean_token(text_clean)) %>%
dplyr::mutate(Frequency = .assign_frequencies(text_clean),
Timing = .assign_timing(text_clean),
Intensity = .assign_intensity(text_clean),
Commitment = .assign_commitment(text_clean))
dplyr::mutate(Frequency = .assign_frequencies(text_clean)*1.2, #61 terms
Timing = .assign_timing(text_clean)*1.3, #41 terms
Intensity = .assign_intensity(text_clean), #98 terms
Commitment = .assign_commitment(text_clean)*1.1) #85 terms
if (normalize == "tokens") {
out <- out %>%
dplyr::mutate(Urgency = (Frequency + Timing + Intensity + Commitment)/
@@ -44,54 +84,54 @@ get_urgency <- function(.data, normalize = "tokens") {
}

.assign_frequencies <- function(v) {
frequency <- score_frequency <- NULL
freq_words <- urgency_word_scores[,1:2] %>%
frequency <- score_frequency_scaled <- NULL
freq_words <- urgency_word_scores[,c(5,8)] %>%
tidyr::drop_na() %>%
dplyr::mutate(frequency = textstem::lemmatize_words(frequency)) %>%
dplyr::distinct() %>% # 61 terms
dplyr::group_by(score_frequency) %>%
dplyr::distinct() %>%
dplyr::group_by(score_frequency_scaled) %>%
summarise(terms = paste0(frequency, collapse = "|"))
rowSums(do.call("cbind", lapply(seq_len(nrow(freq_words)), function(i)
stringr::str_count(as.character(v), paste0("\\b", freq_words$terms[i], "\\b"))*
freq_words$score_frequency[i])))
freq_words$score_frequency_scaled[i])))
}

.assign_timing <- function(v) {
timing <- score_timing <- NULL
timing_words <- urgency_word_scores[,3:4] %>%
timing <- score_timing_scaled <- NULL
timing_words <- urgency_word_scores[,c(1,4)] %>%
tidyr::drop_na() %>%
dplyr::mutate(timing = textstem::lemmatize_words(timing)) %>%
dplyr::distinct() %>% # 41 terms
dplyr::group_by(score_timing) %>%
dplyr::group_by(score_timing_scaled) %>%
summarise(terms = paste0(timing, collapse = "|"))
rowSums(do.call("cbind", lapply(seq_len(nrow(timing_words)), function(i)
stringr::str_count(as.character(v), paste0("\\b", timing_words$terms[i], "\\b"))*
timing_words$score_timing[i])))
timing_words$score_timing_scaled[i])))
}

.assign_intensity <- function(v) {
intensity <- score_intensity <- NULL
intensity_words <- urgency_word_scores[,5:6] %>%
intensity <- score_intensity_scaled <- NULL
intensity_words <- urgency_word_scores[,c(13, 16)] %>%
tidyr::drop_na() %>%
dplyr::mutate(intensity = textstem::lemmatize_words(intensity)) %>%
dplyr::distinct() %>% # 103 terms
dplyr::group_by(score_intensity) %>%
dplyr::distinct() %>%
dplyr::group_by(score_intensity_scaled) %>%
summarise(terms = paste0(intensity, collapse = "|"))
rowSums(do.call("cbind", lapply(seq_len(nrow(intensity_words)), function(i)
stringr::str_count(as.character(v), paste0("\\b", intensity_words$terms[i], "\\b"))*
intensity_words$score_intensity[i])))
intensity_words$score_intensity_scaled[i])))
}

.assign_commitment <- function(v) {
commitment <- score_commitment <- NULL
commitment_words <- urgency_word_scores[,7:8] %>%
commitment <- score_commitment_scaled <- NULL
commitment_words <- urgency_word_scores[,c(9,12)] %>%
tidyr::drop_na() %>%
dplyr::mutate(commitment = textstem::lemmatize_words(commitment)) %>%
dplyr::distinct() %>% # 78 terms
dplyr::group_by(score_commitment) %>%
dplyr::distinct() %>%
dplyr::group_by(score_commitment_scaled) %>%
summarise(terms = paste0(commitment, collapse = "|"))
out <- list()
rowSums(do.call("cbind", lapply(seq_len(nrow(commitment_words)), function(i)
stringr::str_count(as.character(v), paste0("\\b", commitment_words$terms[i], "\\b"))*
commitment_words$score_commitment[i])))
commitment_words$score_commitment_scaled[i])))
}
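All four `.assign_*` helpers above share one pattern: group the dictionary words by score, build a regex alternation per score group, count word-boundary matches in each text, and sum the counts weighted by each group's score. The multipliers in `get_urgency()` (1.2, 1.3, 1.1) then compensate for the differing dictionary sizes noted in the inline comments. A self-contained sketch of that weighted-count logic, using an invented toy dictionary rather than the package's `urgency_word_scores` data:

```r
library(dplyr)
library(tidyr)
library(stringr)

# Hypothetical dictionary: terms with 0-1 urgency scores (illustration only)
toy_dict <- data.frame(
  timing = c("now", "immediately", "soon", "eventually"),
  score_timing_scaled = c(1, 1, 0.6, 0.2)
)

assign_toy_timing <- function(v) {
  # One regex alternation per score group, as in the .assign_* helpers
  groups <- toy_dict %>%
    drop_na() %>%
    distinct() %>%
    group_by(score_timing_scaled) %>%
    summarise(terms = paste0(timing, collapse = "|"))
  # Count word-boundary matches per group, weight by score, sum per text
  rowSums(do.call("cbind", lapply(seq_len(nrow(groups)), function(i)
    str_count(as.character(v), paste0("\\b", groups$terms[i], "\\b")) *
      groups$score_timing_scaled[i])))
}

texts <- tolower(c("We must act now, immediately.", "Eventually this may matter."))
raw <- assign_toy_timing(texts)   # weighted counts: 2.0 and 0.2
raw / str_count(texts, "\\w+")    # normalize = "tokens" divides by word counts
```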
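For the survey analysis described in the new `@details`, the ranking step could look roughly like this; the words and comparison counts below are invented for illustration and are not the actual survey data:

```r
library(BradleyTerry2)

# Hypothetical paired-comparison counts: how many participants judged
# word1 more urgent than word2, and vice versa
words <- c("now", "soon", "eventually")
survey <- data.frame(
  word1 = factor(c("now", "now", "soon"), levels = words),
  word2 = factor(c("soon", "eventually", "eventually"), levels = words),
  win1  = c(140, 180, 120),  # times word1 was judged more urgent
  win2  = c(66, 26, 86)      # times word2 was judged more urgent
)

# Bradley-Terry model for paired comparisons (Turner and Firth, 2012)
fit <- BTm(outcome = cbind(win1, win2), player1 = word1, player2 = word2,
           data = survey)

# Estimated abilities induce a ranking of the words by perceived urgency,
# the kind of ranking used to adjust the dictionary scores
BTabilities(fit)
```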
27 changes: 27 additions & 0 deletions R/utils.R
Original file line number Diff line number Diff line change
@@ -13,3 +13,30 @@ thisRequires <- function(pkgname){
.clean_token <- function(v) {
textstem::lemmatize_words(stringr::str_squish(tolower(v)))
}

# Helper function to get urgency or topics codebook.
open_codebook <- function(codebook = "urgency") {
thisRequires("cli")
url <- "https://github.com/henriquesposito/poldis/inst"
if (codebook == "urgency") {
tryCatch({
utils::browseURL(paste0(url, "/urgency_codebok.pdf"),
browser = getOption("browser"), encodeIfNeeded = FALSE)
}, error = function(e) {
message(paste0("Unable to open codebook, please visit: ",
cli::style_hyperlink(paste0(url, "/urgency_codebok.pdf"),
paste0(url, "/urgency_codebok.pdf"))))
})
} else if (codebook == "topic") {
tryCatch({
utils::browseURL(paste0(url, "/topic.pdf"), browser = getOption("browser"), encodeIfNeeded = FALSE)
}, error = function(e) {
message(paste0("Unable to open codebook, please visit: ",
cli::style_hyperlink(paste0(url, "/topic.pdf"),
paste0(url, "/topic.pdf"))))
})
} else {
message(paste0("Codebook not found, to see the available codebooks please visit: ",
cli::style_hyperlink(url, url)))
}
}
4 changes: 4 additions & 0 deletions man/annotate_text.Rd


3 changes: 3 additions & 0 deletions man/extract_date.Rd


4 changes: 4 additions & 0 deletions man/extract_locations.Rd


3 changes: 3 additions & 0 deletions man/extract_names.Rd


3 changes: 3 additions & 0 deletions man/extract_text_similarities.Rd


8 changes: 8 additions & 0 deletions man/gather_related_terms.Rd

