diff --git a/.Rbuildignore b/.Rbuildignore index 0e91fd5b..095f201b 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -11,3 +11,4 @@ ^pkgdown$ ^CRAN-SUBMISSION$ ^cran-comments\.md$ +^data_raw$ diff --git a/.gitignore b/.gitignore index 7c065559..b3bfd274 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,5 @@ .Ruserdata .DS_Store docs +CRAN-SUBMISSION +cran-comments.md diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION index 64ee2bc3..e277d7c0 100644 --- a/CRAN-SUBMISSION +++ b/CRAN-SUBMISSION @@ -1,3 +1,3 @@ -Version: 0.1.0 -Date: 2024-07-15 16:35:17 UTC -SHA: cae719a67aa2ce72724795a42738d3a75b23ec42 +Version: 0.1.1 +Date: 2024-07-17 15:48:03 UTC +SHA: bc6a443e9a211166975d8aa70f693ddf146102d1 diff --git a/DESCRIPTION b/DESCRIPTION index c8be4c7e..1b361dae 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,8 +1,8 @@ Package: poldis Type: Package Title: Analyse Political Texts -Version: 0.1.1 -Date: 2024-07-17 +Version: 0.1.2 +Date: 2024-09-03 Authors@R: c(person(given = "Henrique", family = "Sposito", @@ -46,8 +46,9 @@ Suggests: pdftools, fmsb, ggplot2, - tm -RoxygenNote: 7.3.1 + tm, + cli +RoxygenNote: 7.3.2 Encoding: UTF-8 LazyData: True Depends: diff --git a/NAMESPACE b/NAMESPACE index 118d3f02..533845b0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,6 +16,7 @@ export(gather_topics) export(get_urgency) export(read_pdf) export(select_priorities) +export(sim_urgency) export(split_text) import(dplyr) import(quanteda) @@ -36,5 +37,7 @@ importFrom(stringr,str_detect) importFrom(stringr,str_extract) importFrom(stringr,str_extract_all) importFrom(stringr,str_remove_all) +importFrom(stringr,str_replace_all) importFrom(stringr,str_squish) +importFrom(textstem,lemmatize_strings) importFrom(tidyr,unite) diff --git a/NEWS.md b/NEWS.md index ffd1ddfa..1c448b23 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,19 @@ +# poldis 0.1.2 + +2024-09-03 + +## Package + +- Updated documentation for `get_urgency()` to include more details about urgency scores +- Updated internal data and `data_raw` files to increase transparency + +## Functions + +- Updated `get_urgency()` and `gather_topics()` to open the urgency or topic codebook when no arguments are declared +- Updated urgency scores in `get_urgency()` according to survey results +- Added `sim_urgency()` function to simulate urgent priorities +- Updated `select_priorities()` to be stricter and more accurate in selecting priorities + # poldis 0.1.1 2024-07-17 diff --git a/R/priorities.R b/R/priorities.R index 3cfaf98f..351c6b7c 100644 --- a/R/priorities.R +++ b/R/priorities.R @@ -13,6 +13,8 @@ #' @importFrom dplyr mutate distinct %>% #' @return A data frame with syntax information by sentences and #' a variable identifying which of these sentences are priorities.
+#' @examples +#' #select_priorities(US_News_Conferences_1960_1980[1:2,3]) #' @export select_priorities <- function(.data, na.rm = TRUE) { tags <- sentence <- lemmas <- priorities <- NULL @@ -21,27 +23,17 @@ select_priorities <- function(.data, na.rm = TRUE) { stop("Please declare a text vector or an annotated data frame at the sentence level.") } else .data <- suppressMessages(annotate_text(.data, level = "sentences")) out <- .data %>% - dplyr::mutate(lemmas = tolower(lemmas), - priorities = ifelse(stringr::str_detect(tags, "PRP MD ")| - stringr::str_detect(lemmas, - "going to|go to |need to|ready to| - |is time to|commit to|promise to|have to| - |plan to|intend to|let 's|let us|urge| - |require|want to"), - paste(sentence), NA), # detect priorities - priorities = ifelse(stringr::str_detect(priorities, " not | - |yesterday|last week| - |last month|last year| - |thank|honor|honour| - |applause|greet|laugh| - |privilege to|great to| - |good to be|good to see") | - stringr::str_detect(tags, "MD VB( RB)? VBN| - |VBD( RB)? VBN|VBZ( RB)? VBN| - |VBD( RB)? JJ|PRP( RB)? VBD TO| - |VBN( RB)? VBN"), - # Combinations of NLP tags to select - NA, priorities)) %>% + dplyr::mutate(priorities = ifelse(stringr::str_detect(tags, "PRP MD ")| + stringr::str_detect(lemmas, paste0( + textstem::lemmatize_strings( + Commitment$word[which(Commitment$grammar_function != "adjective")]), + collapse = "|")), + lemmas, NA), # detect priorities + priorities = ifelse(stringr::str_detect(priorities, " not | never ") | + stringr::str_detect(tags, + "MD VB( RB)? VBN|VBD( RB)? VBN|VBZ( RB)? VBN| + |VBD( RB)? JJ|PRP( RB)? VBD TO|VBN( RB)? VBN"), + NA, sentence)) %>% dplyr::distinct() if (isTRUE(na.rm)) out <- filter(out, !is.na(priorities)) class(out) <- c("priorities", class(out)) diff --git a/R/sysdata.rda b/R/sysdata.rda index b4234b60..77bc7cd9 100644 Binary files a/R/sysdata.rda and b/R/sysdata.rda differ diff --git a/R/text_tools.R b/R/text_tools.R index 4c7b8e3e..5c09aff2 100644 --- a/R/text_tools.R +++ b/R/text_tools.R @@ -1,19 +1,22 @@ #' Extract a list of possible names of individuals in texts #' #' @param v A text vector. +#' @import spacyr #' @importFrom dplyr distinct filter %>% summarize group_by #' @importFrom stringr str_squish #' @importFrom stringdist stringsimmatrix -#' @import spacyr +#' @importFrom textstem lemmatize_strings #' @return A data frame of individual names and the number of times they appear. #' @details The function relies on named entity recognition from NLP models. +#' @examples +#' #extract_names(US_News_Conferences_1960_1980[20, 3]) #' @export extract_names <- function(v) { ent_type <- text <- s <- NULL suppressWarnings(spacyr::spacy_initialize(model = "en_core_web_sm")) out <- suppressWarnings(spacyr::spacy_extract_entity(v, type = "named")) %>% dplyr::filter(ent_type == "PERSON") %>% - dplyr::mutate(names = .clean_token(text)) %>% + dplyr::mutate(names = textstem::lemmatize_strings(text)) %>% dplyr::group_by(names) %>% dplyr::count(name = "count") spacyr::spacy_finalize() @@ -33,12 +36,17 @@ extract_names <- function(v) { #' Extract locations from strings #' #' @param v Text vector. +#' @import spacyr #' @importFrom stringi stri_trans_general #' @importFrom stringr str_extract #' @importFrom purrr map_chr #' @importFrom stringdist stringsimmatrix +#' @importFrom textstem lemmatize_strings #' @return A data frame of locations and the number of times they appear. #' @details The function relies on geographical entity detection from NLP models. 
+#' @examples +#' #extract_locations(c("This is the United States", "This is Sao Paulo", +#' #"I was in Rio de Janeiro and Sao Paulo, then back to the United States")) #' @export extract_locations <- function(v) { v <- stringi::stri_trans_general(v, id = "Latin-ASCII") @@ -46,7 +54,7 @@ suppressWarnings(spacyr::spacy_initialize(model = "en_core_web_sm")) out <- suppressWarnings(spacyr::spacy_extract_entity(v)) %>% dplyr::filter(ent_type == "GPE") %>% - dplyr::mutate(names = .clean_token(text)) %>% + dplyr::mutate(names = textstem::lemmatize_strings(text)) %>% dplyr::group_by(names) %>% dplyr::count(name = "count") spacyr::spacy_finalize() @@ -83,6 +91,8 @@ extract_title <- function(v) { #' Wrapper function for `messydates::as_messydates`. #' @param v Text vector. #' @return A vector of the dates in text. +#' @examples +#' #extract_date("Today is the twenty six of February of two thousand and twenty four") #' @export extract_date <- function(v) { thisRequires("messydates") @@ -213,6 +223,8 @@ extract_context <- function(match, v, level = "sentences", n = 1) { #' in selecting a method, please see `?quanteda.textstats::textstat_simil()`. #' @importFrom dplyr group_by summarise select %>% #' @return A matrix of similarity scores between texts. +#' @examples +#' #extract_text_similarities(US_News_Conferences_1960_1980[1:2,3]) #' @export extract_text_similarities <- function(v, comparison = "similarities", method) { thisRequires("quanteda.textstats") @@ -307,12 +319,16 @@ read_pdf <- function(path) { #' Defaults to "words". #' @import spacyr #' @importFrom dplyr group_by summarise ungroup %>% -#' @importFrom stringr str_squish +#' @importFrom stringr str_squish str_replace_all #' @return A data frame with syntax information by words or sentences in text. +#' @examples +#' #annotate_text(US_News_Conferences_1960_1980[1:2, 3]) +#' #annotate_text(US_News_Conferences_1960_1980[1:2, 3], level = "sentence") #' @export annotate_text <- function(v, level = "words") { doc_id <- sentence_id <- token_id <- token <- pos <- tag <- lemma <- entity <- NULL suppressWarnings(spacyr::spacy_initialize(model = "en_core_web_sm")) + v <- stringr::str_replace_all(v, "\\.\\,|\\. \\,|\\,\\.|\\, \\.|\\.\\\n\\,", ".") parse <- spacyr::spacy_parse(v, tag = TRUE) suppressWarnings(spacyr::spacy_finalize()) if (level == "sentences" | level == "sentence") { diff --git a/R/topic.R b/R/topic.R index 21be3e31..720c28b6 100644 --- a/R/topic.R +++ b/R/topic.R @@ -4,14 +4,18 @@ #' `select_priorities()`, or text vector. #' For data frames, the function will search for the "text" variable. #' For priorities data frames, the function will search for the "priorities" variable. +#' If missing, opens the webpage containing the political topics codebook. #' @param dictionary The dictionary of 20 major political topics from the #' Comparative Agendas Project (Jones et al., 2023) is used by default. #' Users can also declare a custom dictionary as a vector or a list. #' If users declare a vector, each element is treated as an independent topic. #' If users declare a list of subjects and related terms, the function understands #' names as topics and words as terms. +#' For more information on how the CAP topics were adapted, please run +#' `gather_topics()` to access the political topics codebook. #' @import dplyr #' @importFrom tidyr unite +#' @importFrom textstem lemmatize_strings #' @return A list of topics present in each text, separated by commas.
#' @examples #' \donttest{ #' gather_topics(US_News_Conferences_1960_1980[1:5, 3], #' dictionary = list("military" = c("military", "gun", "war"), #' "development" = c("development", "interest rate", "banks"))) +#' #summary(gather_topics(US_News_Conferences_1960_1980[1:5, 3])) +#' #plot(gather_topics(US_News_Conferences_1960_1980[1:5, 3], +#' # dictionary = c("military", "development"))) #' } #' @export gather_topics <- function(.data, dictionary = "CAP") { Words <- topics <- NULL + # tries to open topic codebook if no argument is declared + if (missing(.data)) open_codebook(codebook = "topic") # get text variable if (inherits(.data, "priorities")) { - text <- stats::na.omit(.clean_token(getElement(.data, "priorities"))) + text <- stats::na.omit(textstem::lemmatize_strings(getElement(.data, "priorities"))) } else if (inherits(.data, "data.frame")) { - text <- .clean_token(getElement(.data, "text")) - } else text <- .clean_token(.data) + text <- textstem::lemmatize_strings(getElement(.data, "text")) + } else text <- textstem::lemmatize_strings(.data) # get dictionary if (any(dictionary == "CAP")) { dictionary <- CAP_topics %>% - dplyr::mutate(Words = stringr::str_replace_all(.clean_token(Words), + dplyr::mutate(Words = stringr::str_replace_all(textstem::lemmatize_strings(Words), ", ", "\\\\b|\\\\b")) subjects <- dictionary$Words names(subjects) <- dictionary$Topic @@ -72,6 +81,7 @@ #' @import quanteda #' @import dplyr #' @importFrom stringr str_detect str_remove_all +#' @importFrom textstem lemmatize_strings #' @return A list of terms related to each of the topics declared in the dictionary. #' @details This function relies on keyword-assisted topic models implemented #' in the `\{keyATM\}` package to find related words based on the topics #' declared. #' @references #' Eshima S, Imai K, Sasaki T (2024). #' “Keyword-Assisted Topic Models.” #' _American Journal of Political Science_, 68(2): 730-750. #' \doi{10.1111/ajps.12779} +#' @examples +#' #gather_related_terms(US_News_Conferences_1960_1980[1:5, 3], dictionary = "CAP") +#' #gather_related_terms(US_News_Conferences_1960_1980[1:5, 3], +#' # dictionary = c("military", "development")) +#' #gather_related_terms(US_News_Conferences_1960_1980[1:5, 3], +#' # dictionary = list("military" = c("military", "gun", "war"), +#' # "development" = c("development", "interest rate", "banks"))) #' @export gather_related_terms <- function(.data, dictionary) { Words <- NULL thisRequires("keyATM") # get text variable if (inherits(.data, "priorities")) { - text <- stats::na.omit(.clean_token(getElement(.data, "priorities"))) + text <- stats::na.omit(textstem::lemmatize_strings(getElement(.data, "priorities"))) } else if (inherits(.data, "data.frame")) { - text <- .clean_token(getElement(.data, "text")) - } else text <- .clean_token(.data) + text <- textstem::lemmatize_strings(getElement(.data, "text")) + } else text <- textstem::lemmatize_strings(.data) # check dictionary if (any(dictionary == "CAP")) { subjects <- CAP_topics %>% diff --git a/R/urgency.R b/R/urgency.R index 158ad02f..c375b49f 100644 --- a/R/urgency.R +++ b/R/urgency.R @@ -4,94 +4,231 @@ #' `select_priorities()`, or text vector. #' For data frames, the function will search for the "text" variable. #' For priorities data frames, the function will search for the "priorities" variable. -#' @param normalize Would you like urgency scores to be normalized?
-#' By default, urgency scores are normalized by "tokens", -#' the number of words in text observation. -#' Users can also declare "none", for no normalization. +#' If missing, opens the webpage containing the urgency codebook. +#' @param summarise How to handle multiple matches for the same dimension +#' in the same text observation? +#' By default, multiple matches are added together and +#' their "sum" per text observation is returned. +#' Users can, instead, choose the "mean", which returns the average +#' score per dimension per text observation when there are multiple matches. +#' The "mean" can also be used as a form of normalization per +#' dimension and text observation in certain cases. +#' @details +#' Urgency in political discourses is an expression of how necessary and/or +#' how soon an action should be undertaken or completed. +#' This is measured along four dimensions, +#' two related to necessity and two related to timing. +#' The first two dimensions, degree of intensity and degree of commitment, +#' relate to the necessity of taking the action, while the next two dimensions, +#' frequency of action and timing of action, +#' relate to when action is taken. +#' Our dictionary includes terms in each of these dimensions. +#' The terms included in each of these dimensions were validated and adjusted +#' through an online survey that took place between July and August of 2024. +#' The survey results were recorded as counts of the number of participants +#' who selected an urgency-related word as more urgent than its pair. +#' To analyze the survey results, we employed Bradley-Terry models for +#' paired comparisons. +#' A ranking of the words in each dimension of urgency was obtained from the analysis +#' and then used to create the urgency word scores in the dictionaries. +#' For more information on the dimensions, scores, or the survey on urgency, +#' please run `get_urgency()` to access the urgency codebook. +#' For priorities (i.e. coded using `select_priorities()`), +#' urgency scores are calculated by multiplying the commitment scores by all +#' other dimension scores. +#' This is done because commitment words are indicative of political priorities. +#' For more information, please refer to the `select_priorities()` function. +#' For vectors or data frames, urgency scores are calculated by +#' adding commitment and intensity dimension scores (i.e. how necessary) +#' and multiplying these by the sum of timing and frequency dimension +#' scores (i.e. how soon). +#' In both cases, zero urgency scores indicate no urgency, but maximum +#' scores can vary. #' @return A scored data frame for each dimension of urgency.
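# A minimal arithmetic sketch of the scoring just described (toy dimension
# scores, not package output): for plain texts, the commitment and intensity
# scores are each incremented by one, summed, and divided by four, likewise
# timing and frequency, and the two parts are multiplied; for priorities,
# commitment is multiplied by each non-zero remaining dimension.
# commitment <- 0.8; intensity <- 1; timing <- 0.9; frequency <- 0.5
# (((1 + commitment) + (1 + intensity)) / 4) *
#   (((1 + timing) + (1 + frequency)) / 4)  # 0.95 * 0.85 = 0.8075
# 0.7 * 1.2 * 1 * 1.4  # a priority with no timing match scores 1.176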
#' @import dplyr +#' @importFrom textstem lemmatize_strings #' @examples #' \donttest{ #' get_urgency(US_News_Conferences_1960_1980[1:10, 3]) #' get_urgency(US_News_Conferences_1960_1980[1:10,]) +#' #get_urgency(select_priorities(US_News_Conferences_1960_1980[1:2, 3])) +#' #summary(get_urgency(US_News_Conferences_1960_1980[1:10, 3])) +#' #plot(get_urgency(US_News_Conferences_1960_1980[1:10, 3])) +#' #get_urgency() #' } #' @export -get_urgency <- function(.data, normalize = "tokens") { +get_urgency <- function(.data, summarise = "sum") { Frequency <- Timing <- Commitment <- Intensity <- Urgency <- text_clean <- NULL + # tries to open urgency codebook if no argument is declared + if (missing(.data)) open_codebook(codebook = "urgency") # get text variable if (inherits(.data, "priorities")) { - text_clean <- getElement(.data, "priorities") + text_clean <- getElement(.data, "lemmas") + priorities <- TRUE } else if (inherits(.data, "data.frame")) { - text_clean <- getElement(.data, "text") - } else text_clean <- .data + if ("lemmas" %in% names(.data)) text_clean <- getElement(.data, "lemmas") else + text_clean <- textstem::lemmatize_strings(getElement(.data, "text")) + priorities <- FALSE + } else { + text_clean <- textstem::lemmatize_strings(.data) + priorities <- FALSE + } # assign urgency dimensions - out <- data.frame("text_clean" = .clean_token(text_clean)) %>% - dplyr::mutate(Frequency = .assign_frequencies(text_clean), - Timing = .assign_timing(text_clean), - Intensity = .assign_intensity(text_clean), - Commitment = .assign_commitment(text_clean)) - if (normalize == "tokens") { + out <- data.frame("text_clean" = text_clean) %>% + dplyr::mutate(Frequency = .assign_urgency_dimensions(text_clean, dimension = "frequency", + summarise = summarise, priority = priorities), + Timing = .assign_urgency_dimensions(text_clean, dimension = "timing", + summarise = summarise, priority = priorities), + Intensity = .assign_urgency_dimensions(text_clean, dimension = "intensity", + summarise = summarise, priority = priorities), + Commitment = .assign_urgency_dimensions(text_clean, dimension = "commitment", + summarise = summarise, priority = priorities)) + # calculate urgency scores + if (isTRUE(priorities)) { out <- out %>% - dplyr::mutate(Urgency = (Frequency + Timing + Intensity + Commitment)/ - nchar(text_clean)) - } else if (normalize == "none") { + dplyr::mutate(Urgency = Commitment * ifelse(Intensity == 0, 1, Intensity) * + ifelse(Timing == 0, 1, Timing) * ifelse(Frequency == 0, 1, Frequency)) + } else { out <- out %>% - dplyr::mutate(Urgency = (Frequency + Timing + Intensity + Commitment)) + dplyr::mutate(Urgency = (((1 + Commitment) + (1 + Intensity))/4) * + (((1 + Timing) + (1 + Frequency))/4)) } + # bind data results to original data out <- cbind(.data, out) %>% dplyr::select(-c(text_clean)) class(out) <- c("urgency", class(out)) out } -.assign_frequencies <- function(v) { - frequency <- score_frequency <- NULL - freq_words <- urgency_word_scores[,1:2] %>% - tidyr::drop_na() %>% - dplyr::mutate(frequency = textstem::lemmatize_words(frequency)) %>% - dplyr::distinct() %>% # 61 terms - dplyr::group_by(score_frequency) %>% - summarise(terms = paste0(frequency, collapse = "|")) - rowSums(do.call("cbind", lapply(seq_len(nrow(freq_words)), function(i) - stringr::str_count(as.character(v), paste0("\\b", freq_words$terms[i], "\\b"))* - freq_words$score_frequency[i]))) -} - -.assign_timing <- function(v) { - timing <- score_timing <- NULL - timing_words <- urgency_word_scores[,3:4] %>% 
+.assign_urgency_dimensions <- function(v, dimension, summarise, priority) { + scaled <- rescaled <- word <- grammar_function <- NULL + # get dictionaries + if (dimension == "frequency") { + out <- Frequency + } else if (dimension == "timing") { + out <- Timing + } else if (dimension == "commitment") { + out <- Commitment + } else if (dimension == "intensity") { + out <- Intensity + } + # check if priorities and get correct scale and data + if (isTRUE(priority)) { # for urgency in priorities + out <- dplyr::rename(out, scale = rescaled) %>% + dplyr::select(word, scale) + } else out <- dplyr::rename(out, scale = scaled) %>% dplyr::select(word, scale) + # remove missing and group words + out <- out %>% tidyr::drop_na() %>% - dplyr::mutate(timing = textstem::lemmatize_words(timing)) %>% - dplyr::distinct() %>% # 41 terms - dplyr::group_by(score_timing) %>% - summarise(terms = paste0(timing, collapse = "|")) - rowSums(do.call("cbind", lapply(seq_len(nrow(timing_words)), function(i) - stringr::str_count(as.character(v), paste0("\\b", timing_words$terms[i], "\\b"))* - timing_words$score_timing[i]))) + dplyr::mutate(word = textstem::lemmatize_strings(word)) %>% + dplyr::distinct() %>% + dplyr::group_by(scale) %>% + summarise(terms = paste0(word, collapse = "|")) + # summarise if multiple matches per dimension per text observation + if (summarise == "sum") { + out <- rowSums(do.call("cbind", lapply(seq_len(nrow(out)), function(i) + stringr::str_count(as.character(v), paste0("\\b", out$terms[i], "\\b"))* + out$scale[i]))) # get added scores + } else if (summarise == "mean") { + rowcount <- rowSums(do.call("cbind", lapply(seq_len(nrow(out)), function(i) + stringr::str_count(as.character(v), paste0("\\b", out$terms[i], "\\b"))))) # get count + out <- rowSums(do.call("cbind", lapply(seq_len(nrow(out)), function(i) + stringr::str_count(as.character(v), paste0("\\b", out$terms[i], "\\b"))* + out$scale[i]))) # get added scores + out <- out/(replace(rowcount, rowcount == 0, 1)) # get average value + } + out } -.assign_intensity <- function(v) { - intensity <- score_intensity <- NULL - intensity_words <- urgency_word_scores[,5:6] %>% - tidyr::drop_na() %>% - dplyr::mutate(intensity = textstem::lemmatize_words(intensity)) %>% - dplyr::distinct() %>% # 103 terms - dplyr::group_by(score_intensity) %>% - summarise(terms = paste0(intensity, collapse = "|")) - rowSums(do.call("cbind", lapply(seq_len(nrow(intensity_words)), function(i) - stringr::str_count(as.character(v), paste0("\\b", intensity_words$terms[i], "\\b"))* - intensity_words$score_intensity[i]))) +#' Simulating urgency in priorities +#' +#' @param urgency Desired urgency score, optional. +#' @param commitment Desired commitment score, optional. +#' @param intensity Desired intensity score, optional. +#' @param timing Desired timing score, optional. +#' @param frequency Desired frequency score, optional. +#' @param pronoun How would you like the simulated priorities to start? +#' By default, priorities start with the pronoun "We". +#' @details +#' Users can declare a score for one or more of the +#' urgency dimensions or an urgency score. +#' Users should not, however, declare an urgency score and +#' dimension scores at once. +#' If both are declared, the urgency score is favored. +#' @return A sentence that matches the urgency or urgency dimension scores.
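# A toy sketch of the matching idea behind `sim_urgency()` (assumed words and
# scores, not the shipped dictionaries): the word whose score lies closest to
# the requested score is selected; the `which.min.diff()` helper defined below
# additionally breaks ties at random.
# scores <- c(0.2, 0.5, 0.8, 1)
# words <- c("might", "want to", "going to", "must")
# words[which.min(abs(0.6 - scores))]  # "want to", closest to 0.6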
+#' @examples +#' \donttest{ +#' sim_urgency() +#' sim_urgency(urgency = 0.5) +#' sim_urgency(urgency = 2.5) +#' sim_urgency(urgency = -2.5) +#' sim_urgency(commitment = 0.6) +#' sim_urgency(commitment = 0.6, intensity = 1.4) +#' sim_urgency(commitment = 0.6, intensity = 1.4, timing = 1.4) +#' sim_urgency(commitment = 0.6, intensity = 1.2, timing = 1.4, frequency = 1.8) +#' } +#' @export +sim_urgency <- function(urgency, + commitment, intensity, timing, frequency, + pronoun = "We") { + grammar_function <- rescaled <- NULL + # Filter data to avoid using adverbs, for now + Commitment <- subset(Commitment, !is.na(rescaled) & grepl("^verb$", grammar_function)) + # only "commitment" verbs for now + Intensity <- subset(Intensity, !is.na(rescaled)) + Timing <- subset(Timing, !is.na(rescaled)) + Frequency <- subset(Frequency, !is.na(rescaled)) + if(!missing(urgency)){ + # Either timing or frequency, for now + time_freq <- rbind(Timing, Frequency) + combins <- expand.grid(Intensity$word, Commitment$word, time_freq$word, + stringsAsFactors = FALSE) + combins <- merge(combins, Intensity, by.x = "Var1", by.y = "word") + combins <- combins[,c("Var1", "Var2", "Var3", "rescaled")] + combins <- merge(combins, Commitment, by.x = "Var2", by.y = "word") + combins <- combins[,c("Var1", "Var2", "Var3", "rescaled.x", "rescaled.y")] + combins <- merge(combins, time_freq, by.x = "Var3", by.y = "word") + combins <- combins[,c("Var1", "Var2", "Var3", "rescaled.x", + "rescaled.y", "rescaled")] + combins$combo <- as.numeric(combins$rescaled.x) * + as.numeric(combins$rescaled.y) * as.numeric(combins$rescaled) + selectd <- which.min.diff(abs(urgency), combins$combo) + formul <- combins[selectd,c("Var1", "Var2", "Var3")] + if(urgency < 0) intcom <- c(formul[1:2], sample(c("not", "never"), 1)) else + intcom <- formul[1:2] + out <- paste(pronoun, paste(intcom, collapse = " "), "do this", formul[3]) + cat("Urgency score: ", + combins[selectd,"rescaled.x"] * combins[selectd, "rescaled.y"] * + combins[selectd,"rescaled"], "\n", sep = "") + } else { + if(!missing(commitment)){ + commit <- Commitment$word[which.min.diff(abs(commitment), Commitment$rescaled)] + if(commitment < 0) commit <- paste(commit, sample(c("not","never"), 1)) + if(!missing(intensity)){ + intensifier <- Intensity$word[which.min.diff(intensity, Intensity$rescaled)] + out <- paste(pronoun, intensifier, commit, "do this") + } else out <- paste(pronoun, commit, "do this") + } else out <- paste(pronoun, "do this") + if(!missing(timing)){ + timed <- Timing$word[which.min.diff(timing, Timing$rescaled)] + out <- paste(out, timed) + } + if(!missing(frequency)){ + freq <- Frequency$word[which.min.diff(frequency, Frequency$rescaled)] + out <- paste(out, freq) + } + cat("Urgency score:", + ifelse(missing(commitment),1,commitment) * ifelse(missing(intensity),1,intensity) * + ifelse(missing(timing),1,timing) *ifelse(missing(frequency),1,frequency), + "\n") + } + out <- paste0(trimws(out), ".") + out } -.assign_commitment <- function(v) { - commitment <- score_commitment <- NULL - commitment_words <- urgency_word_scores[,7:8] %>% - tidyr::drop_na() %>% - dplyr::mutate(commitment = textstem::lemmatize_words(commitment)) %>% - dplyr::distinct() %>% # 78 terms - dplyr::group_by(score_commitment) %>% - summarise(terms = paste0(commitment, collapse = "|")) - out <- list() - rowSums(do.call("cbind", lapply(seq_len(nrow(commitment_words)), function(i) - stringr::str_count(as.character(v), paste0("\\b", commitment_words$terms[i], "\\b"))* - 
commitment_words$score_commitment[i]))) +which.min.diff <- function(x, y){ + diffs <- abs(x - y) + y <- which(diffs == min(diffs, na.rm = TRUE)) + if (length(y) > 1L) + sample(y, 1L) + else y } diff --git a/R/utils.R b/R/utils.R index b0599f36..9627be2f 100644 --- a/R/utils.R +++ b/R/utils.R @@ -10,6 +10,29 @@ thisRequires <- function(pkgname){ } } -.clean_token <- function(v) { - textstem::lemmatize_words(stringr::str_squish(tolower(v))) +# Helper function to get urgency or topics codebook. +open_codebook <- function(codebook = "urgency") { + thisRequires("cli") + url <- "https://github.com/henriquesposito/poldis/tree/develop/inst" + if (codebook == "urgency") { + tryCatch({ + utils::browseURL(paste0(url, "/urgency_codebook.pdf"), + browser = getOption("browser"), encodeIfNeeded = FALSE) + }, error = function(e) { + message(paste0("Unable to open codebook, please visit: ", + cli::style_hyperlink(paste0(url, "/urgency_codebook.pdf"), + paste0(url, "/urgency_codebook.pdf")))) + }) + } else if (codebook == "topic") { + tryCatch({ + utils::browseURL(paste0(url, "/topic.pdf"), browser = getOption("browser"), encodeIfNeeded = FALSE) + }, error = function(e) { + message(paste0("Unable to open codebook, please visit: ", + cli::style_hyperlink(paste0(url, "/topic.pdf"), + paste0(url, "/topic.pdf")))) + }) + } else { + message(paste0("Codebook not found, to see the available codebooks please visit: ", + cli::style_hyperlink(url, url))) + } } diff --git a/data_raw/comm.csv b/data_raw/comm.csv new file mode 100644 index 00000000..6f564e4c --- /dev/null +++ b/data_raw/comm.csv @@ -0,0 +1,88 @@ +word,assigned_score,coefficient,se,grammar_function +get to,0.7,,,verb +oblige to,1,-1.01,0.49,verb +need to,1,-1.04,0.43,verb +must,1,0,NA,verb +require to,1,NA,NA,verb +time to,0.8,-0.85,0.47,verb +have to,0.8,-0.92,0.41,verb +going to,0.8,-1.64,0.58,verb +ready to,0.8,-1.94,0.44,verb +will,0.8,-2.65,0.54,verb +urge to,0.8,NA,NA,verb +promise to,0.7,-2.84,0.5,verb +commit to,0.7,NA,NA,verb +intend to,0.7,NA,NA,verb +should,0.5,-2.49,0.52,verb +want to,0.5,-2.58,0.47,verb +would,0.5,-5.49,1.03,verb +can,0.5,-5.64,0.94,verb +might,0.5,-6.1,1.53,verb +could,0.5,-7.04,1.05,verb +may,0.5,-7.63,1.34,verb +shall,0.5,NA,NA,verb +plan to,0.5,NA,NA,verb +option to,0.5,NA,NA,noun +let's,0.5,-1.8,0.52,imperative +let us,0.5,NA,NA,imperative +necessarily,1,NA,NA,adverb +importantly,1,NA,NA,adverb +unavoidably,0.8,NA,NA,adverb +inevitably,0.8,NA,NA,adverb +inexorably,0.8,NA,NA,adverb +prominently,0.8,NA,NA,adverb +relevantly,0.7,NA,NA,adverb +ideally,0.5,NA,NA,adverb +preferably,0.5,NA,NA,adverb +carefully,0.5,NA,NA,adverb +plausibly,0.5,NA,NA,adverb +possibly,0.5,NA,NA,adverb +it is imperative,1,1.09,0.5,verb and adjective +serious,1,0.79,0.58,adjective +it is essential,1,0.7,0.37,verb and adjective +severe,1,0.57,0.87,adjective +it is best to,1,0.12,0.75,verb and adjective +it is important,1,-0.39,0.35,verb and adjective +it is significant,1,-1.03,0.5,verb and adjective +major,1,-1.88,1.24,adjective +it is necessary,1,0,NA,verb and adjective +obligatory,1,NA,NA,adjective +momentous,1,NA,NA,adjective +it is unavoidable,0.8,0.44,0.47,verb and adjective +it is invaluable,0.8,-0.8,0.4,verb and adjective +it is quintessential,0.8,NA,NA,verb and adjective +it is worthy,0.8,NA,NA,verb and adjective +it is valuable,0.8,NA,NA,verb and adjective +it is inevitable,0.8,NA,NA,verb and adjective +inexorable,0.8,NA,NA,adjective +it is eminent,0.8,NA,NA,verb and adjective +prominent,0.8,NA,NA,adjective +preeminent,0.8,NA,NA,adjective 
+pre-eminent,0.8,NA,NA,adjective +it is relevant,0.7,-1.6,0.6,verb and adjective +it is optimal,0.5,-0.28,0.7,verb and adjective +it is timely,0.5,-0.82,0.95,verb and adjective +it is better to,0.5,-1.28,0.86,verb and adjective +are willing to,0.5,-1.53,0.72,verb and adjective +it is opportune,0.5,-1.57,1.05,verb and adjective +cautious to,0.5,-1.69,1.04,verb and adjective +able to,0.5,-2.55,0.9,verb and adjective +it is good to,0.5,-2.04,0.52,verb and adjective +it is preferable,0.5,-2.06,0.75,verb and adjective +it is ideal,0.5,-2.06,0.49,verb and adjective +it is useful,0.5,-2.53,0.51,verb and adjective +it is great to,0.5,-2.75,0.71,verb and adjective +it is optional,0.5,-3.28,1.06,verb and adjective +it is possible,0.5,-4.59,1.04,verb and adjective +tenable,0.5,NA,NA,adjective +it is doable,0.5,NA,NA,verb and adjective +it is obvious,0.5,NA,NA,verb and adjective +useable,0.5,NA,NA,adjective +marginal,0.5,NA,NA,adjective +are careful to,0.5,NA,NA,verb and adjective +it is plausible,0.5,NA,NA,verb and adjective +it is trivial,0.2,-2.23,1.1,verb and adjective +are ambiguous to,0.2,-3.07,1.29,verb and adjective +are ambivalent to,0.2,-3.14,1.27,verb and adjective +it is insignificant,0.2,-3.69,1.08,verb and adjective +vague,0.2,NA,NA,adjective \ No newline at end of file diff --git a/data_raw/freq.csv b/data_raw/freq.csv new file mode 100644 index 00000000..060879c5 --- /dev/null +++ b/data_raw/freq.csv @@ -0,0 +1,64 @@ +word,assigned_score,coefficient,se,grammar_function +persistently,1,13.18,1873.64,adverb +relentlessly,1,-0.33,1.91,adverb +constantly,1,-0.99,1.21,adverb +incessantly,1,-1.02,1.77,adverb +always,1,0,NA,adverb +daily,1,-0.38,0.8,adverb +hourly,1,0,NA,adverb +by the minute,1,NA,NA,adverb +by the hour,1,NA,NA,adverb +every hour,1,NA,NA,adverb +every minute,1,NA,NA,adverb +persistent,1,NA,NA,adverb +relentless,1,NA,NA,adverb +unrelentingly,1,NA,NA,adverb +everyday,1,NA,NA,adverb +nightly,1,NA,NA,adverb +every night,1,NA,NA,adverb +constant,1,NA,NA,adjective +incessant,1,NA,NA,adjective +interminable,1,NA,NA,adjective +interminably,1,NA,NA,adverb +weekly,0.9,-1.65,0.65,adverb +every week,0.9,NA,NA,adverb +steadily,0.8,-2.81,1.13,adverb +generally,0.8,-3.99,1.21,adverb +usually,0.8,-4.27,1.11,adverb +fortnightly,0.8,NA,NA,adverb +steady,0.8,NA,NA,adjective +normally,0.8,NA,NA,adverb +frequently,0.7,-2.56,1.09,adverb +regularly,0.7,-2.92,1.18,adverb +often,0.7,-3.37,1.06,adverb +monthly,0.7,-3.21,0.76,adverb +frequent,0.7,NA,NA,adjective +regular,0.7,NA,NA,adjective +every month,0.7,NA,NA,adverb +incrementally,0.6,-2.6,1.8,adverb +progressively,0.6,-2.74,1.2,adverb +gradually,0.6,-4.75,1.09,adverb +quarterly,0.6,-3.38,0.91,adverb +progressive,0.6,NA,NA,adjective +gradual,0.6,NA,NA,adjective +sometimes,0.5,-6.28,1.19,adverb +annually,0.5,-5.01,1.03,adverb +yearly,0.5,NA,NA,adverb +every year,0.5,NA,NA,adverb +intermittently,0.3,-3.29,1.69,adverb +sporadically,0.3,-5.36,1.31,adverb +occasionally,0.3,-6.32,1.19,adverb +irregularly,0.3,-7.33,1.25,adverb +intermittent,0.3,NA,NA,adjective +sporadic,0.3,NA,NA,adjective +occasional,0.3,NA,NA,adjective +irregular,0.3,NA,NA,adjective +seldom,0.2,-7.13,1.14,adverb +infrequently,0.1,-6.46,1.13,adverb +infrequent,0.1,NA,NA,adjective +unusual,0.05,-5.77,1.18,adjective +rarely,0.05,-6.48,1.11,adverb +rare,0.05,NA,NA,adjective +scarcely,0.02,-5.88,1.2,adverb +hardly,0.02,-6.3,1.14,adverb +hardly ever,0.02,-6.46,1.49,adverb \ No newline at end of file diff --git a/data_raw/int.csv b/data_raw/int.csv new file mode 100644 index 00000000..9d4b3471 
--- /dev/null +++ b/data_raw/int.csv @@ -0,0 +1,105 @@ +word,assigned_score,coefficient,se,grammar_function +strongly,1,1.79,1.94,adverb +desperately,1,1.69,0.46,adverb +extremely,1,1.62,0.52,adverb +urgently,1,1.27,0.44,adverb +intensively,1,0.96,0.79,adverb +highly,1,0.83,0.34,adverb +especially,1,0.4,0.72,adverb +firmly,1,0.35,2.07,adverb +really,1,-0.36,0.24,adverb +certainly,1,-0.71,0.25,adverb +everywhere,1,-2.16,1.09,adverb +simply,1,-0.75,0.38,adverb +practically,1,-3.66,1.47,adverb +definitely,1,0,NA,adverb +maximum,1,3.04,1.52,adjective +exceptional,1,1.85,0.92,adjective +extra,1,1.15,0.46,adjective +determined,1,1,1.05,adjective +pervasive,1,0.29,0.78,adjective +extensive,1,-0.35,0.5,adjective +unmatched,1,-0.39,0.8,adjective +remarkable,1,-0.51,1.09,adjective +intense,1,-1.82,1.08,adjective +enormous,1,0,NA,adjective +extraordinary,1,NA,NA,adjective +deeply,1,NA,NA,adverb +urgent,1,NA,NA,adjective +huge,1,NA,NA,adjective +tremendous,1,NA,NA,adjective +immense,1,NA,NA,adjective +vast,1,NA,NA,adjective +ultimate,1,NA,NA,adjective +high,1,NA,NA,adjective +definite,1,NA,NA,adjective +utterly,1,NA,NA,adverb +surely,1,NA,NA,adverb +certain,1,NA,NA,adjective +vigorously,1,NA,NA,adverb +vigorous,1,NA,NA,adjective +prevalent,1,NA,NA,adjective +intensive,1,NA,NA,adjective +extensively,1,NA,NA,adverb +wide,1,NA,NA,adjective +widely,1,NA,NA,adverb +widespread,1,NA,NA,adjective +remarkably,1,NA,NA,adverb +intensely,1,NA,NA,adverb +globally,1,NA,NA,adverb +global,1,NA,NA,adjective +completely,0.6,2.27,1.57,adverb +fully,0.6,1.34,0.74,adverb +most,0.6,0.87,1.79,determiner +very,0.6,0.83,0.51,adverb +substantially,0.6,-0.26,1.26,adverb +more,0.6,-0.53,0.31,determiner +moderately,0.6,-3,0.58,adverb +above,0.6,-0.84,1.08,preposition +realistic,0.6,-0.94,0.53,adjective +insufficient,0.6,-3.2,1.74,adjective +large,0.6,NA,NA,adjective +largest,0.6,NA,NA,adjective +big,0.6,NA,NA,adjective +biggest,0.6,NA,NA,adjective +totally,0.6,NA,NA,adverb +entirely,0.6,NA,NA,adverb +absolutely,0.6,NA,NA,adverb +ambitious,0.6,NA,NA,adjective +ambitiously,0.6,NA,NA,adverb +realistically,0.6,NA,NA,adverb +substantial,0.6,NA,NA,adjective +lots,0.6,NA,NA,quantifier +much,0.6,NA,NA,quantifier +far,0.6,NA,NA,adjective +moderate,0.6,NA,NA,adjective +plenty,0.5,NA,NA,quantifier +quite,0.4,-1.47,0.55,adverb +almost,0.4,-1.98,0.51,adverb +somewhat,0.4,-3.39,1.48,adverb +clear,0.4,-0.74,0.97,adjective +least,0.4,NA,NA,determiner +less,0.4,NA,NA,determiner +clearly,0.4,NA,NA,adverb +enough,0.2,3.97,1.45,determiner +nearly,0.2,-1.34,0.54,adverb +rather,0.2,-1.4,0.39,adverb +barely,0.2,-1.53,0.89,adverb +adequately,0.2,-2.45,1.94,adverb +reasonably,0.2,-2.4,0.98,adverb +slightly,0.2,-3.76,0.89,adverb +minimal,0.2,-1.53,0.72,adjective +modest,0.2,-1.77,0.59,adjective +low,0.2,-1.88,0.75,adjective +little,0.2,-2.41,0.98,adjective +some,0.2,-3.3,1.49,determiner +average,0.2,-3.48,1.59,adjective +limited,0.2,-4.39,1.63,adjective +sufficient,0.2,NA,NA,adjective +minimum,0.2,NA,NA,adjective +near,0.2,NA,NA,adverb +reasonable,0.2,NA,NA,adjective +weakly,0.2,NA,NA,adverb +slight,0.2,NA,NA,adjective +small,0.2,NA,NA,adjective +adequate,0.2,NA,NA,adjective \ No newline at end of file diff --git a/data_raw/time.csv b/data_raw/time.csv new file mode 100644 index 00000000..3cac5e8b --- /dev/null +++ b/data_raw/time.csv @@ -0,0 +1,43 @@ +word,assigned_score,coefficient,se,grammar_function +immediately,1,-0.2,0.18,adverb +imminently,1,-0.56,0.18,adverb +first,1,-0.86,0.4,adverb +hastily,1,-2.01,0.62,adverb +now,1,0,NA,adverb 
+tonight,1,-0.38,0.35,adverb +today,1,0,NA,adverb +imminent,1,NA,NA,adjective +hasty,1,NA,NA,adjective +immediate,1,NA,NA,adjective +promptly,0.9,-2.08,0.28,adverb +as soon as possible,0.9,-2.08,0.28,adverbial phrase +tomorrow,0.9,-2.32,0.32,adverb +prompt,0.9,NA,NA,adjective +quickly,0.8,-1.6,0.27,adverb +rapidly,0.8,-1.63,0.47,adverb +fast,0.8,-2.11,0.63,adverb +shortly,0.8,-2.69,0.36,adverb +speedily,0.8,-2.89,0.48,adverb +next,0.8,-2.91,0.3,adverb +this week,0.8,-2.24,0.41,adverbial phrase +quick,0.8,NA,NA,adjective +rapid,0.8,NA,NA,adjective +short,0.8,NA,NA,adjective +speedy,0.8,NA,NA,adjective +before,0.7,-0.56,0.46,conjunction +early,0.7,-2.6,0.27,adverb +next week,0.7,-3.47,0.39,adverbial phrase +earlier,0.6,NA,NA,adverb +this year,0.5,-4.96,0.43,adverbial phrase +soon(?! as possible),0.5,-3.03,0.25,adverb +afterwards,0.4,-4.19,0.63,adverb +after,0.4,-4.36,1.1,conjunction +long,0.2,-5.88,0.77,adjective +later,0.2,-6.03,0.53,adverb +slowly,0.2,-6.25,0.59,adverb +late,0.2,NA,NA,adverb +slow,0.2,NA,NA,adjective +finally,0.1,-4.55,0.56,adverb +eventually,0.05,-6.5,0.55,adverb +at some point,0.05,-6.7,0.65,adverbial phrase +at some stage,0.05,-7.74,1.1,adverbial phrase \ No newline at end of file diff --git a/data_raw/urgency_scores_and_scales.R b/data_raw/urgency_scores_and_scales.R new file mode 100644 index 00000000..81c998eb --- /dev/null +++ b/data_raw/urgency_scores_and_scales.R @@ -0,0 +1,244 @@ +# # This script rescales the data for each dictionary gathered from the "urgency" survey. + +# # Note that the coefficients and standard errors in the data refer to the +# # coefficients generated by the Bradley-Terry models for pairwise comparisons. +# # The assigned scores are author-assigned scores from before the survey. +# # These served as controls in models to account for "expected outcomes" and +# # improve the coefficient ranks (i.e. odds of winning in pairwise comparison). + +# # The first task is to get close synonyms or versions of adjectives/adverbs +# # scored in the same way. +# # This is done since not all versions of the same word were surveyed. +# # The second task refers to how the scores are rescaled to calculate urgency +# # and the urgency of priorities. +# # Scaled variables refer to scaled coefficients. +# # Scaling is done using 1-coefficient/min(coefficient) scaled from 0.05 to 1. +# # Rescaled variables refer to coefficients centered around a neutral term +# # and then scaled. +# # Rescaling is done using 1+centered coefficient/max(centered coefficient). +# # This is only done for timing, frequency, and intensity since rescaled scores +# # are meant to be used to calculate the urgency of priorities only.
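# # A toy illustration of the two transformations above (made-up
# # coefficients, not the survey estimates):
# # coefficient <- c(0, -1.5, -3, -6)
# # scales::rescale(1 - coefficient/min(coefficient), to = c(0.05, 1))
# # gives scaled values of 1, 0.7625, 0.525, and 0.05, while
# # centered <- coefficient + 3 (centering on a neutral term at -3) and
# # 1 + centered/max(centered) give rescaled values of 2, 1.5, 1, and 0.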
+ +library(readr) +library(dplyr) +library(scales) + +# # Timing dictionary + +# Load original data +timing <- read_csv("data_raw/time.csv") + +# Standardise scores for synonyms/equivalents +timing[which(timing$word=="hasty"),3] <- timing[which(timing$word=="hastily"),3] +timing[which(timing$word=="immediate"),3] <- timing[which(timing$word=="immediately"),3] +timing[which(timing$word=="imminent"),3] <- timing[which(timing$word=="imminently"),3] +timing[which(timing$word=="prompt"),3] <- timing[which(timing$word=="promptly"),3] +timing[which(timing$word=="quick"),3] <- timing[which(timing$word=="quickly"),3] +timing[which(timing$word=="speedy"),3] <- timing[which(timing$word=="speedily"),3] +timing[which(timing$word=="short"),3] <- timing[which(timing$word=="shortly"),3] +timing[which(timing$word=="earlier"),3] <- timing[which(timing$word=="early"),3] +timing[which(timing$word=="rapid"),3] <- timing[which(timing$word=="rapidly"),3] +timing[which(timing$word=="late"),3] <- timing[which(timing$word=="later"),3] +timing[which(timing$word=="slow"),3] <- timing[which(timing$word=="slowly"),3] + +# Rescale coefficients +timing[timing$word=="afterwards",3] # centering word score = 4.19 +timing <- timing %>% + mutate(centered_coefficient = ifelse(grammar_function != "adjective" & grammar_function != "conjunction", + coefficient + 4.19, NA), + scaled = scales::rescale(1-(coefficient/min(timing$coefficient, na.rm = TRUE)), to = c(0.05, 1)), + rescaled = ifelse(grammar_function != "adjective" & grammar_function != "conjunction", + 1+(centered_coefficient/max(timing$centered_coefficient, na.rm = TRUE)), NA)) + +# # Commitment dictionary + +# Get original file +commitment <- read_csv("data_raw/comm.csv") + +# Standardise scores for synonyms/equivalents +commitment[which(commitment$word=="necessarily"),3] <- commitment[which(commitment$word=="it is necessary"),3] +commitment[which(commitment$word=="importantly"),3] <- commitment[which(commitment$word=="it is important"),3] +commitment[which(commitment$word=="obligatory"),3] <- commitment[which(commitment$word=="oblige to"),3] +commitment[which(commitment$word=="require to"),3] <- commitment[which(commitment$word=="need to"),3] +commitment[which(commitment$word=="commit to"),3] <- commitment[which(commitment$word=="promise to"),3] +commitment[which(commitment$word=="intend to"),3] <- commitment[which(commitment$word=="promise to"),3] +commitment[which(commitment$word=="urge to"),3] <- commitment[which(commitment$word=="going to"),3] +commitment[which(commitment$word=="relevantly"),3] <- commitment[which(commitment$word=="it is relevant"),3] +commitment[which(commitment$word=="it is eminent"),3] <- commitment[which(commitment$word=="it is important"),3] +commitment[which(commitment$word=="prominent"),3] <- commitment[which(commitment$word=="it is important"),3] +commitment[which(commitment$word=="prominently"),3] <- commitment[which(commitment$word=="it is important"),3] +commitment[which(commitment$word=="preeminent"),3] <- commitment[which(commitment$word=="it is important"),3] +commitment[which(commitment$word=="pre-eminent"),3] <- commitment[which(commitment$word=="it is important"),3] +commitment[which(commitment$word=="momentous"),3] <- commitment[which(commitment$word=="it is important"),3] +commitment[which(commitment$word=="it is worthy"),3] <- commitment[which(commitment$word=="it is invaluable"),3] +commitment[which(commitment$word=="it is quintessential"),3] <- commitment[which(commitment$word=="it is essential"),3] 
+commitment[which(commitment$word=="it is valuable"),3] <- commitment[which(commitment$word=="it is invaluable"),3] +commitment[which(commitment$word=="unavoidably"),3] <- commitment[which(commitment$word=="it is unavoidable"),3] +commitment[which(commitment$word=="it is inevitable"),3] <- commitment[which(commitment$word=="it is unavoidable"),3] +commitment[which(commitment$word=="inevitably"),3] <- commitment[which(commitment$word=="it is unavoidable"),3] +commitment[which(commitment$word=="inexorable"),3] <- commitment[which(commitment$word=="it is unavoidable"),3] +commitment[which(commitment$word=="inexorably"),3] <- commitment[which(commitment$word=="it is unavoidable"),3] +commitment[which(commitment$word=="let us"),3] <- commitment[which(commitment$word=="let's"),3] +commitment[which(commitment$word=="shall"),3] <- commitment[which(commitment$word=="should"),3] +commitment[which(commitment$word=="plan to"),3] <- commitment[which(commitment$word=="intend to"),3] +commitment[which(commitment$word=="marginal"),3] <- commitment[which(commitment$word=="it is optional"),3] +commitment[which(commitment$word=="tenable"),3] <- commitment[which(commitment$word=="able to"),3] +commitment[which(commitment$word=="it is doable"),3] <- commitment[which(commitment$word=="able to"),3] +commitment[which(commitment$word=="it is plausible"),3] <- commitment[which(commitment$word=="it is possible"),3] +commitment[which(commitment$word=="plausibly"),3] <- commitment[which(commitment$word=="it is possible"),3] +commitment[which(commitment$word=="possibly"),3] <- commitment[which(commitment$word=="it is possible"),3] +commitment[which(commitment$word=="useable"),3] <- commitment[which(commitment$word=="it is useful"),3] +commitment[which(commitment$word=="ideally"),3] <- commitment[which(commitment$word=="it is ideal"),3] +commitment[which(commitment$word=="are careful to"),3] <- commitment[which(commitment$word=="cautious to"),3] +commitment[which(commitment$word=="carefully"),3] <- commitment[which(commitment$word=="cautious to"),3] +commitment[which(commitment$word=="it is obvious"),3] <- commitment[which(commitment$word=="able to"),3] +commitment[which(commitment$word=="vague"),3] <- commitment[which(commitment$word=="are ambiguous to"),3] +commitment[which(commitment$word=="option to"),3] <- commitment[which(commitment$word=="it is optional"),3] +commitment[which(commitment$word=="preferably"),3] <- commitment[which(commitment$word=="it is preferable"),3] +commitment[which(commitment$word=="get to"),3] <- commitment[which(commitment$word=="have to"),3] + +# Rescale coefficients +commitment <- commitment %>% + mutate(coefficient = ifelse(coefficient > 0, 0, coefficient), # Scale between must and may - bug? 
+ scaled = scales::rescale(1-(coefficient/min(commitment$coefficient, na.rm = TRUE)), to = c(0.05, 1)), + rescaled = scales::rescale(ifelse(grammar_function != "adjective", + 1-(coefficient/min(commitment$coefficient, na.rm = TRUE)), + NA), to = c(0.05, 1))) + +# # Intensity dictionary + +# Load original data +intensity <- read_csv("data_raw/int.csv") + +# Standardise synonyms/equivalents +intensity[which(intensity$word=="deeply"),3] <- intensity[which(intensity$word=="extremely"),3] # add scale values for similar words +intensity[which(intensity$word=="high"),3] <- intensity[which(intensity$word=="highly"),3] +intensity[which(intensity$word=="definite"),3] <- intensity[which(intensity$word=="definitely"),3] +intensity[which(intensity$word=="surely"),3] <- intensity[which(intensity$word=="certainly"),3] +intensity[which(intensity$word=="certain"),3] <- intensity[which(intensity$word=="certainly"),3] +intensity[which(intensity$word=="utterly"),3] <- intensity[which(intensity$word=="completely"),3] +intensity[which(intensity$word=="absolutely"),3] <- intensity[which(intensity$word=="completely"),3] +intensity[which(intensity$word=="urgent"),3] <- intensity[which(intensity$word=="urgently"),3] +intensity[which(intensity$word=="vigorously"),3] <- intensity[which(intensity$word=="strongly"),3] +intensity[which(intensity$word=="vigorous"),3] <- intensity[which(intensity$word=="strongly"),3] +intensity[which(intensity$word=="huge"),3] <- intensity[which(intensity$word=="enormous"),3] +intensity[which(intensity$word=="large"),3] <- intensity[which(intensity$word=="enormous"),3] +intensity[which(intensity$word=="largest"),3] <- intensity[which(intensity$word=="enormous"),3] +intensity[which(intensity$word=="big"),3] <- intensity[which(intensity$word=="enormous"),3] +intensity[which(intensity$word=="biggest"),3] <- intensity[which(intensity$word=="enormous"),3] +intensity[which(intensity$word=="global"),3] <- intensity[which(intensity$word=="everywhere"),3] +intensity[which(intensity$word=="globally"),3] <- intensity[which(intensity$word=="everywhere"),3] +intensity[which(intensity$word=="prevalent"),3] <- intensity[which(intensity$word=="pervasive"),3] +intensity[which(intensity$word=="extensively"),3] <- intensity[which(intensity$word=="extensive"),3] +intensity[which(intensity$word=="tremendous"),3] <- intensity[which(intensity$word=="enormous"),3] +intensity[which(intensity$word=="extraordinary"),3] <- intensity[which(intensity$word=="exceptional"),3] +intensity[which(intensity$word=="remarkably"),3] <- intensity[which(intensity$word=="remarkable"),3] +intensity[which(intensity$word=="immense"),3] <- intensity[which(intensity$word=="enormous"),3] +intensity[which(intensity$word=="vast"),3] <- intensity[which(intensity$word=="enormous"),3] +intensity[which(intensity$word=="ultimate"),3] <- intensity[which(intensity$word=="enormous"),3] +intensity[which(intensity$word=="ambitiously"),3] <- intensity[which(intensity$word=="determined"),3] +intensity[which(intensity$word=="ambitious"),3] <- intensity[which(intensity$word=="determined"),3] +intensity[which(intensity$word=="realistically"),3] <- intensity[which(intensity$word=="realistic"),3] +intensity[which(intensity$word=="more"),3] <- intensity[which(intensity$word=="moderately"),3] +intensity[which(intensity$word=="moderate"),3] <- intensity[which(intensity$word=="moderately"),3] +intensity[which(intensity$word=="substantial"),3] <- intensity[which(intensity$word=="substantially"),3] +intensity[which(intensity$word=="lots"),3] <- 
intensity[which(intensity$word=="substantially"),3] +intensity[which(intensity$word=="plenty"),3] <- intensity[which(intensity$word=="substantially"),3] +intensity[which(intensity$word=="much"),3] <- intensity[which(intensity$word=="substantially"),3] +intensity[which(intensity$word=="totally"),3] <- intensity[which(intensity$word=="fully"),3] +intensity[which(intensity$word=="entirely"),3] <- intensity[which(intensity$word=="fully"),3] +intensity[which(intensity$word=="far"),3] <- intensity[which(intensity$word=="substantially"),3] +intensity[which(intensity$word=="clearly"),3] <- intensity[which(intensity$word=="clear"),3] +intensity[which(intensity$word=="least"),3] <- intensity[which(intensity$word=="almost"),3] +intensity[which(intensity$word=="less"),3] <- intensity[which(intensity$word=="almost"),3] +intensity[which(intensity$word=="small"),3] <- intensity[which(intensity$word=="limited"),3] +intensity[which(intensity$word=="marginally"),3] <- intensity[which(intensity$word=="limited"),3] +intensity[which(intensity$word=="minimum"),3] <- intensity[which(intensity$word=="minimal"),3] +intensity[which(intensity$word=="weakly"),3] <- intensity[which(intensity$word=="slightly"),3] +intensity[which(intensity$word=="adequate"),3] <- intensity[which(intensity$word=="adequately"),3] +intensity[which(intensity$word=="sufficient"),3] <- intensity[which(intensity$word=="enough"),3] +intensity[which(intensity$word=="near"),3] <- intensity[which(intensity$word=="nearly"),3] +intensity[which(intensity$word=="some"),3] <- intensity[which(intensity$word=="modest"),3] +intensity[which(intensity$word=="slight"),3] <- intensity[which(intensity$word=="slightly"),3] +intensity[which(intensity$word=="reasonable"),3] <- intensity[which(intensity$word=="reasonably"),3] +intensity[which(intensity$word=="wide"),3] <- intensity[which(intensity$word=="extensive"),3] +intensity[which(intensity$word=="widely"),3] <- intensity[which(intensity$word=="extensive"),3] +intensity[which(intensity$word=="widespread"),3] <- intensity[which(intensity$word=="extensive"),3] +intensity[which(intensity$word=="intensive"),3] <- intensity[which(intensity$word=="intense"),3] +intensity[which(intensity$word=="intensely"),3] <- intensity[which(intensity$word=="intense"),3] + +# Merge commitment adverbs and rescale coefficients +intensity[intensity$word=="simply",3] # centering word score = 0.75 +intensity <- intensity %>% + mutate(centered_coefficient = ifelse(grammar_function == "adverb", + coefficient + 0.75, NA), + scaled = scales::rescale(1-(coefficient/min(intensity$coefficient, na.rm = TRUE)), to = c(0.05, 1)), + rescaled = ifelse(grammar_function == "adverb", # What about quantifiers and determiners? + 1+(centered_coefficient/max(intensity$centered_coefficient, na.rm = TRUE)), NA)) + +# # Frequency dictionary + +# Load original data +frequency <- read_csv("data_raw/freq.csv") + +# Persistently is a big outlier, we can remove it for now ... +#frequency[which(frequency$word=="persistently"),3] <- NA +# # Or take the score from another term ... 
+frequency[which(frequency$word=="persistently"),3] <- + frequency[which(frequency$word=="relentlessly"),3] + +# Standardise scores for synonyms/equivalents +frequency[which(frequency$word=="frequent"),3] <- frequency[which(frequency$word=="frequently"),3] +frequency[which(frequency$word=="constant"),3] <- frequency[which(frequency$word=="constantly"),3] +frequency[which(frequency$word=="by the minute"),3] <- frequency[which(frequency$word=="hourly"),3] +frequency[which(frequency$word=="by the hour"),3] <- frequency[which(frequency$word=="hourly"),3] +frequency[which(frequency$word=="everyday"),3] <- frequency[which(frequency$word=="daily"),3] +frequency[which(frequency$word=="nightly"),3] <- frequency[which(frequency$word=="daily"),3] +frequency[which(frequency$word=="every night"),3] <- frequency[which(frequency$word=="daily"),3] +frequency[which(frequency$word=="persistent"),3] <- frequency[which(frequency$word=="persistently"),3] +frequency[which(frequency$word=="relentless"),3] <- frequency[which(frequency$word=="relentlessly"),3] +frequency[which(frequency$word=="incessant"),3] <- frequency[which(frequency$word=="incessantly"),3] +frequency[which(frequency$word=="unrelentingly"),3] <- frequency[which(frequency$word=="relentlessly"),3] +frequency[which(frequency$word=="interminable"),3] <- frequency[which(frequency$word=="incessantly"),3] +frequency[which(frequency$word=="interminably"),3] <- frequency[which(frequency$word=="incessantly"),3] +frequency[which(frequency$word=="fortnightly"),3] <- frequency[which(frequency$word=="weekly"),3] +frequency[which(frequency$word=="yearly"),3] <- frequency[which(frequency$word=="annually"),3] +frequency[which(frequency$word=="every year"),3] <- frequency[which(frequency$word=="annually"),3] +frequency[which(frequency$word=="every hour"),3] <- frequency[which(frequency$word=="hourly"),3] +frequency[which(frequency$word=="every minute"),3] <- frequency[which(frequency$word=="hourly"),3] +frequency[which(frequency$word=="every week"),3] <- frequency[which(frequency$word=="weekly"),3] +frequency[which(frequency$word=="every month"),3] <- frequency[which(frequency$word=="monthly"),3] +frequency[which(frequency$word=="normally"),3] <- frequency[which(frequency$word=="usually"),3] +frequency[which(frequency$word=="regular"),3] <- frequency[which(frequency$word=="regularly"),3] +frequency[which(frequency$word=="steady"),3] <- frequency[which(frequency$word=="steadily"),3] +frequency[which(frequency$word=="infrequent"),3] <- frequency[which(frequency$word=="infrequently"),3] +frequency[which(frequency$word=="progressive"),3] <- frequency[which(frequency$word=="progressively"),3] +frequency[which(frequency$word=="gradual"),3] <- frequency[which(frequency$word=="gradually"),3] +frequency[which(frequency$word=="occasional"),3] <- frequency[which(frequency$word=="infrequently"),3] +frequency[which(frequency$word=="occasionally"),3] <- frequency[which(frequency$word=="infrequently"),3] +frequency[which(frequency$word=="sporadic"),3] <- frequency[which(frequency$word=="sporadically"),3] +frequency[which(frequency$word=="irregular"),3] <- frequency[which(frequency$word=="irregularly"),3] +frequency[which(frequency$word=="intermittent"),3] <- frequency[which(frequency$word=="intermittently"),3] +frequency[which(frequency$word=="rare"),3] <- frequency[which(frequency$word=="rarely"),3] + +# Rescale coefficients +(unlist(frequency[frequency$word=="gradual",3]) + # between gradual and usually + unlist(frequency[frequency$word=="usually",3]))/2 # centering word 
score = 4.51 +frequency <- frequency %>% + mutate(centered_coefficient = ifelse(grammar_function != "adjective", + coefficient + 4.51, NA), + scaled = scales::rescale(1-(coefficient/min(frequency$coefficient, na.rm = TRUE)), to = c(0.05, 1)), + rescaled = ifelse(grammar_function != "adjective", + 1+(centered_coefficient/max(frequency$centered_coefficient, na.rm = TRUE)), NA)) + +# Update names to avoid overlap +Commitment <- commitment +Frequency <- frequency +Intensity <- intensity +Timing <- timing + +# # Save the data as internal data +# # Note that the CAP Topics is another type of internal data saved in package +# # that should be (re)saved as well. +usethis::use_data(CAP_topics, Commitment, Frequency, Intensity, Timing, + internal = TRUE, overwrite = TRUE) diff --git a/docs/404.html b/docs/404.html index f410e97b..99bc900c 100644 --- a/docs/404.html +++ b/docs/404.html @@ -24,7 +24,7 @@ poldis - 0.1.0 + 0.1.1 diff --git a/docs/CODE_OF_CONDUCT.html b/docs/CODE_OF_CONDUCT.html index f7196d97..a0575f09 100644 --- a/docs/CODE_OF_CONDUCT.html +++ b/docs/CODE_OF_CONDUCT.html @@ -10,7 +10,7 @@ poldis - 0.1.0 + 0.1.1 @@ -47,7 +47,7 @@ Contributor Covenant Code of Conduct - + Source: .github/CODE_OF_CONDUCT.md diff --git a/docs/CONTRIBUTING.html b/docs/CONTRIBUTING.html index 8116d302..826f39c2 100644 --- a/docs/CONTRIBUTING.html +++ b/docs/CONTRIBUTING.html @@ -10,7 +10,7 @@ poldis - 0.1.0 + 0.1.1 @@ -47,7 +47,7 @@ Contributing - + Source: .github/CONTRIBUTING.md diff --git a/docs/LICENSE-text.html b/docs/LICENSE-text.html index 85bdc8c9..975a0883 100644 --- a/docs/LICENSE-text.html +++ b/docs/LICENSE-text.html @@ -10,7 +10,7 @@ poldis - 0.1.0 + 0.1.1 diff --git a/docs/LICENSE.html b/docs/LICENSE.html index 8fe24a4a..07110487 100644 --- a/docs/LICENSE.html +++ b/docs/LICENSE.html @@ -10,7 +10,7 @@ poldis - 0.1.0 + 0.1.1 @@ -47,7 +47,7 @@ MIT License - + Source: LICENSE.md diff --git a/docs/authors.html b/docs/authors.html index 0cf5d6df..ef3eca33 100644 --- a/docs/authors.html +++ b/docs/authors.html @@ -10,7 +10,7 @@ poldis - 0.1.0 + 0.1.1 @@ -68,7 +68,7 @@ Authors Citation - + Source: inst/CITATION H. Sposito. poldis: Tools for analyzing political discourse. 2021. @Manual{, diff --git a/docs/index.html b/docs/index.html index c77f56b7..5ade21f7 100644 --- a/docs/index.html +++ b/docs/index.html @@ -5,14 +5,14 @@ - -Tools for Analysing Political Texts • poldis + +Analyse Political Texts • poldis - - + + -{poldis}: an R package that provides tools for analyzing political discourses over time! +poldis: an R package that provides tools for analyzing political discourses over time! Why {poldis}? -{poldis} provides the tools necessary to work with political discourse texts, making it easier to analyse political discourses that are less structured and frequently involve multiple actors and topics. Political discourse data come in many different forms, ranging from single-speaker speeches to debates. While other R packages allow us to analyse data from single-speaker speeches easily, this only reveals a portion of the what, how, and where politics “gets done”. For most other settings where political discourse appears as debates, interviews, news conferences, or campaign rallies, working with text programmatically becomes tricky where there may be multiple speakers, questions, different languages, and other types of “noise” in text data. Texts might need to be separated by speakers, have questions removed for analysis, or retain only the context around specific topics and keywords. 
The package contains the tools required for wrangling and annotating these different types of political discourse texts within R. -Beyond these basic tools for text analysis, {poldis} also introduces tools for Urgency Analysis (UA), a new method for the analysis of urgency in political texts. Urgency is an expression of how critical or immediate problems or solutions are in political discourse. UA rests upon a multidimensional, weighted, and survey-validated conception of how urgency can be expressed in political discourses. It combines Natural Language Processes (NLP) and dictionary approaches to provide a contextualized, comparable, and scalable new method for the analysis of political texts.
+poldis provides the tools necessary to work with political discourse texts, making it easier to analyse political discourses that are less structured and frequently involve multiple actors and topics. Political discourse data come in many different forms, ranging from single-speaker speeches to debates. While other R packages allow us to analyse data from single-speaker speeches easily, this only reveals a portion of the what, how, and where politics “gets done”. For most other settings where political discourse appears as debates, interviews, news conferences, or campaign rallies, working with text programmatically becomes tricky where there may be multiple speakers, questions, different languages, and other types of “noise” in text data. Texts might need to be separated by speakers, have questions removed for analysis, or retain only the context around specific topics and keywords. The package contains the tools required for wrangling and annotating these different types of political discourse texts within R.
+Beyond these basic tools for text analysis, poldis also introduces tools for Urgency Analysis (UA), a new method for the analysis of urgency in political texts. Urgency is an expression of how critical or immediate problems or solutions are in political discourse. UA rests upon a multidimensional, weighted, and survey-validated conception of how urgency can be expressed in political discourses. It combines Natural Language Processing (NLP) and dictionary approaches to provide a contextualized, comparable, and scalable new method for the analysis of political texts.
Installing the package
-The easiest way to install {poldis} is directly from CRAN:
+The easiest way to install poldis is directly from CRAN:
install.packages("poldis")
-library(poldis)
+library(poldis)
However, you may also install the development version from GitHub.
# install.packages("remotes")
remotes::install_github("henriquesposito/poldis")
-library(poldis)
+library(poldis)
Getting started
-{poldis} offers several text tools for wrangling texts within R. For example, users can extract the context surrounding a specific string in the text.
+poldis offers several text tools for wrangling texts within R. For example, users can extract the context surrounding a specific string in the text.
set.seed(1234)
text <- sample(US_News_Conferences_1960_1980$text, 2, replace = TRUE)
@@ -154,7 +154,7 @@ Using {poldis} beyond
Contributing
-If you are interested in using {poldis}, please do not hesitate to contribute to the package by opening issues, asking questions, or proposing changes.
+If you are interested in using poldis, please do not hesitate to contribute to the package by opening issues, asking questions, or proposing changes.
Citing
@@ -176,7 +176,15 @@ Citing
## }
-
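Note on the data_raw changes above: the long run of `frequency[which(frequency$word == "..."), 3] <- ...` assignments copies a reference word's score onto each of its synonyms or equivalents. A more compact, table-driven version of the same step is sketched below. This is an illustration only, not part of the diff: it assumes `frequency` keeps its score in the third column and that each word appears in exactly one row, and the `synonym_map` lookup table shown here is hypothetical.

# Map each synonym/equivalent to the reference word whose score it should take
synonym_map <- data.frame(
  variant   = c("frequent", "constant", "everyday", "yearly"),
  reference = c("frequently", "constantly", "daily", "annually")
)
# Copy the reference word's score onto each variant, one row at a time
for (i in seq_len(nrow(synonym_map))) {
  frequency[frequency$word == synonym_map$variant[i], 3] <-
    frequency[frequency$word == synonym_map$reference[i], 3]
}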
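The rescaling step in the same data_raw script first shifts the coefficients by the constant 4.51 (the midpoint between "gradual" and "usually" computed just above it) and then uses `scales::rescale()` to map scores onto a bounded interval. As a minimal, self-contained illustration of what `rescale()` does with the `to` argument (input values are made up):

# rescale() maps the observed range of x linearly onto the `to` interval
scales::rescale(c(0, 0.5, 1), to = c(0.05, 1))
# returns 0.050 0.525 1.000: min -> 0.05, max -> 1, midpoint -> 0.525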
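Finally, for the Urgency Analysis functionality described in the README changes, basic usage goes through the exported `get_urgency()` and `sim_urgency()` functions. A hedged sketch only: this diff does not show the functions' signatures, so passing a raw text vector to `get_urgency()` is an assumption, while the no-argument behaviour is taken from the NEWS entry about opening codebooks.

library(poldis)
# Score urgency in two of the news conference texts bundled with the package
urgency <- get_urgency(US_News_Conferences_1960_1980$text[1:2])
# With no arguments declared, the urgency codebook is opened instead
get_urgency()
# sim_urgency() simulates urgent priorities (new in 0.1.2); see ?sim_urgency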