diff --git a/.Rbuildignore b/.Rbuildignore
index 0e91fd5b..095f201b 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -11,3 +11,4 @@
 ^pkgdown$
 ^CRAN-SUBMISSION$
 ^cran-comments\.md$
+^data_raw$
diff --git a/.gitignore b/.gitignore
index 7c065559..b3bfd274 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,5 @@
 .Ruserdata
 .DS_Store
 docs
+CRAN-SUBMISSION
+cran-comments.md
diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION
index 64ee2bc3..e277d7c0 100644
--- a/CRAN-SUBMISSION
+++ b/CRAN-SUBMISSION
@@ -1,3 +1,3 @@
-Version: 0.1.0
-Date: 2024-07-15 16:35:17 UTC
-SHA: cae719a67aa2ce72724795a42738d3a75b23ec42
+Version: 0.1.1
+Date: 2024-07-17 15:48:03 UTC
+SHA: bc6a443e9a211166975d8aa70f693ddf146102d1
diff --git a/DESCRIPTION b/DESCRIPTION
index c8be4c7e..1b361dae 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: poldis
 Type: Package
 Title: Analyse Political Texts
-Version: 0.1.1
-Date: 2024-07-17
+Version: 0.1.2
+Date: 2024-09-03
 Authors@R:
     c(person(given = "Henrique",
              family = "Sposito",
@@ -46,8 +46,9 @@ Suggests:
     pdftools,
     fmsb,
     ggplot2,
-    tm
-RoxygenNote: 7.3.1
+    tm,
+    cli
+RoxygenNote: 7.3.2
 Encoding: UTF-8
 LazyData: True
 Depends:
diff --git a/NAMESPACE b/NAMESPACE
index 118d3f02..533845b0 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -16,6 +16,7 @@ export(gather_topics)
 export(get_urgency)
 export(read_pdf)
 export(select_priorities)
+export(sim_urgency)
 export(split_text)
 import(dplyr)
 import(quanteda)
@@ -36,5 +37,7 @@ importFrom(stringr,str_detect)
 importFrom(stringr,str_extract)
 importFrom(stringr,str_extract_all)
 importFrom(stringr,str_remove_all)
+importFrom(stringr,str_replace_all)
 importFrom(stringr,str_squish)
+importFrom(textstem,lemmatize_strings)
 importFrom(tidyr,unite)
diff --git a/NEWS.md b/NEWS.md
index ffd1ddfa..1c448b23 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,19 @@
+# poldis 0.1.2
+
+2024-09-03
+
+## Package
+
+- Updated documentation for `get_urgency()` to include more details about urgency scores
+- Updated internal data and `data_raw` files to increase transparency
+
+## Functions
+
+- Updated `get_urgency()` and `gather_topics()` to open the urgency or topic codebooks when called without arguments
+- Updated urgency scores in `get_urgency()` according to survey results
+- Added `sim_urgency()` function to simulate urgent priorities
+- Updated `select_priorities()` to be stricter and more accurate in selecting priorities
+
 # poldis 0.1.1
 
 2024-07-17
diff --git a/R/priorities.R b/R/priorities.R
index 3cfaf98f..351c6b7c 100644
--- a/R/priorities.R
+++ b/R/priorities.R
@@ -13,6 +13,8 @@
 #' @importFrom dplyr mutate distinct %>%
 #' @return A data frame with syntax information by sentences and
 #' a variable identifying which of these sentences are priorities.
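+#' @details A sentence is kept as a candidate priority when its
+#' part-of-speech tags contain a pronoun followed by a modal ("PRP MD") or
+#' its lemmas match a commitment term from the internal dictionary
+#' (e.g. "need to" or "promise to"); candidates containing negations or
+#' past-tense tag patterns are then dropped.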
+#' @examples +#' #select_priorities(US_News_Conferences_1960_1980[1:2,3]) #' @export select_priorities <- function(.data, na.rm = TRUE) { tags <- sentence <- lemmas <- priorities <- NULL @@ -21,27 +23,17 @@ select_priorities <- function(.data, na.rm = TRUE) { stop("Please declare a text vector or an annotated data frame at the sentence level.") } else .data <- suppressMessages(annotate_text(.data, level = "sentences")) out <- .data %>% - dplyr::mutate(lemmas = tolower(lemmas), - priorities = ifelse(stringr::str_detect(tags, "PRP MD ")| - stringr::str_detect(lemmas, - "going to|go to |need to|ready to| - |is time to|commit to|promise to|have to| - |plan to|intend to|let 's|let us|urge| - |require|want to"), - paste(sentence), NA), # detect priorities - priorities = ifelse(stringr::str_detect(priorities, " not | - |yesterday|last week| - |last month|last year| - |thank|honor|honour| - |applause|greet|laugh| - |privilege to|great to| - |good to be|good to see") | - stringr::str_detect(tags, "MD VB( RB)? VBN| - |VBD( RB)? VBN|VBZ( RB)? VBN| - |VBD( RB)? JJ|PRP( RB)? VBD TO| - |VBN( RB)? VBN"), - # Combinations of NLP tags to select - NA, priorities)) %>% + dplyr::mutate(priorities = ifelse(stringr::str_detect(tags, "PRP MD ")| + stringr::str_detect(lemmas, paste0( + textstem::lemmatize_strings( + Commitment$word[which(Commitment$grammar_function != "adjective")]), + collapse = "|")), + lemmas, NA), # detect priorities + priorities = ifelse(stringr::str_detect(priorities, " not | never ") | + stringr::str_detect(tags, + "MD VB( RB)? VBN|VBD( RB)? VBN|VBZ( RB)? VBN| + |VBD( RB)? JJ|PRP( RB)? VBD TO|VBN( RB)? VBN"), + NA, sentence)) %>% dplyr::distinct() if (isTRUE(na.rm)) out <- filter(out, !is.na(priorities)) class(out) <- c("priorities", class(out)) diff --git a/R/sysdata.rda b/R/sysdata.rda index b4234b60..77bc7cd9 100644 Binary files a/R/sysdata.rda and b/R/sysdata.rda differ diff --git a/R/text_tools.R b/R/text_tools.R index 4c7b8e3e..5c09aff2 100644 --- a/R/text_tools.R +++ b/R/text_tools.R @@ -1,19 +1,22 @@ #' Extract a list of possible names of individuals in texts #' #' @param v A text vector. +#' @import spacyr #' @importFrom dplyr distinct filter %>% summarize group_by #' @importFrom stringr str_squish #' @importFrom stringdist stringsimmatrix -#' @import spacyr +#' @importFrom textstem lemmatize_strings #' @return A data frame of individual names and the number of times they appear. #' @details The function relies on named entity recognition from NLP models. +#' @examples +#' #extract_names(US_News_Conferences_1960_1980[20, 3]) #' @export extract_names <- function(v) { ent_type <- text <- s <- NULL suppressWarnings(spacyr::spacy_initialize(model = "en_core_web_sm")) out <- suppressWarnings(spacyr::spacy_extract_entity(v, type = "named")) %>% dplyr::filter(ent_type == "PERSON") %>% - dplyr::mutate(names = .clean_token(text)) %>% + dplyr::mutate(names = textstem::lemmatize_strings(text)) %>% dplyr::group_by(names) %>% dplyr::count(name = "count") spacyr::spacy_finalize() @@ -33,12 +36,17 @@ extract_names <- function(v) { #' Extract locations from strings #' #' @param v Text vector. +#' @import spacyr #' @importFrom stringi stri_trans_general #' @importFrom stringr str_extract #' @importFrom purrr map_chr #' @importFrom stringdist stringsimmatrix +#' @importFrom textstem lemmatize_strings #' @return A data frame of locations and the number of times they appear. #' @details The function relies on geographical entity detection from NLP models. 
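+#' Texts are first transliterated to ASCII (e.g. "São Paulo" becomes
+#' "Sao Paulo", as in the examples below) before entities are extracted.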
+#' @examples +#' #extract_locations(c("This is the United States", "This is Sao Paulo", +#' #"I was in Rio de Janeiro and Sao Paulo, then back to the United States")) #' @export extract_locations <- function(v) { v <- stringi::stri_trans_general(v, id = "Latin-ASCII") @@ -46,7 +54,7 @@ extract_locations <- function(v) { suppressWarnings(spacyr::spacy_initialize(model = "en_core_web_sm")) out <- suppressWarnings(spacyr::spacy_extract_entity(v)) %>% dplyr::filter(ent_type == "GPE") %>% - dplyr::mutate(names = .clean_token(text)) %>% + dplyr::mutate(names = textstem::lemmatize_strings(text)) %>% dplyr::group_by(names) %>% dplyr::count(name = "count") spacyr::spacy_finalize() @@ -83,6 +91,8 @@ extract_title <- function(v) { #' Wrapper function for `messydates::as_messydates`. #' @param v Text vector. #' @return A vector of the dates in text. +#' @examples +#' #extract_date("Today is the twenty six of February of two thousand and twenty four") #' @export extract_date <- function(v) { thisRequires("messydates") @@ -213,6 +223,8 @@ extract_context <- function(match, v, level = "sentences", n = 1) { #' in selecting a method, please see `?quanteda.textstats::textstat_simil()`. #' @importFrom dplyr group_by summarise select %>% #' @return A matrix of similarity scores between texts. +#' @examples +#' #extract_text_similarities(US_News_Conferences_1960_1980[1:2,3]) #' @export extract_text_similarities <- function(v, comparison = "similarities", method) { thisRequires("quanteda.textstats") @@ -307,12 +319,16 @@ read_pdf <- function(path) { #' Defaults to "words". #' @import spacyr #' @importFrom dplyr group_by summarise ungroup %>% -#' @importFrom stringr str_squish +#' @importFrom stringr str_squish str_replace_all #' @return A data frame with syntax information by words or sentences in text. +#' @examples +#' #annotate_text(US_News_Conferences_1960_1980[1:2, 3]) +#' #annotate_text(US_News_Conferences_1960_1980[1:2, 3], level = "sentence") #' @export annotate_text <- function(v, level = "words") { doc_id <- sentence_id <- token_id <- token <- pos <- tag <- lemma <- entity <- NULL suppressWarnings(spacyr::spacy_initialize(model = "en_core_web_sm")) + v <- stringr::str_replace_all(v, "\\.\\,|\\. \\,|\\,\\.|\\, \\.|\\.\\\n\\,", ".") parse <- spacyr::spacy_parse(v, tag = TRUE) suppressWarnings(spacyr::spacy_finalize()) if (level == "sentences" | level == "sentence") { diff --git a/R/topic.R b/R/topic.R index 21be3e31..720c28b6 100644 --- a/R/topic.R +++ b/R/topic.R @@ -4,14 +4,18 @@ #' `select_priorities()`, or text vector. #' For data frames, function will search for "text" variable. #' For priorities data frame function will search for "priorities" variable. +#' If missing, opens the webpage containing the political topics codebook. #' @param dictionary The dictionary of 20 major political topics from the #' Comparative Agendas Project (Jones et al., 2023) is used by default. #' Users can also declare a custom dictionary as a vector or a list. #' If users declare a vector, each element is treated as a independent topic. #' If users declare a list of subjects and related terms, function understands #' names as topic and words as terms. +#' For more information on how the CAP topics were adapted, please run +#' `gather_topics()` to access the political topics codebook. #' @import dplyr #' @importFrom tidyr unite +#' @importFrom textstem lemmatize_strings #' @return A list of topics present in each text separated by comma. 
#' @examples #' \donttest{ @@ -21,20 +25,25 @@ #' gather_topics(US_News_Conferences_1960_1980[1:5, 3], #' dictionary = list("military" = c("military", "gun", "war"), #' "development" = c("development", "interest rate", "banks"))) +#' #summary(gather_topics(US_News_Conferences_1960_1980[1:5, 3])) +#' #plot(gather_topics(US_News_Conferences_1960_1980[1:5, 3], +#' # dictionary = c("military", "development"))) #' } #' @export gather_topics <- function(.data, dictionary = "CAP") { Words <- topics <- NULL + # tries to open topic codebook if no argument is declared + if (missing(.data)) open_codebook(codebook = "topic") # get text variable if (inherits(.data, "priorities")) { - text <- stats::na.omit(.clean_token(getElement(.data, "priorities"))) + text <- stats::na.omit(textstem::lemmatize_strings(getElement(.data, "priorities"))) } else if (inherits(.data, "data.frame")) { - text <- .clean_token(getElement(.data, "text")) - } else text <- .clean_token(.data) + text <- textstem::lemmatize_strings(getElement(.data, "text")) + } else text <- textstem::lemmatize_strings(.data) # get dictionary if (any(dictionary == "CAP")) { dictionary <- CAP_topics %>% - dplyr::mutate(Words = stringr::str_replace_all(.clean_token(Words), + dplyr::mutate(Words = stringr::str_replace_all(textstem::lemmatize_strings(Words), ", ", "\\\\b|\\\\b")) subjects <- dictionary$Words names(subjects) <- dictionary$Topic @@ -72,6 +81,7 @@ gather_topics <- function(.data, dictionary = "CAP") { #' @import quanteda #' @import dplyr #' @importFrom stringr str_detect str_remove_all +#' @importFrom textstem lemmatize_strings #' @return A list of related terms to each of the topics declared in dictionary. #' @details This function relies on keyword assisted topic models implemented #' in the `\{keyATM\}` package to find related words based on the topics @@ -81,16 +91,23 @@ gather_topics <- function(.data, dictionary = "CAP") { #' “Keyword-Assisted Topic Models.” #' _American Journal of Political Science_, 68(2): 730-750. #' \doi{10.1111/ajps.12779} +#' @examples +#' #gather_related_terms(US_News_Conferences_1960_1980[1:5, 3], dictionary = "CAP") +#' #gather_related_terms(US_News_Conferences_1960_1980[1:5, 3], +#' # dictionary = c("military", "development")) +#' #gather_related_terms(US_News_Conferences_1960_1980[1:5, 3], +#' # dictionary = list("military" = c("military", "gun", "war"), +#' # "development" = c("development", "interest rate", "banks"))) #' @export gather_related_terms <- function(.data, dictionary) { Words <- NULL thisRequires("keyATM") # get text variable if (inherits(.data, "priorities")) { - text <- stats::na.omit(.clean_token(getElement(.data, "priorities"))) + text <- stats::na.omit(textstem::lemmatize_strings(getElement(.data, "priorities"))) } else if (inherits(.data, "data.frame")) { - text <- .clean_token(getElement(.data, "text")) - } else text <- .clean_token(.data) + text <- textstem::lemmatize_strings(getElement(.data, "text")) + } else text <- textstem::lemmatize_strings(.data) # check dictionary if (any(dictionary == "CAP")) { subjects <- CAP_topics %>% diff --git a/R/urgency.R b/R/urgency.R index 158ad02f..c375b49f 100644 --- a/R/urgency.R +++ b/R/urgency.R @@ -4,94 +4,231 @@ #' `select_priorities()`, or text vector. #' For data frames, function will search for "text" variable. #' For priorities data frame function will search for "priorities" variable. -#' @param normalize Would you like urgency scores to be normalized? 
-#' By default, urgency scores are normalized by "tokens",
-#' the number of words in text observation.
-#' Users can also declare "none", for no normalization.
+#' If missing, opens the webpage containing the urgency codebook.
+#' @param summarise How should multiple matches for the same dimension
+#' in the same text observation be handled?
+#' By default, multiple matches are added together and
+#' their "sum" per text observation is returned.
+#' Users can, instead, choose the "mean", which returns the average
+#' score per dimension per text observation when there are multiple matches.
+#' The "mean" can also serve as a form of normalization per
+#' dimension and text observation in certain cases.
+#' @details
+#' Urgency in political discourses is an expression of how necessary and/or
+#' how soon an action should be undertaken or completed.
+#' This is measured along four dimensions,
+#' two related to necessity and two related to timing.
+#' The first two dimensions, degree of intensity and degree of commitment,
+#' relate to how necessary the action is, while the next two dimensions,
+#' frequency of action and timing of action,
+#' relate to how soon the action should be taken.
+#' Our dictionary includes terms for each of these dimensions,
+#' validated and adjusted through an online survey
+#' that took place between July and August of 2024.
+#' The survey results were recorded as counts of the number of participants
+#' who selected an urgency-related word as more urgent than its pair.
+#' To analyze the survey results, we employed Bradley-Terry models for
+#' paired comparisons.
+#' The analysis yielded a ranking of the words in each dimension of urgency,
+#' which was then used to create the urgency word scores in the dictionaries.
+#' For more information on the dimensions, scores, or the survey on urgency,
+#' please run `get_urgency()` to access the urgency codebook.
+#' For priorities (i.e. texts coded using `select_priorities()`),
+#' urgency scores are calculated by multiplying the commitment scores by all
+#' other dimensions.
+#' This is done because commitment words are indicative of political
+#' priorities; for more information, please refer to the
+#' `select_priorities()` function.
+#' For vectors or data frames, urgency scores are calculated by
+#' adding the commitment and intensity dimension scores (i.e. how necessary)
+#' and multiplying these by the sum of the timing and frequency dimension
+#' scores (i.e. how soon).
+#' In both cases, urgency scores of zero indicate no urgency, but maximum
+#' scores can vary.
 #' @return A scored data frame for each dimension of urgency.
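+#' @section Scoring example:
+#' As an illustration of the vector/data frame formula, with hypothetical
+#' dimension scores of commitment = 0.8, intensity = 1, timing = 1, and
+#' frequency = 0, a text observation would score
+#' ((1 + 0.8) + (1 + 1))/4 * ((1 + 1) + (1 + 0))/4 = 0.95 * 0.75 = 0.7125.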
#' @import dplyr +#' @importFrom textstem lemmatize_strings #' @examples #' \donttest{ #' get_urgency(US_News_Conferences_1960_1980[1:10, 3]) #' get_urgency(US_News_Conferences_1960_1980[1:10,]) +#' #get_urgency(select_priorities(US_News_Conferences_1960_1980[1:2, 3])) +#' #summary(get_urgency(US_News_Conferences_1960_1980[1:10, 3])) +#' #plot(get_urgency(US_News_Conferences_1960_1980[1:10, 3])) +#' #get_urgency() #' } #' @export -get_urgency <- function(.data, normalize = "tokens") { +get_urgency <- function(.data, summarise = "sum") { Frequency <- Timing <- Commitment <- Intensity <- Urgency <- text_clean <- NULL + # tries to open urgency codebook if no argument is declared + if (missing(.data)) open_codebook(codebook = "urgency") # get text variable if (inherits(.data, "priorities")) { - text_clean <- getElement(.data, "priorities") + text_clean <- getElement(.data, "lemmas") + priorities <- TRUE } else if (inherits(.data, "data.frame")) { - text_clean <- getElement(.data, "text") - } else text_clean <- .data + if ("lemmas" %in% names(.data)) text_clean <- getElement(.data, "lemmas") else + text_clean <- textstem::lemmatize_strings(getElement(.data, "text")) + priorities <- FALSE + } else { + text_clean <- textstem::lemmatize_strings(.data) + priorities <- FALSE + } # assign urgency dimensions - out <- data.frame("text_clean" = .clean_token(text_clean)) %>% - dplyr::mutate(Frequency = .assign_frequencies(text_clean), - Timing = .assign_timing(text_clean), - Intensity = .assign_intensity(text_clean), - Commitment = .assign_commitment(text_clean)) - if (normalize == "tokens") { + out <- data.frame("text_clean" = text_clean) %>% + dplyr::mutate(Frequency = .assign_urgency_dimensions(text_clean, dimension = "frequency", + summarise = summarise, priority = priorities), + Timing = .assign_urgency_dimensions(text_clean, dimension = "timing", + summarise = summarise, priority = priorities), + Intensity = .assign_urgency_dimensions(text_clean, dimension = "intensity", + summarise = summarise, priority = priorities), + Commitment = .assign_urgency_dimensions(text_clean, dimension = "commitment", + summarise = summarise, priority = priorities)) + # calculate urgency scores + if (isTRUE(priorities)) { out <- out %>% - dplyr::mutate(Urgency = (Frequency + Timing + Intensity + Commitment)/ - nchar(text_clean)) - } else if (normalize == "none") { + dplyr::mutate(Urgency = Commitment * ifelse(Intensity == 0, 1, Intensity) * + ifelse(Timing == 0, 1, Timing) * ifelse(Frequency == 0, 1, Frequency)) + } else { out <- out %>% - dplyr::mutate(Urgency = (Frequency + Timing + Intensity + Commitment)) + dplyr::mutate(Urgency = (((1 + Commitment) + (1 + Intensity))/4) * + (((1 + Timing) + (1 + Frequency))/4)) } + # bind data results to original data out <- cbind(.data, out) %>% dplyr::select(-c(text_clean)) class(out) <- c("urgency", class(out)) out } -.assign_frequencies <- function(v) { - frequency <- score_frequency <- NULL - freq_words <- urgency_word_scores[,1:2] %>% - tidyr::drop_na() %>% - dplyr::mutate(frequency = textstem::lemmatize_words(frequency)) %>% - dplyr::distinct() %>% # 61 terms - dplyr::group_by(score_frequency) %>% - summarise(terms = paste0(frequency, collapse = "|")) - rowSums(do.call("cbind", lapply(seq_len(nrow(freq_words)), function(i) - stringr::str_count(as.character(v), paste0("\\b", freq_words$terms[i], "\\b"))* - freq_words$score_frequency[i]))) -} - -.assign_timing <- function(v) { - timing <- score_timing <- NULL - timing_words <- urgency_word_scores[,3:4] %>% 
+.assign_urgency_dimensions <- function(v, dimension, summarise, priority) {
+  scaled <- rescaled <- word <- grammar_function <- NULL
+  # get dictionaries
+  if (dimension == "frequency") {
+    out <- Frequency
+  } else if (dimension == "timing") {
+    out <- Timing
+  } else if (dimension == "commitment") {
+    out <- Commitment
+  } else if (dimension == "intensity") {
+    out <- Intensity
+  }
+  # check if priorities and get correct scale and data
+  if (isTRUE(priority)) { # for urgency in priorities
+    out <- dplyr::rename(out, scale = rescaled) %>%
+      dplyr::select(word, scale)
+  } else out <- dplyr::rename(out, scale = scaled) %>% dplyr::select(word, scale)
+  # remove missing and group words
+  out <- out %>%
     tidyr::drop_na() %>%
-    dplyr::mutate(timing = textstem::lemmatize_words(timing)) %>%
-    dplyr::distinct() %>% # 41 terms
-    dplyr::group_by(score_timing) %>%
-    summarise(terms = paste0(timing, collapse = "|"))
-  rowSums(do.call("cbind", lapply(seq_len(nrow(timing_words)), function(i)
-    stringr::str_count(as.character(v), paste0("\\b", timing_words$terms[i], "\\b"))*
-      timing_words$score_timing[i])))
+    dplyr::mutate(word = textstem::lemmatize_strings(word)) %>%
+    dplyr::distinct() %>%
+    dplyr::group_by(scale) %>%
+    summarise(terms = paste0(word, collapse = "|"))
+  # summarise if multiple matches per dimension per text observation
+  if (summarise == "sum") {
+    out <- rowSums(do.call("cbind", lapply(seq_len(nrow(out)), function(i)
+      stringr::str_count(as.character(v), paste0("\\b", out$terms[i], "\\b"))*
+        out$scale[i]))) # get added scores
+  } else if (summarise == "mean") {
+    rowcount <- rowSums(do.call("cbind", lapply(seq_len(nrow(out)), function(i)
+      stringr::str_count(as.character(v), paste0("\\b", out$terms[i], "\\b"))))) # get count
+    out <- rowSums(do.call("cbind", lapply(seq_len(nrow(out)), function(i)
+      stringr::str_count(as.character(v), paste0("\\b", out$terms[i], "\\b"))*
+        out$scale[i]))) # get added scores
+    out <- out/(replace(rowcount, rowcount == 0, 1)) # get average value
+  }
+  out
 }
-.assign_intensity <- function(v) {
-  intensity <- score_intensity <- NULL
-  intensity_words <- urgency_word_scores[,5:6] %>%
-    tidyr::drop_na() %>%
-    dplyr::mutate(intensity = textstem::lemmatize_words(intensity)) %>%
-    dplyr::distinct() %>% # 103 terms
-    dplyr::group_by(score_intensity) %>%
-    summarise(terms = paste0(intensity, collapse = "|"))
-  rowSums(do.call("cbind", lapply(seq_len(nrow(intensity_words)), function(i)
-    stringr::str_count(as.character(v), paste0("\\b", intensity_words$terms[i], "\\b"))*
-      intensity_words$score_intensity[i])))
+#' Simulating urgency in priorities
+#'
+#' @param urgency Desired urgency score, optional.
+#' @param commitment Desired commitment score, optional.
+#' @param intensity Desired intensity score, optional.
+#' @param timing Desired timing score, optional.
+#' @param frequency Desired frequency score, optional.
+#' @param pronoun How would you like the simulated priorities to start?
+#' By default, priorities start with the pronoun "We".
+#' @details
+#' Users can declare either a score for one or more of the urgency
+#' dimensions or an overall urgency score, but not both at once.
+#' If both are declared, the urgency score is favored.
+#' @return A sentence that matches the urgency or urgency dimension scores.
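+#' When an urgency score is declared, the sentence is built from the
+#' combination of dictionary words whose product of scores lies closest to
+#' the requested value, so the printed score may differ slightly from the
+#' score requested.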
+#' @examples +#' \donttest{ +#' sim_urgency() +#' sim_urgency(urgency = 0.5) +#' sim_urgency(urgency = 2.5) +#' sim_urgency(urgency = -2.5) +#' sim_urgency(commitment = 0.6) +#' sim_urgency(commitment = 0.6, intensity = 1.4) +#' sim_urgency(commitment = 0.6, intensity = 1.4, timing = 1.4) +#' sim_urgency(commitment = 0.6, intensity = 1.2, timing = 1.4, frequency = 1.8) +#' } +#' @export +sim_urgency <- function(urgency, + commitment, intensity, timing, frequency, + pronoun = "We") { + grammar_function <- rescaled <- NULL + # Filter data to avoid using adverbs, for now + Commitment <- subset(Commitment, !is.na(rescaled) & grepl("^verb$", grammar_function)) + # only "commitment" verbs for now + Intensity <- subset(Intensity, !is.na(rescaled)) + Timing <- subset(Timing, !is.na(rescaled)) + Frequency <- subset(Frequency, !is.na(rescaled)) + if(!missing(urgency)){ + # Either timing or frequency, for now + time_freq <- rbind(Timing, Frequency) + combins <- expand.grid(Intensity$word, Commitment$word, time_freq$word, + stringsAsFactors = FALSE) + combins <- merge(combins, Intensity, by.x = "Var1", by.y = "word") + combins <- combins[,c("Var1", "Var2", "Var3", "rescaled")] + combins <- merge(combins, Commitment, by.x = "Var2", by.y = "word") + combins <- combins[,c("Var1", "Var2", "Var3", "rescaled.x", "rescaled.y")] + combins <- merge(combins, time_freq, by.x = "Var3", by.y = "word") + combins <- combins[,c("Var1", "Var2", "Var3", "rescaled.x", + "rescaled.y", "rescaled")] + combins$combo <- as.numeric(combins$rescaled.x) * + as.numeric(combins$rescaled.y) * as.numeric(combins$rescaled) + selectd <- which.min.diff(abs(urgency), combins$combo) + formul <- combins[selectd,c("Var1", "Var2", "Var3")] + if(urgency < 0) intcom <- c(formul[1:2], sample(c("not", "never"), 1)) else + intcom <- formul[1:2] + out <- paste(pronoun, paste(intcom, collapse = " "), "do this", formul[3]) + cat("Urgency score: ", + combins[selectd,"rescaled.x"] * combins[selectd, "rescaled.y"] * + combins[selectd,"rescaled"], "\n", sep = "") + } else { + if(!missing(commitment)){ + commit <- Commitment$word[which.min.diff(abs(commitment), Commitment$rescaled)] + if(commitment < 0) commit <- paste(commit, sample(c("not","never"), 1)) + if(!missing(intensity)){ + intensifier <- Intensity$word[which.min.diff(intensity, Intensity$rescaled)] + out <- paste(pronoun, intensifier, commit, "do this") + } else out <- paste(pronoun, commit, "do this") + } else out <- paste(pronoun, "do this") + if(!missing(timing)){ + timed <- Timing$word[which.min.diff(timing, Timing$rescaled)] + out <- paste(out, timed) + } + if(!missing(frequency)){ + freq <- Frequency$word[which.min.diff(frequency, Frequency$rescaled)] + out <- paste(out, freq) + } + cat("Urgency score:", + ifelse(missing(commitment),1,commitment) * ifelse(missing(intensity),1,intensity) * + ifelse(missing(timing),1,timing) *ifelse(missing(frequency),1,frequency), + "\n") + } + out <- paste0(trimws(out), ".") + out } -.assign_commitment <- function(v) { - commitment <- score_commitment <- NULL - commitment_words <- urgency_word_scores[,7:8] %>% - tidyr::drop_na() %>% - dplyr::mutate(commitment = textstem::lemmatize_words(commitment)) %>% - dplyr::distinct() %>% # 78 terms - dplyr::group_by(score_commitment) %>% - summarise(terms = paste0(commitment, collapse = "|")) - out <- list() - rowSums(do.call("cbind", lapply(seq_len(nrow(commitment_words)), function(i) - stringr::str_count(as.character(v), paste0("\\b", commitment_words$terms[i], "\\b"))* - 
commitment_words$score_commitment[i]))) +which.min.diff <- function(x, y){ + diffs <- abs(x - y) + y <- which(diffs == min(diffs, na.rm = TRUE)) + if (length(y) > 1L) + sample(y, 1L) + else y } diff --git a/R/utils.R b/R/utils.R index b0599f36..9627be2f 100644 --- a/R/utils.R +++ b/R/utils.R @@ -10,6 +10,29 @@ thisRequires <- function(pkgname){ } } -.clean_token <- function(v) { - textstem::lemmatize_words(stringr::str_squish(tolower(v))) +# Helper function to get urgency or topics codebook. +open_codebook <- function(codebook = "urgency") { + thisRequires("cli") + url <- "https://github.com/henriquesposito/poldis/tree/develop/inst" + if (codebook == "urgency") { + tryCatch({ + utils::browseURL(paste0(url, "/urgency_codebook.pdf"), + browser = getOption("browser"), encodeIfNeeded = FALSE) + }, error = function(e) { + message(paste0("Unable to open codebook, please visit: ", + cli::style_hyperlink(paste0(url, "/urgency_codebook.pdf"), + paste0(url, "/urgency_codebook.pdf")))) + }) + } else if (codebook == "topic") { + tryCatch({ + utils::browseURL(paste0(url, "/topic.pdf"), browser = getOption("browser"), encodeIfNeeded = FALSE) + }, error = function(e) { + message(paste0("Unable to open codebook, please visit: ", + cli::style_hyperlink(paste0(url, "/topic.pdf"), + paste0(url, "/topic.pdf")))) + }) + } else { + message(paste0("Codebook not found, to see the available codebooks please visit: ", + cli::style_hyperlink(url, url))) + } } diff --git a/data_raw/comm.csv b/data_raw/comm.csv new file mode 100644 index 00000000..6f564e4c --- /dev/null +++ b/data_raw/comm.csv @@ -0,0 +1,88 @@ +word,assigned_score,coefficient,se,grammar_function +get to,0.7,,,verb +oblige to,1,-1.01,0.49,verb +need to,1,-1.04,0.43,verb +must,1,0,NA,verb +require to,1,NA,NA,verb +time to,0.8,-0.85,0.47,verb +have to,0.8,-0.92,0.41,verb +going to,0.8,-1.64,0.58,verb +ready to,0.8,-1.94,0.44,verb +will,0.8,-2.65,0.54,verb +urge to,0.8,NA,NA,verb +promise to,0.7,-2.84,0.5,verb +commit to,0.7,NA,NA,verb +intend to,0.7,NA,NA,verb +should,0.5,-2.49,0.52,verb +want to,0.5,-2.58,0.47,verb +would,0.5,-5.49,1.03,verb +can,0.5,-5.64,0.94,verb +might,0.5,-6.1,1.53,verb +could,0.5,-7.04,1.05,verb +may,0.5,-7.63,1.34,verb +shall,0.5,NA,NA,verb +plan to,0.5,NA,NA,verb +option to,0.5,NA,NA,noun +let's,0.5,-1.8,0.52,imperative +let us,0.5,NA,NA,imperative +necessarily,1,NA,NA,adverb +importantly,1,NA,NA,adverb +unavoidably,0.8,NA,NA,adverb +inevitably,0.8,NA,NA,adverb +inexorably,0.8,NA,NA,adverb +prominently,0.8,NA,NA,adverb +relevantly,0.7,NA,NA,adverb +ideally,0.5,NA,NA,adverb +preferably,0.5,NA,NA,adverb +carefully,0.5,NA,NA,adverb +plausibly,0.5,NA,NA,adverb +possibly,0.5,NA,NA,adverb +it is imperative,1,1.09,0.5,verb and adjective +serious,1,0.79,0.58,adjective +it is essential,1,0.7,0.37,verb and adjective +severe,1,0.57,0.87,adjective +it is best to,1,0.12,0.75,verb and adjective +it is important,1,-0.39,0.35,verb and adjective +it is significant,1,-1.03,0.5,verb and adjective +major,1,-1.88,1.24,adjective +it is necessary,1,0,NA,verb and adjective +obligatory,1,NA,NA,adjective +momentous,1,NA,NA,adjective +it is unavoidable,0.8,0.44,0.47,verb and adjective +it is invaluable,0.8,-0.8,0.4,verb and adjective +it is quintessential,0.8,NA,NA,verb and adjective +it is worthy,0.8,NA,NA,verb and adjective +it is valuable,0.8,NA,NA,verb and adjective +it is inevitable,0.8,NA,NA,verb and adjective +inexorable,0.8,NA,NA,adjective +it is eminent,0.8,NA,NA,verb and adjective +prominent,0.8,NA,NA,adjective +preeminent,0.8,NA,NA,adjective 
+pre-eminent,0.8,NA,NA,adjective +it is relevant,0.7,-1.6,0.6,verb and adjective +it is optimal,0.5,-0.28,0.7,verb and adjective +it is timely,0.5,-0.82,0.95,verb and adjective +it is better to,0.5,-1.28,0.86,verb and adjective +are willing to,0.5,-1.53,0.72,verb and adjective +it is opportune,0.5,-1.57,1.05,verb and adjective +cautious to,0.5,-1.69,1.04,verb and adjective +able to,0.5,-2.55,0.9,verb and adjective +it is good to,0.5,-2.04,0.52,verb and adjective +it is preferable,0.5,-2.06,0.75,verb and adjective +it is ideal,0.5,-2.06,0.49,verb and adjective +it is useful,0.5,-2.53,0.51,verb and adjective +it is great to,0.5,-2.75,0.71,verb and adjective +it is optional,0.5,-3.28,1.06,verb and adjective +it is possible,0.5,-4.59,1.04,verb and adjective +tenable,0.5,NA,NA,adjective +it is doable,0.5,NA,NA,verb and adjective +it is obvious,0.5,NA,NA,verb and adjective +useable,0.5,NA,NA,adjective +marginal,0.5,NA,NA,adjective +are careful to,0.5,NA,NA,verb and adjective +it is plausible,0.5,NA,NA,verb and adjective +it is trivial,0.2,-2.23,1.1,verb and adjective +are ambiguous to,0.2,-3.07,1.29,verb and adjective +are ambivalent to,0.2,-3.14,1.27,verb and adjective +it is insignificant,0.2,-3.69,1.08,verb and adjective +vague,0.2,NA,NA,adjective \ No newline at end of file diff --git a/data_raw/freq.csv b/data_raw/freq.csv new file mode 100644 index 00000000..060879c5 --- /dev/null +++ b/data_raw/freq.csv @@ -0,0 +1,64 @@ +word,assigned_score,coefficient,se,grammar_function +persistently,1,13.18,1873.64,adverb +relentlessly,1,-0.33,1.91,adverb +constantly,1,-0.99,1.21,adverb +incessantly,1,-1.02,1.77,adverb +always,1,0,NA,adverb +daily,1,-0.38,0.8,adverb +hourly,1,0,NA,adverb +by the minute,1,NA,NA,adverb +by the hour,1,NA,NA,adverb +every hour,1,NA,NA,adverb +every minute,1,NA,NA,adverb +persistent,1,NA,NA,adverb +relentless,1,NA,NA,adverb +unrelentingly,1,NA,NA,adverb +everyday,1,NA,NA,adverb +nightly,1,NA,NA,adverb +every night,1,NA,NA,adverb +constant,1,NA,NA,adjective +incessant,1,NA,NA,adjective +interminable,1,NA,NA,adjective +interminably,1,NA,NA,adverb +weekly,0.9,-1.65,0.65,adverb +every week,0.9,NA,NA,adverb +steadily,0.8,-2.81,1.13,adverb +generally,0.8,-3.99,1.21,adverb +usually,0.8,-4.27,1.11,adverb +fortnightly,0.8,NA,NA,adverb +steady,0.8,NA,NA,adjective +normally,0.8,NA,NA,adverb +frequently,0.7,-2.56,1.09,adverb +regularly,0.7,-2.92,1.18,adverb +often,0.7,-3.37,1.06,adverb +monthly,0.7,-3.21,0.76,adverb +frequent,0.7,NA,NA,adjective +regular,0.7,NA,NA,adjective +every month,0.7,NA,NA,adverb +incrementally,0.6,-2.6,1.8,adverb +progressively,0.6,-2.74,1.2,adverb +gradually,0.6,-4.75,1.09,adverb +quarterly,0.6,-3.38,0.91,adverb +progressive,0.6,NA,NA,adjective +gradual,0.6,NA,NA,adjective +sometimes,0.5,-6.28,1.19,adverb +annually,0.5,-5.01,1.03,adverb +yearly,0.5,NA,NA,adverb +every year,0.5,NA,NA,adverb +intermittently,0.3,-3.29,1.69,adverb +sporadically,0.3,-5.36,1.31,adverb +occasionally,0.3,-6.32,1.19,adverb +irregularly,0.3,-7.33,1.25,adverb +intermittent,0.3,NA,NA,adjective +sporadic,0.3,NA,NA,adjective +occasional,0.3,NA,NA,adjective +irregular,0.3,NA,NA,adjective +seldom,0.2,-7.13,1.14,adverb +infrequently,0.1,-6.46,1.13,adverb +infrequent,0.1,NA,NA,adjective +unusual,0.05,-5.77,1.18,adjective +rarely,0.05,-6.48,1.11,adverb +rare,0.05,NA,NA,adjective +scarcely,0.02,-5.88,1.2,adverb +hardly,0.02,-6.3,1.14,adverb +hardly ever,0.02,-6.46,1.49,adverb \ No newline at end of file diff --git a/data_raw/int.csv b/data_raw/int.csv new file mode 100644 index 00000000..9d4b3471 
--- /dev/null +++ b/data_raw/int.csv @@ -0,0 +1,105 @@ +word,assigned_score,coefficient,se,grammar_function +strongly,1,1.79,1.94,adverb +desperately,1,1.69,0.46,adverb +extremely,1,1.62,0.52,adverb +urgently,1,1.27,0.44,adverb +intensively,1,0.96,0.79,adverb +highly,1,0.83,0.34,adverb +especially,1,0.4,0.72,adverb +firmly,1,0.35,2.07,adverb +really,1,-0.36,0.24,adverb +certainly,1,-0.71,0.25,adverb +everywhere,1,-2.16,1.09,adverb +simply,1,-0.75,0.38,adverb +practically,1,-3.66,1.47,adverb +definitely,1,0,NA,adverb +maximum,1,3.04,1.52,adjective +exceptional,1,1.85,0.92,adjective +extra,1,1.15,0.46,adjective +determined,1,1,1.05,adjective +pervasive,1,0.29,0.78,adjective +extensive,1,-0.35,0.5,adjective +unmatched,1,-0.39,0.8,adjective +remarkable,1,-0.51,1.09,adjective +intense,1,-1.82,1.08,adjective +enormous,1,0,NA,adjective +extraordinary,1,NA,NA,adjective +deeply,1,NA,NA,adverb +urgent,1,NA,NA,adjective +huge,1,NA,NA,adjective +tremendous,1,NA,NA,adjective +immense,1,NA,NA,adjective +vast,1,NA,NA,adjective +ultimate,1,NA,NA,adjective +high,1,NA,NA,adjective +definite,1,NA,NA,adjective +utterly,1,NA,NA,adverb +surely,1,NA,NA,adverb +certain,1,NA,NA,adjective +vigorously,1,NA,NA,adverb +vigorous,1,NA,NA,adjective +prevalent,1,NA,NA,adjective +intensive,1,NA,NA,adjective +extensively,1,NA,NA,adverb +wide,1,NA,NA,adjective +widely,1,NA,NA,adverb +widespread,1,NA,NA,adjective +remarkably,1,NA,NA,adverb +intensely,1,NA,NA,adverb +globally,1,NA,NA,adverb +global,1,NA,NA,adjective +completely,0.6,2.27,1.57,adverb +fully,0.6,1.34,0.74,adverb +most,0.6,0.87,1.79,determiner +very,0.6,0.83,0.51,adverb +substantially,0.6,-0.26,1.26,adverb +more,0.6,-0.53,0.31,determiner +moderately,0.6,-3,0.58,adverb +above,0.6,-0.84,1.08,preposition +realistic,0.6,-0.94,0.53,adjective +insufficient,0.6,-3.2,1.74,adjective +large,0.6,NA,NA,adjective +largest,0.6,NA,NA,adjective +big,0.6,NA,NA,adjective +biggest,0.6,NA,NA,adjective +totally,0.6,NA,NA,adverb +entirely,0.6,NA,NA,adverb +absolutely,0.6,NA,NA,adverb +ambitious,0.6,NA,NA,adjective +ambitiously,0.6,NA,NA,adverb +realistically,0.6,NA,NA,adverb +substantial,0.6,NA,NA,adjective +lots,0.6,NA,NA,quantifier +much,0.6,NA,NA,quantifier +far,0.6,NA,NA,adjective +moderate,0.6,NA,NA,adjective +plenty,0.5,NA,NA,quantifier +quite,0.4,-1.47,0.55,adverb +almost,0.4,-1.98,0.51,adverb +somewhat,0.4,-3.39,1.48,adverb +clear,0.4,-0.74,0.97,adjective +least,0.4,NA,NA,determiner +less,0.4,NA,NA,determiner +clearly,0.4,NA,NA,adverb +enough,0.2,3.97,1.45,determiner +nearly,0.2,-1.34,0.54,adverb +rather,0.2,-1.4,0.39,adverb +barely,0.2,-1.53,0.89,adverb +adequately,0.2,-2.45,1.94,adverb +reasonably,0.2,-2.4,0.98,adverb +slightly,0.2,-3.76,0.89,adverb +minimal,0.2,-1.53,0.72,adjective +modest,0.2,-1.77,0.59,adjective +low,0.2,-1.88,0.75,adjective +little,0.2,-2.41,0.98,adjective +some,0.2,-3.3,1.49,determiner +average,0.2,-3.48,1.59,adjective +limited,0.2,-4.39,1.63,adjective +sufficient,0.2,NA,NA,adjective +minimum,0.2,NA,NA,adjective +near,0.2,NA,NA,adverb +reasonable,0.2,NA,NA,adjective +weakly,0.2,NA,NA,adverb +slight,0.2,NA,NA,adjective +small,0.2,NA,NA,adjective +adequate,0.2,NA,NA,adjective \ No newline at end of file diff --git a/data_raw/time.csv b/data_raw/time.csv new file mode 100644 index 00000000..3cac5e8b --- /dev/null +++ b/data_raw/time.csv @@ -0,0 +1,43 @@ +word,assigned_score,coefficient,se,grammar_function +immediately,1,-0.2,0.18,adverb +imminently,1,-0.56,0.18,adverb +first,1,-0.86,0.4,adverb +hastily,1,-2.01,0.62,adverb +now,1,0,NA,adverb 
+tonight,1,-0.38,0.35,adverb
+today,1,0,NA,adverb
+imminent,1,NA,NA,adjective
+hasty,1,NA,NA,adjective
+immediate,1,NA,NA,adjective
+promptly,0.9,-2.08,0.28,adverb
+as soon as possible,0.9,-2.08,0.28,adverbial phrase
+tomorrow,0.9,-2.32,0.32,adverb
+prompt,0.9,NA,NA,adjective
+quickly,0.8,-1.6,0.27,adverb
+rapidly,0.8,-1.63,0.47,adverb
+fast,0.8,-2.11,0.63,adverb
+shortly,0.8,-2.69,0.36,adverb
+speedily,0.8,-2.89,0.48,adverb
+next,0.8,-2.91,0.3,adverb
+this week,0.8,-2.24,0.41,adverbial phrase
+quick,0.8,NA,NA,adjective
+rapid,0.8,NA,NA,adjective
+short,0.8,NA,NA,adjective
+speedy,0.8,NA,NA,adjective
+before,0.7,-0.56,0.46,conjunction
+early,0.7,-2.6,0.27,adverb
+next week,0.7,-3.47,0.39,adverbial phrase
+earlier,0.6,NA,NA,adverb
+this year,0.5,-4.96,0.43,adverbial phrase
+soon(?! as possible),0.5,-3.03,0.25,adverb
+afterwards,0.4,-4.19,0.63,adverb
+after,0.4,-4.36,1.1,conjunction
+long,0.2,-5.88,0.77,adjective
+later,0.2,-6.03,0.53,adverb
+slowly,0.2,-6.25,0.59,adverb
+late,0.2,NA,NA,adverb
+slow,0.2,NA,NA,adjective
+finally,0.1,-4.55,0.56,adverb
+eventually,0.05,-6.5,0.55,adverb
+at some point,0.05,-6.7,0.65,adverbial phrase
+at some stage,0.05,-7.74,1.1,adverbial phrase
\ No newline at end of file
diff --git a/data_raw/urgency_scores_and_scales.R b/data_raw/urgency_scores_and_scales.R
new file mode 100644
index 00000000..81c998eb
--- /dev/null
+++ b/data_raw/urgency_scores_and_scales.R
@@ -0,0 +1,244 @@
+# # This script rescales the data for each dictionary gathered from the "urgency" survey.
+
+# # Note that the coefficients and standard errors in the data refer to the
+# # coefficients generated by the Bradley-Terry models for pairwise comparisons.
+# # The assigned scores are author-assigned scores from before the survey.
+# # These served as controls in the models to account for "expected outcomes" and
+# # improve the coefficient ranks (i.e. odds of winning in pairwise comparison).
+
+# # The first task is to score close synonyms or versions of adjectives/adverbs
+# # in the same way.
+# # This is done since not all versions of the same word were surveyed.
+# # The second task refers to how the scores are rescaled to calculate urgency
+# # and the urgency of priorities.
+# # Scaled variables refer to scaled coefficients.
+# # Scaling is done using 1-coefficient/min(coefficient), scaled from 0.05 to 1.
+# # Rescaled variables refer to coefficients centered around a neutral term
+# # and then scaled.
+# # Rescaling is done using 1+centered coefficient/max(centered coefficient).
+# # This is only done for timing, frequency, and intensity since rescaled scores
+# # are meant to be used to calculate the urgency of priorities only.
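+
+# # A toy illustration of the two transforms on made-up coefficients
+# # (hypothetical values, not survey results):
+# toy <- c(0, -2, -4)
+# scales::rescale(1 - (toy/min(toy)), to = c(0.05, 1)) # scaled: 1.000 0.525 0.050
+# toy_centered <- toy + 2 # center around the middle coefficient
+# 1 + (toy_centered/max(toy_centered)) # rescaled: 2 1 0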
+ +library(readr) +library(dplyr) +library(scales) + +# # Timing dictionary + +# Load original data +timing <- read_csv("data_raw/time.csv") + +# Standardise scores for synonyms/equivalents +timing[which(timing$word=="hasty"),3] <- timing[which(timing$word=="hastily"),3] +timing[which(timing$word=="immediate"),3] <- timing[which(timing$word=="immediately"),3] +timing[which(timing$word=="imminent"),3] <- timing[which(timing$word=="imminently"),3] +timing[which(timing$word=="prompt"),3] <- timing[which(timing$word=="promptly"),3] +timing[which(timing$word=="quick"),3] <- timing[which(timing$word=="quickly"),3] +timing[which(timing$word=="speedy"),3] <- timing[which(timing$word=="speedily"),3] +timing[which(timing$word=="short"),3] <- timing[which(timing$word=="shortly"),3] +timing[which(timing$word=="earlier"),3] <- timing[which(timing$word=="early"),3] +timing[which(timing$word=="rapid"),3] <- timing[which(timing$word=="rapidly"),3] +timing[which(timing$word=="late"),3] <- timing[which(timing$word=="later"),3] +timing[which(timing$word=="slow"),3] <- timing[which(timing$word=="slowly"),3] + +# Rescale coefficients +timing[timing$word=="afterwards",3] # centering word score = 4.19 +timing <- timing %>% + mutate(centered_coefficient = ifelse(grammar_function != "adjective" & grammar_function != "conjunction", + coefficient + 4.19, NA), + scaled = scales::rescale(1-(coefficient/min(timing$coefficient, na.rm = TRUE)), to = c(0.05, 1)), + rescaled = ifelse(grammar_function != "adjective" & grammar_function != "conjunction", + 1+(centered_coefficient/max(timing$centered_coefficient, na.rm = TRUE)), NA)) + +# # Commitment dictionary + +# Get original file +commitment <- read_csv("data_raw/comm.csv") + +# Standardise scores for synonyms/equivalents +commitment[which(commitment$word=="necessarily"),3] <- commitment[which(commitment$word=="it is necessary"),3] +commitment[which(commitment$word=="importantly"),3] <- commitment[which(commitment$word=="it is important"),3] +commitment[which(commitment$word=="obligatory"),3] <- commitment[which(commitment$word=="oblige to"),3] +commitment[which(commitment$word=="require to"),3] <- commitment[which(commitment$word=="need to"),3] +commitment[which(commitment$word=="commit to"),3] <- commitment[which(commitment$word=="promise to"),3] +commitment[which(commitment$word=="intend to"),3] <- commitment[which(commitment$word=="promise to"),3] +commitment[which(commitment$word=="urge to"),3] <- commitment[which(commitment$word=="going to"),3] +commitment[which(commitment$word=="relevantly"),3] <- commitment[which(commitment$word=="it is relevant"),3] +commitment[which(commitment$word=="it is eminent"),3] <- commitment[which(commitment$word=="it is important"),3] +commitment[which(commitment$word=="prominent"),3] <- commitment[which(commitment$word=="it is important"),3] +commitment[which(commitment$word=="prominently"),3] <- commitment[which(commitment$word=="it is important"),3] +commitment[which(commitment$word=="preeminent"),3] <- commitment[which(commitment$word=="it is important"),3] +commitment[which(commitment$word=="pre-eminent"),3] <- commitment[which(commitment$word=="it is important"),3] +commitment[which(commitment$word=="momentous"),3] <- commitment[which(commitment$word=="it is important"),3] +commitment[which(commitment$word=="it is worthy"),3] <- commitment[which(commitment$word=="it is invaluable"),3] +commitment[which(commitment$word=="it is quintessential"),3] <- commitment[which(commitment$word=="it is essential"),3] 
+commitment[which(commitment$word=="it is valuable"),3] <- commitment[which(commitment$word=="it is invaluable"),3] +commitment[which(commitment$word=="unavoidably"),3] <- commitment[which(commitment$word=="it is unavoidable"),3] +commitment[which(commitment$word=="it is inevitable"),3] <- commitment[which(commitment$word=="it is unavoidable"),3] +commitment[which(commitment$word=="inevitably"),3] <- commitment[which(commitment$word=="it is unavoidable"),3] +commitment[which(commitment$word=="inexorable"),3] <- commitment[which(commitment$word=="it is unavoidable"),3] +commitment[which(commitment$word=="inexorably"),3] <- commitment[which(commitment$word=="it is unavoidable"),3] +commitment[which(commitment$word=="let us"),3] <- commitment[which(commitment$word=="let's"),3] +commitment[which(commitment$word=="shall"),3] <- commitment[which(commitment$word=="should"),3] +commitment[which(commitment$word=="plan to"),3] <- commitment[which(commitment$word=="intend to"),3] +commitment[which(commitment$word=="marginal"),3] <- commitment[which(commitment$word=="it is optional"),3] +commitment[which(commitment$word=="tenable"),3] <- commitment[which(commitment$word=="able to"),3] +commitment[which(commitment$word=="it is doable"),3] <- commitment[which(commitment$word=="able to"),3] +commitment[which(commitment$word=="it is plausible"),3] <- commitment[which(commitment$word=="it is possible"),3] +commitment[which(commitment$word=="plausibly"),3] <- commitment[which(commitment$word=="it is possible"),3] +commitment[which(commitment$word=="possibly"),3] <- commitment[which(commitment$word=="it is possible"),3] +commitment[which(commitment$word=="useable"),3] <- commitment[which(commitment$word=="it is useful"),3] +commitment[which(commitment$word=="ideally"),3] <- commitment[which(commitment$word=="it is ideal"),3] +commitment[which(commitment$word=="are careful to"),3] <- commitment[which(commitment$word=="cautious to"),3] +commitment[which(commitment$word=="carefully"),3] <- commitment[which(commitment$word=="cautious to"),3] +commitment[which(commitment$word=="it is obvious"),3] <- commitment[which(commitment$word=="able to"),3] +commitment[which(commitment$word=="vague"),3] <- commitment[which(commitment$word=="are ambiguous to"),3] +commitment[which(commitment$word=="option to"),3] <- commitment[which(commitment$word=="it is optional"),3] +commitment[which(commitment$word=="preferably"),3] <- commitment[which(commitment$word=="it is preferable"),3] +commitment[which(commitment$word=="get to"),3] <- commitment[which(commitment$word=="have to"),3] + +# Rescale coefficients +commitment <- commitment %>% + mutate(coefficient = ifelse(coefficient > 0, 0, coefficient), # Scale between must and may - bug? 
+ scaled = scales::rescale(1-(coefficient/min(commitment$coefficient, na.rm = TRUE)), to = c(0.05, 1)), + rescaled = scales::rescale(ifelse(grammar_function != "adjective", + 1-(coefficient/min(commitment$coefficient, na.rm = TRUE)), + NA), to = c(0.05, 1))) + +# # Intensity dictionary + +# Load original data +intensity <- read_csv("data_raw/int.csv") + +# Standardise synonyms/equivalents +intensity[which(intensity$word=="deeply"),3] <- intensity[which(intensity$word=="extremely"),3] # add scale values for similar words +intensity[which(intensity$word=="high"),3] <- intensity[which(intensity$word=="highly"),3] +intensity[which(intensity$word=="definite"),3] <- intensity[which(intensity$word=="definitely"),3] +intensity[which(intensity$word=="surely"),3] <- intensity[which(intensity$word=="certainly"),3] +intensity[which(intensity$word=="certain"),3] <- intensity[which(intensity$word=="certainly"),3] +intensity[which(intensity$word=="utterly"),3] <- intensity[which(intensity$word=="completely"),3] +intensity[which(intensity$word=="absolutely"),3] <- intensity[which(intensity$word=="completely"),3] +intensity[which(intensity$word=="urgent"),3] <- intensity[which(intensity$word=="urgently"),3] +intensity[which(intensity$word=="vigorously"),3] <- intensity[which(intensity$word=="strongly"),3] +intensity[which(intensity$word=="vigorous"),3] <- intensity[which(intensity$word=="strongly"),3] +intensity[which(intensity$word=="huge"),3] <- intensity[which(intensity$word=="enormous"),3] +intensity[which(intensity$word=="large"),3] <- intensity[which(intensity$word=="enormous"),3] +intensity[which(intensity$word=="largest"),3] <- intensity[which(intensity$word=="enormous"),3] +intensity[which(intensity$word=="big"),3] <- intensity[which(intensity$word=="enormous"),3] +intensity[which(intensity$word=="biggest"),3] <- intensity[which(intensity$word=="enormous"),3] +intensity[which(intensity$word=="global"),3] <- intensity[which(intensity$word=="everywhere"),3] +intensity[which(intensity$word=="globally"),3] <- intensity[which(intensity$word=="everywhere"),3] +intensity[which(intensity$word=="prevalent"),3] <- intensity[which(intensity$word=="pervasive"),3] +intensity[which(intensity$word=="extensively"),3] <- intensity[which(intensity$word=="extensive"),3] +intensity[which(intensity$word=="tremendous"),3] <- intensity[which(intensity$word=="enormous"),3] +intensity[which(intensity$word=="extraordinary"),3] <- intensity[which(intensity$word=="exceptional"),3] +intensity[which(intensity$word=="remarkably"),3] <- intensity[which(intensity$word=="remarkable"),3] +intensity[which(intensity$word=="immense"),3] <- intensity[which(intensity$word=="enormous"),3] +intensity[which(intensity$word=="vast"),3] <- intensity[which(intensity$word=="enormous"),3] +intensity[which(intensity$word=="ultimate"),3] <- intensity[which(intensity$word=="enormous"),3] +intensity[which(intensity$word=="ambitiously"),3] <- intensity[which(intensity$word=="determined"),3] +intensity[which(intensity$word=="ambitious"),3] <- intensity[which(intensity$word=="determined"),3] +intensity[which(intensity$word=="realistically"),3] <- intensity[which(intensity$word=="realistic"),3] +intensity[which(intensity$word=="more"),3] <- intensity[which(intensity$word=="moderately"),3] +intensity[which(intensity$word=="moderate"),3] <- intensity[which(intensity$word=="moderately"),3] +intensity[which(intensity$word=="substantial"),3] <- intensity[which(intensity$word=="substantially"),3] +intensity[which(intensity$word=="lots"),3] <- 
intensity[which(intensity$word=="substantially"),3] +intensity[which(intensity$word=="plenty"),3] <- intensity[which(intensity$word=="substantially"),3] +intensity[which(intensity$word=="much"),3] <- intensity[which(intensity$word=="substantially"),3] +intensity[which(intensity$word=="totally"),3] <- intensity[which(intensity$word=="fully"),3] +intensity[which(intensity$word=="entirely"),3] <- intensity[which(intensity$word=="fully"),3] +intensity[which(intensity$word=="far"),3] <- intensity[which(intensity$word=="substantially"),3] +intensity[which(intensity$word=="clearly"),3] <- intensity[which(intensity$word=="clear"),3] +intensity[which(intensity$word=="least"),3] <- intensity[which(intensity$word=="almost"),3] +intensity[which(intensity$word=="less"),3] <- intensity[which(intensity$word=="almost"),3] +intensity[which(intensity$word=="small"),3] <- intensity[which(intensity$word=="limited"),3] +intensity[which(intensity$word=="marginally"),3] <- intensity[which(intensity$word=="limited"),3] +intensity[which(intensity$word=="minimum"),3] <- intensity[which(intensity$word=="minimal"),3] +intensity[which(intensity$word=="weakly"),3] <- intensity[which(intensity$word=="slightly"),3] +intensity[which(intensity$word=="adequate"),3] <- intensity[which(intensity$word=="adequately"),3] +intensity[which(intensity$word=="sufficient"),3] <- intensity[which(intensity$word=="enough"),3] +intensity[which(intensity$word=="near"),3] <- intensity[which(intensity$word=="nearly"),3] +intensity[which(intensity$word=="some"),3] <- intensity[which(intensity$word=="modest"),3] +intensity[which(intensity$word=="slight"),3] <- intensity[which(intensity$word=="slightly"),3] +intensity[which(intensity$word=="reasonable"),3] <- intensity[which(intensity$word=="reasonably"),3] +intensity[which(intensity$word=="wide"),3] <- intensity[which(intensity$word=="extensive"),3] +intensity[which(intensity$word=="widely"),3] <- intensity[which(intensity$word=="extensive"),3] +intensity[which(intensity$word=="widespread"),3] <- intensity[which(intensity$word=="extensive"),3] +intensity[which(intensity$word=="intensive"),3] <- intensity[which(intensity$word=="intense"),3] +intensity[which(intensity$word=="intensely"),3] <- intensity[which(intensity$word=="intense"),3] + +# Merge commitment adverbs and rescale coefficients +intensity[intensity$word=="simply",3] # centering word score = 0.75 +intensity <- intensity %>% + mutate(centered_coefficient = ifelse(grammar_function == "adverb", + coefficient + 0.75, NA), + scaled = scales::rescale(1-(coefficient/min(intensity$coefficient, na.rm = TRUE)), to = c(0.05, 1)), + rescaled = ifelse(grammar_function == "adverb", # What about quantifiers and determiners? + 1+(centered_coefficient/max(intensity$centered_coefficient, na.rm = TRUE)), NA)) + +# # Frequency dictionary + +# Load original data +frequency <- read_csv("data_raw/freq.csv") + +# Persistently is a big outlier, we can remove it for now ... +#frequency[which(frequency$word=="persistently"),3] <- NA +# # Or take the score from another term ... 
+frequency[which(frequency$word=="persistently"),3] <- + frequency[which(frequency$word=="relentlessly"),3] + +# Standardise scores for synonyms/equivalents +frequency[which(frequency$word=="frequent"),3] <- frequency[which(frequency$word=="frequently"),3] +frequency[which(frequency$word=="constant"),3] <- frequency[which(frequency$word=="constantly"),3] +frequency[which(frequency$word=="by the minute"),3] <- frequency[which(frequency$word=="hourly"),3] +frequency[which(frequency$word=="by the hour"),3] <- frequency[which(frequency$word=="hourly"),3] +frequency[which(frequency$word=="everyday"),3] <- frequency[which(frequency$word=="daily"),3] +frequency[which(frequency$word=="nightly"),3] <- frequency[which(frequency$word=="daily"),3] +frequency[which(frequency$word=="every night"),3] <- frequency[which(frequency$word=="daily"),3] +frequency[which(frequency$word=="persistent"),3] <- frequency[which(frequency$word=="persistently"),3] +frequency[which(frequency$word=="relentless"),3] <- frequency[which(frequency$word=="relentlessly"),3] +frequency[which(frequency$word=="incessant"),3] <- frequency[which(frequency$word=="incessantly"),3] +frequency[which(frequency$word=="unrelentingly"),3] <- frequency[which(frequency$word=="relentlessly"),3] +frequency[which(frequency$word=="interminable"),3] <- frequency[which(frequency$word=="incessantly"),3] +frequency[which(frequency$word=="interminably"),3] <- frequency[which(frequency$word=="incessantly"),3] +frequency[which(frequency$word=="fortnightly"),3] <- frequency[which(frequency$word=="weekly"),3] +frequency[which(frequency$word=="yearly"),3] <- frequency[which(frequency$word=="annually"),3] +frequency[which(frequency$word=="every year"),3] <- frequency[which(frequency$word=="annually"),3] +frequency[which(frequency$word=="every hour"),3] <- frequency[which(frequency$word=="hourly"),3] +frequency[which(frequency$word=="every minute"),3] <- frequency[which(frequency$word=="hourly"),3] +frequency[which(frequency$word=="every week"),3] <- frequency[which(frequency$word=="weekly"),3] +frequency[which(frequency$word=="every month"),3] <- frequency[which(frequency$word=="monthly"),3] +frequency[which(frequency$word=="normally"),3] <- frequency[which(frequency$word=="usually"),3] +frequency[which(frequency$word=="regular"),3] <- frequency[which(frequency$word=="regularly"),3] +frequency[which(frequency$word=="steady"),3] <- frequency[which(frequency$word=="steadily"),3] +frequency[which(frequency$word=="infrequent"),3] <- frequency[which(frequency$word=="infrequently"),3] +frequency[which(frequency$word=="progressive"),3] <- frequency[which(frequency$word=="progressively"),3] +frequency[which(frequency$word=="gradual"),3] <- frequency[which(frequency$word=="gradually"),3] +frequency[which(frequency$word=="occasional"),3] <- frequency[which(frequency$word=="infrequently"),3] +frequency[which(frequency$word=="occasionally"),3] <- frequency[which(frequency$word=="infrequently"),3] +frequency[which(frequency$word=="sporadic"),3] <- frequency[which(frequency$word=="sporadically"),3] +frequency[which(frequency$word=="irregular"),3] <- frequency[which(frequency$word=="irregularly"),3] +frequency[which(frequency$word=="intermittent"),3] <- frequency[which(frequency$word=="intermittently"),3] +frequency[which(frequency$word=="rare"),3] <- frequency[which(frequency$word=="rarely"),3] + +# Rescale coefficients +(unlist(frequency[frequency$word=="gradual",3]) + # between gradual and usually + unlist(frequency[frequency$word=="usually",3]))/2 # centering word 
score = 4.51
+frequency <- frequency %>%
+  mutate(centered_coefficient = ifelse(grammar_function != "adjective",
+                                       coefficient + 4.51, NA),
+         scaled = scales::rescale(1-(coefficient/min(frequency$coefficient, na.rm = TRUE)), to = c(0.05, 1)),
+         rescaled = ifelse(grammar_function != "adjective",
+                           1+(centered_coefficient/max(frequency$centered_coefficient, na.rm = TRUE)), NA))
+
+# Update names to avoid overlap
+Commitment <- commitment
+Frequency <- frequency
+Intensity <- intensity
+Timing <- timing
+
+# # Save the data as internal package data.
+# # Note that CAP_topics is another piece of internal data saved in the
+# # package that should be (re)saved as well.
+usethis::use_data(CAP_topics, Commitment, Frequency, Intensity, Timing,
+                  internal = TRUE, overwrite = TRUE)
diff --git a/docs/404.html b/docs/404.html
index f410e97b..99bc900c 100644
--- a/docs/404.html
+++ b/docs/404.html
@@ -24,7 +24,7 @@
       poldis
-        0.1.0
+        0.1.1