diff --git a/NAMESPACE b/NAMESPACE index 5b5eac80..533845b0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -37,6 +37,7 @@ importFrom(stringr,str_detect) importFrom(stringr,str_extract) importFrom(stringr,str_extract_all) importFrom(stringr,str_remove_all) +importFrom(stringr,str_replace_all) importFrom(stringr,str_squish) importFrom(textstem,lemmatize_strings) importFrom(tidyr,unite) diff --git a/R/text_tools.R b/R/text_tools.R index 5867f8e0..d8bec433 100644 --- a/R/text_tools.R +++ b/R/text_tools.R @@ -319,7 +319,7 @@ read_pdf <- function(path) { #' Defaults to "words". #' @import spacyr #' @importFrom dplyr group_by summarise ungroup %>% -#' @importFrom stringr str_squish +#' @importFrom stringr str_squish str_replace_all #' @return A data frame with syntax information by words or sentences in text. #' @examples #' #annotate_text(US_News_Conferences_1960_1980[1:2, 3]) @@ -328,6 +328,7 @@ read_pdf <- function(path) { annotate_text <- function(v, level = "words") { doc_id <- sentence_id <- token_id <- token <- pos <- tag <- lemma <- entity <- NULL suppressWarnings(spacyr::spacy_initialize(model = "en_core_web_sm")) + v <- stringr::str_replace_all(v, "\\.\\,|\\. \\,|\\,\\.|\\, \\.", ".") parse <- spacyr::spacy_parse(v, tag = TRUE) suppressWarnings(spacyr::spacy_finalize()) if (level == "sentences" | level == "sentence") {