From df976e14db1eb31985483bd7a8a681bc2473c9af Mon Sep 17 00:00:00 2001 From: Henrique Sposito Date: Fri, 23 Aug 2024 18:09:06 +0200 Subject: [PATCH] More fixes to sentence splitting when annotating texts --- R/text_tools.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/text_tools.R b/R/text_tools.R index d8bec43..5c09aff 100644 --- a/R/text_tools.R +++ b/R/text_tools.R @@ -328,7 +328,7 @@ read_pdf <- function(path) { annotate_text <- function(v, level = "words") { doc_id <- sentence_id <- token_id <- token <- pos <- tag <- lemma <- entity <- NULL suppressWarnings(spacyr::spacy_initialize(model = "en_core_web_sm")) - v <- stringr::str_replace_all(v, "\\.\\,|\\. \\,|\\,\\.|\\, \\.", ".") + v <- stringr::str_replace_all(v, "\\.\\,|\\. \\,|\\,\\.|\\, \\.|\\.\\\n\\,", ".") parse <- spacyr::spacy_parse(v, tag = TRUE) suppressWarnings(spacyr::spacy_finalize()) if (level == "sentences" | level == "sentence") {