From a1c75412d0d5a8bbc5d2f51b7cf857dcdcdf75ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20=C5=A0kvr=C5=88=C3=A1k?= Date: Wed, 18 Oct 2023 09:57:17 +0200 Subject: [PATCH] Enable downloading Czech model (#3) * Enable downloading Czech model from UFAL --- DESCRIPTION | 4 ++-- NEWS.md | 4 ++++ R/nametagger.R | 38 +++++++++++++++++++++++--------- man/nametagger.Rd | 3 +-- man/nametagger_download_model.Rd | 9 +++++--- 5 files changed, 40 insertions(+), 18 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 163e203..0136809 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: nametagger Type: Package Title: Named Entity Recognition in Texts using 'NameTag' -Version: 0.1.3 +Version: 0.1.4 Authors@R: c( person('Jan', 'Wijffels', role = c('aut', 'cre', 'cph'), email = 'jwijffels@bnosac.be'), person('BNOSAC', role = 'cph'), @@ -15,7 +15,7 @@ URL: https://github.com/bnosac/nametagger License: MPL-2.0 Encoding: UTF-8 LazyData: true -RoxygenNote: 7.1.2 +RoxygenNote: 7.2.3 Depends: R (>= 2.10) Imports: Rcpp (>= 0.11.5), utils Suggests: udpipe (>= 0.2) diff --git a/NEWS.md b/NEWS.md index fbf3e72..8c4e6a7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +## CHANGES IN nametagger VERSION 0.1.4 + +- nametagger_download_model now allows downloading a model for Czech: czech-cnec-140304 + ## CHANGES IN nametagger VERSION 0.1.3 - Add explicit initialization to silence false positive valgrind report in compressor_save.cpp diff --git a/R/nametagger.R b/R/nametagger.R index 634905d..f2dc37e 100644 --- a/R/nametagger.R +++ b/R/nametagger.R @@ -496,27 +496,43 @@ print.nametagger_options <- function(x, ...){ } - #' @title Download a Nametag model #' @description Download a Nametag model. Note that models have licence CC-BY-SA-NC. #' More details at \url{https://ufal.mff.cuni.cz/nametag/1}. 
-#' @param language 'english-conll-140408' +#' @param language Language model to download, 'english-conll-140408' (default) or 'czech-cnec-140304' #' @param model_dir a path where the model will be downloaded to. #' @return an object of class nametagger -#' @references \url{https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3118} +#' @references +#' \url{http://ufal.mff.cuni.cz/nametag/users-manual} +#' \url{https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3118} +#' \url{https://lindat.mff.cuni.cz/repository/xmlui/handle/11858/00-097C-0000-0023-7D42-8} #' @export #' @examples #' \donttest{ #' model <- nametagger_download_model("english-conll-140408", model_dir = tempdir()) +#' model <- nametagger_download_model("czech-cnec-140304", model_dir = tempdir()) #' } -nametagger_download_model <- function(language = c("english-conll-140408"), model_dir = tempdir()){ +nametagger_download_model <- function(language = c("english-conll-140408", "czech-cnec-140304"), model_dir = tempdir()){ + language <- match.arg(language) - f <- file.path(tempdir(), "english-conll-140408.zip") - download.file(url = "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3118/english-conll-140408.zip?sequence=1&isAllowed=y", - destfile = f, mode = "wb") - f <- utils::unzip(f, exdir = tempdir(), files = "english-conll-140408/english-conll-140408.ner") - from <- file.path(tempdir(), "english-conll-140408/english-conll-140408.ner") - to <- file.path(model_dir, "english-conll-140408.ner") + + f <- file.path(tempdir(), paste(language, ".zip", sep = "")) + switch (language, + "english-conll-140408" = { + url <- "https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-3118/english-conll-140408.zip?sequence=1&isAllowed=y" + download.file(url = url, destfile = f, mode = "wb") + ner_file_path <- "english-conll-140408/english-conll-140408.ner" + }, + "czech-cnec-140304" = { + url <- 
"https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11858/00-097C-0000-0023-7D42-8/czech-cnec-140304.zip?sequence=1&isAllowed=y" + download.file(url = url, destfile = f, mode = "wb") + ner_file_path <- "czech-cnec-140304/czech-cnec2.0-140304.ner" + } + ) + + f <- utils::unzip(f, exdir = tempdir(), files = ner_file_path) + from <- file.path(tempdir(), ner_file_path) + to <- file.path(model_dir, paste(language, ".ner", sep = "")) file.copy(from, to = to, overwrite = TRUE) nametagger_load_model(to) -} \ No newline at end of file +} diff --git a/man/nametagger.Rd b/man/nametagger.Rd index decacf2..99f5e33 100644 --- a/man/nametagger.Rd +++ b/man/nametagger.Rd @@ -15,8 +15,7 @@ nametagger( control = nametagger_options(token = list(window = 2)), type = if (inherits(control, "nametagger_options")) control$type else "generic", tagger = if (inherits(control, "nametagger_options")) control$tagger else "trivial", - file = if (inherits(control, "nametagger_options")) control$file else - "nametagger.ner" + file = if (inherits(control, "nametagger_options")) control$file else "nametagger.ner" ) } \arguments{ diff --git a/man/nametagger_download_model.Rd b/man/nametagger_download_model.Rd index 8aae197..11a1171 100644 --- a/man/nametagger_download_model.Rd +++ b/man/nametagger_download_model.Rd @@ -5,12 +5,12 @@ \title{Download a Nametag model} \usage{ nametagger_download_model( - language = c("english-conll-140408"), + language = c("english-conll-140408", "czech-cnec-140304"), model_dir = tempdir() ) } \arguments{ -\item{language}{'english-conll-140408'} +\item{language}{Language model to download, 'english-conll-140408' (default) or 'czech-cnec-140304'} \item{model_dir}{a path where the model will be downloaded to.} } @@ -24,8 +24,11 @@ More details at \url{https://ufal.mff.cuni.cz/nametag/1}. 
\examples{ \donttest{ model <- nametagger_download_model("english-conll-140408", model_dir = tempdir()) +model <- nametagger_download_model("czech-cnec-140304", model_dir = tempdir()) } } \references{ -\url{https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3118} +\url{http://ufal.mff.cuni.cz/nametag/users-manual} + \url{https://lindat.mff.cuni.cz/repository/xmlui/handle/11234/1-3118} + \url{https://lindat.mff.cuni.cz/repository/xmlui/handle/11858/00-097C-0000-0023-7D42-8} }