From 71cc0551e2d5219c98999d18704699a1d7ac6d49 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Thu, 22 Aug 2024 17:39:07 +0900 Subject: [PATCH 1/3] Drop docvars from input object --- R/seededlda.R | 1 + tests/testthat/test-internal.R | 18 ++++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/R/seededlda.R b/R/seededlda.R index 56414b1..f5a509a 100644 --- a/R/seededlda.R +++ b/R/seededlda.R @@ -189,6 +189,7 @@ tfm <- function(x, dictionary, levels = 1, if (!quanteda::is.dictionary(dictionary)) stop("dictionary must be a dictionary object", call. = FALSE) + docvars(x) <- NULL # sanitize dfm dict <- flatten_dictionary(dictionary, levels) key <- names(dict) feat <- featnames(x) diff --git a/tests/testthat/test-internal.R b/tests/testthat/test-internal.R index 0b8f766..882e3d4 100644 --- a/tests/testthat/test-internal.R +++ b/tests/testthat/test-internal.R @@ -116,6 +116,24 @@ test_that("tfm works with ngrams", { c("un" = 2, "icc" = 2, "other" = 0)) }) +test_that("tfm works with dfm with x in docvars (#87)", { + + dict <- dictionary(list("A" = "a", "B" = "b")) + dat <- data.frame(text = c("a b c", "A B C"), + x = c(1, 2)) + corp <- corpus(dat) + toks <- tokens(corp) + dfmt <- dfm(toks) + + expect_equal( + as.matrix(seededlda:::tfm(dfmt, dict, residual = 1)), + matrix(c(2, 0, 0, 0, 2, 0, 0, 0, 0), nrow = 3, + dimnames = list(c("A", "B", "other"), c("a", "b", "c"))) + ) + +}) + + test_that("levels is working", { dict <- dictionary(list(A = list( From 1f8e773fec8776f91b5dde1e53320508038cc8e5 Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Mon, 26 Aug 2024 14:58:39 +0900 Subject: [PATCH 2/3] Save adjust_alpha in the object --- R/lda.R | 3 ++- man/textmodel_lda.Rd | 3 ++- src/lda.cpp | 1 + tests/testthat/test-textmodel_lda.R | 4 +++- tests/testthat/test-textmodel_seededlda.R | 5 +++-- 5 files changed, 11 insertions(+), 5 deletions(-) diff --git a/R/lda.R b/R/lda.R index 024978f..0d2fb17 100644 --- a/R/lda.R +++ b/R/lda.R @@ -54,7 +54,8 @@ #' \item{k}{the
number of topics.} #' \item{last_iter}{the number of iterations in Gibbs sampling.} #' \item{max_iter}{the maximum number of iterations in Gibbs sampling.} -#' \item{auto_iter}{`auto_iter` is used if `TRUE`.} +#' \item{auto_iter}{the use of `auto_iter`.} +#' \item{adjust_alpha}{the value of `adjust_alpha`.} #' \item{alpha}{the smoothing parameter for `theta`.} #' \item{beta}{the smoothing parameter for `phi`.} #' \item{epsilon}{the amount of adjustment for `adjust_alpha`.} diff --git a/man/textmodel_lda.Rd b/man/textmodel_lda.Rd index 50976e1..237f80a 100644 --- a/man/textmodel_lda.Rd +++ b/man/textmodel_lda.Rd @@ -59,7 +59,8 @@ Returns a list of model parameters: \item{k}{the number of topics.} \item{last_iter}{the number of iterations in Gibbs sampling.} \item{max_iter}{the maximum number of iterations in Gibbs sampling.} -\item{auto_iter}{\code{auto_iter} is used if \code{TRUE}.} +\item{auto_iter}{the use of \code{auto_iter}.} +\item{adjust_alpha}{the value of \code{adjust_alpha}.} \item{alpha}{the smoothing parameter for \code{theta}.} \item{beta}{the smoothing parameter for \code{phi}.} \item{epsilon}{the amount of adjustment for \code{adjust_alpha}.} diff --git a/src/lda.cpp b/src/lda.cpp index 254c314..d50d85d 100644 --- a/src/lda.cpp +++ b/src/lda.cpp @@ -38,6 +38,7 @@ List cpp_lda(arma::sp_mat &mt, int k, int max_iter, double min_delta, Rcpp::Named("max_iter") = lda.max_iter, Rcpp::Named("last_iter") = lda.iter, Rcpp::Named("auto_iter") = (lda.min_delta == 0), + Rcpp::Named("adjust_alpha") = lda.adjust, Rcpp::Named("alpha") = as(wrap(lda.alpha)), Rcpp::Named("beta") = as(wrap(lda.beta)), Rcpp::Named("epsilon") = as(wrap(lda.epsilon)), diff --git a/tests/testthat/test-textmodel_lda.R b/tests/testthat/test-textmodel_lda.R index 4e938ff..42775c0 100644 --- a/tests/testthat/test-textmodel_lda.R +++ b/tests/testthat/test-textmodel_lda.R @@ -72,7 +72,8 @@ test_that("LDA is working", { ) expect_equal( names(lda), - c("k", "max_iter", "last_iter", "auto_iter",
"alpha", "beta", "epsilon", "gamma", + c("k", "max_iter", "last_iter", "auto_iter", "adjust_alpha", + "alpha", "beta", "epsilon", "gamma", "phi", "theta", "words", "data", "batch_size", "call", "version") ) expect_equal(lda$last_iter, 200) @@ -120,6 +121,7 @@ test_that("adjust_alpha works", { set.seed(1234) lda <- textmodel_lda(dfmt, max_iter = 200, adjust_alpha = 0.5) + expect_equal(lda$adjust_alpha, 0.5) expect_true(all(lda$alpha != 0.5)) expect_true(all(lda$alpha > 0.25)) expect_true(all(lda$epsilon > 0)) diff --git a/tests/testthat/test-textmodel_seededlda.R b/tests/testthat/test-textmodel_seededlda.R index aa36486..3b4b685 100644 --- a/tests/testthat/test-textmodel_seededlda.R +++ b/tests/testthat/test-textmodel_seededlda.R @@ -104,8 +104,9 @@ test_that("seeded LDA is working", { ) expect_equal( names(lda), - c("k", "max_iter", "last_iter", "auto_iter", "alpha", "beta", "epsilon", "gamma", - "phi", "theta", "words", "data", "batch_size", "call", "version", + c("k", "max_iter", "last_iter", "auto_iter", "adjust_alpha", + "alpha", "beta", "epsilon", "gamma", "phi", "theta", + "words", "data", "batch_size", "call", "version", "dictionary", "valuetype", "case_insensitive", "seeds", "residual", "weight") ) From f212d75a5405dd49e8a2d566d624c3ae05e21e2e Mon Sep 17 00:00:00 2001 From: Kohei Watanabe Date: Wed, 4 Sep 2024 19:58:43 +0900 Subject: [PATCH 3/3] Include RcppArmadillo --- src/lda.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lda.cpp b/src/lda.cpp index d50d85d..7a3b080 100644 --- a/src/lda.cpp +++ b/src/lda.cpp @@ -1,3 +1,4 @@ +#include #include "lib.h" #include "dev.h" #include "lda.h"