Folder lda consists of the files to run lda experiments
-
utils.R which includes functions for testing lda topics
-
main.R which includes methods for creating and testing lda models from different libraries
Folder data is there to store your data.
Folder results stores the results of your experiment
Data
data_dir <- "./data/depression_anxiety_cleaned.csv"
data_col <- "dep_all_words"
id_col <- "unique_id"
group_col <- "minidep_diagnose" # now necessary, but only used for t-test
Document Term Matrix
ngram_window <- c(1,3)
stopwords <- stopwords::stopwords("en", source = "snowball")
removalword <- "" # just possible with one word
occ_rate <- 0
removal_num_most <- 1
removal_num_least <- 0
removal_mode <- "absolute" # "relative"
split <- 1
LDA
model_type <- "textmineR" # or "mallet"
num_topics <- 20
num_top_words <- 10
num_train_iterations <- 2000
num_pred_iterations <- 200
pred_mode <- "function" # or "custom" for mallet
Analysis
cor_var <- "PHQtot" # grouping variable for t-test, to be predicted variable for other
control_vars <- c("PHQtot") # vector of variables to control analysis with if test_method is linear_regression
test_method <- "textTrain_regression" # linear_regression, logistic_regression, t-test
Miscellaneous
seed <- 1234
All objects created within the pipeline are created in the directory below. These include
-
Document Term Matrix
-
model
-
predictions
-
analysis results
save_dir <- paste0("./results/",
model_type,"_",
data_col, "_",
num_topics,
"_most_",removal_num_most,
"_least_", removal_num_least,
"_occ_", occ_rate,
"_pred_", mode)
library(textmineR)
library(tidyverse)
library(dplyr)
library(textmineR)
library(mallet)
library(rJava)
library(tokenizers)
library(text2vec)
library(quanteda)
source("./lda/main.R")
dtms <- get_dtm(data_dir = data_dir,
id_col = id_col,
data_col = data_col,
group_var = group_col,
ngram_window = ngram_window,
stopwords = stopwords,
removalword = removalword,
occ_rate = occ_rate,
removal_mode = removal_mode,
removal_rate_most = removal_num_most,
removal_rate_least = removal_num_least,
split=split,
seed=seed,
save_dir=save_dir)
model <- get_lda_model(model_type=model_type,
dtm=dtms$train_dtm,
num_topics=num_topics,
num_top_words=num_top_words,
num_iterations = num_train_iterations,
seed=seed,
save_dir=save_dir)
preds <- get_lda_preds(model = model,
num_iterations=num_pred_iterations,
data = dtms$train_data,
dtm = dtms$train_dtm,
group_var = c(cor_var),
seed=seed,
mode=mode,
save_dir = save_dir)
test <- get_lda_test(model=model,
preds=preds,
group_var = cor_var,
control_vars = control_vars,
test_method = "textTrain_regression",
seed=seed,
save_dir=save_dir)
test <- get_lda_test(model=model,
preds=preds,
group_var = cor_var,
control_vars = control_vars,
test_method = "linear_regression",
seed=seed,
save_dir=save_dir)