Skip to content

OscarKjell/ml_word_visualisations

 
 

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

30 Commits
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

ml_word_visualisations

Structure of repository

Folder lda consists of the files to run lda experiments

  1. utils.R which includes functions for testing lda topics

  2. main.R which includes methods for creating and testing lda models from different libraries

Folder data is there to store your data.

Folder results stores the results of your experiment

Example

LDA

0. Preparation

0.1 Set (hyper)parameters

Data

data_dir <- "./data/depression_anxiety_cleaned.csv"
data_col <- "dep_all_words"
id_col <- "unique_id"
group_col <- "minidep_diagnose" # now necessary, but only used for t-test

Document Term Matrix

ngram_window <- c(1,3)
stopwords <- stopwords::stopwords("en", source = "snowball")
removalword <- "" # just possible with one word
occ_rate <- 0
removal_num_most <- 1
removal_num_least <- 0
removal_mode <- "absolute" # "relative"
split <- 1

LDA

model_type <- "textmineR" # or "mallet"
num_topics <- 20
num_top_words <- 10
num_train_iterations <- 2000
num_pred_iterations <- 200
pred_mode <- "function" # or "custom" for mallet

Analysis

cor_var <- "PHQtot" # grouping variable for t-test, to be predicted variable for other
control_vars <- c("PHQtot") # vector of variables to control analysis with if test_method is linear_regression
test_method <- "textTrain_regression" # linear_regression, logistic_regression, t-test

Miscellaneous

seed <- 1234
0.2 Create directory to save all computations

All objects created within the pipeline are created in the directory below. These include

  • Document Term Matrix

  • model

  • predictions

  • analysis results

save_dir <- paste0("./results/",
            model_type,"_",
            data_col, "_",
            num_topics, 
            "_most_",removal_num_most, 
            "_least_", removal_num_least, 
            "_occ_", occ_rate, 
            "_pred_", mode)
0.3 Imports
library(textmineR)
library(tidyverse)
library(dplyr)
library(textmineR)
library(mallet)
library(rJava)
library(tokenizers)
library(text2vec)
library(quanteda)
source("./lda/main.R")

1. Compute Document Term Matrix

dtms <- get_dtm(data_dir = data_dir,
                id_col = id_col,
                data_col = data_col,
                group_var = group_col,
                ngram_window = ngram_window,
                stopwords = stopwords,
                removalword = removalword,
                occ_rate = occ_rate,
                removal_mode = removal_mode,
                removal_rate_most = removal_num_most,
                removal_rate_least = removal_num_least,
                split=split,
                seed=seed,
                save_dir=save_dir)

2. Create LDA Model

model <- get_lda_model(model_type=model_type,
                        dtm=dtms$train_dtm,
                        num_topics=num_topics,
                        num_top_words=num_top_words,
                        num_iterations = num_train_iterations,
                        seed=seed,
                        save_dir=save_dir)

3. Create Predictions

preds <- get_lda_preds(model = model,
                        num_iterations=num_pred_iterations,
                        data = dtms$train_data,
                        dtm = dtms$train_dtm,
                        group_var = c(cor_var),
                        seed=seed,
                        mode=mode,
                        save_dir = save_dir)

4. Analysis

4.1 textTrain_regression
test <- get_lda_test(model=model,
                    preds=preds,
                    group_var = cor_var,
                    control_vars = control_vars,
                    test_method = "textTrain_regression",
                    seed=seed,
                    save_dir=save_dir)
4.2 Linear Regression
test <- get_lda_test(model=model,
                    preds=preds,
                    group_var = cor_var,
                    control_vars = control_vars,
                    test_method = "linear_regression",
                    seed=seed,
                    save_dir=save_dir)

About

No description, website, or topics provided.

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published

Languages

  • R 100.0%