-
Notifications
You must be signed in to change notification settings - Fork 1
/
buildModel.R
34 lines (26 loc) · 1.28 KB
/
buildModel.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
#This script builds LDA models for different numbers of topics.
#The models have already been built and saved as R objects in files named "20 topics", "25 topics", etc.
#No need to use this script unless you want to rebuild them.
source('dependencies.R')
source('helpers.R')
# Build and save one LDA topic model per topic count in `ks`.
#
# Pipeline: load articles -> build a tm corpus -> normalise text (lower-case,
# strip punctuation, drop stop words, stem) -> document-term matrix -> prune
# sparse terms -> fit LDA for each k -> save each model to "<k> topics".

# readData() is defined outside this script (presumably in helpers.R -- confirm).
articles <- readData()

# Extra tokens to remove on top of the standard stop-word lists: residue of
# HTML quote entities (rsquo/ldquo) fused into words, plus a few uninformative
# words and apostrophe-less contractions.
trash <- c(
  "rsquo10", "rsquo11", "ldquoi", "ldquoth", "rsquo12", "ldquow", "rsquo13",
  "ldquoit", "collegersquo", "donrsquot", "ldquoitrsquo", "yearrsquo",
  "itrsquo", "irsquom", "yoursquor", "thatrsquo", "didnrsquot", "canrsquot",
  "doesnrsquot", "irsquov", "wersquor", "therersquo", "theyrsquor", "dont",
  "people", "your", "ive", "thing", "lot", "didnt"
)

# Create the corpus: the reader maps the "content" and "id" columns of
# `articles` onto each document's content and id.
myReader <- readTabular(mapping = list(content = "content", id = "id"))
myCorpus <- VCorpus(
  DataframeSource(articles),
  readerControl = list(reader = myReader)
)

# Text normalisation. The order of these steps is kept exactly as it was, since
# changing it would change the resulting vocabulary and therefore the models.
# NOTE(review): punctuation is stripped before stop words are removed, so
# stop-word entries containing an apostrophe can no longer match; `trash`
# covers some of the apostrophe-less forms ("dont", "didnt", "ive").
myCorpus <- tm_map(myCorpus, content_transformer(tolower))
myCorpus <- tm_map(myCorpus, removePunctuation)
myCorpus <- tm_map(
  myCorpus, removeWords,
  c(stopwords("english"), stopwords("SMART"), trash)
)
myCorpus <- tm_map(myCorpus, stemDocument, lazy = TRUE)

# Document-term matrix, dropping terms whose sparsity exceeds 0.99.
dtm <- DocumentTermMatrix(myCorpus)
dtm <- removeSparseTerms(dtm, 0.99)

# LDA() rejects a matrix that contains an all-zero row, and pruning sparse
# terms can leave a document with no terms at all. Keep only the rows that
# still have at least one non-zero entry (`i` and `v` are the row indices and
# values of the matrix's triplet representation) and report how many
# documents were dropped. When no document is empty, `dtm` is left untouched.
non_empty <- sort(unique(dtm$i[dtm$v > 0]))
if (length(non_empty) < nrow(dtm)) {
  message(
    "Dropping ", nrow(dtm) - length(non_empty),
    " document(s) with no remaining terms before fitting LDA."
  )
  dtm <- dtm[non_empty, ]
}

# The same seed is passed to every fit, so it is set once, outside the loop.
SEED <- 47
ks <- c(20, 25, 30, 35, 40, 45, 50)

for (k in ks) {
  topic.model <- LDA(dtm, k, control = list(seed = SEED))
  # Saved under the object name `topic.model` in a file called "<k> topics"
  # (e.g. "25 topics"); both names are what load() callers will see.
  save(topic.model, file = paste(k, "topics"))
}