-
Notifications
You must be signed in to change notification settings - Fork 0
/
Clustering.R
67 lines (57 loc) · 2.49 KB
/
Clustering.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# ---- Setup: packages, input data, shared constants ----
# Packages are attached in this fixed order (readxl, dplyr, NLP, tm,
# SnowballC) so that any name masking between them resolves predictably.
library(readxl)
library(dplyr)
library(NLP)
library(tm)
library(SnowballC)

# Load the tweet data set.
# NOTE(review): absolute path to a local drive -- adjust before running on
# another machine.
Dataset <- read_excel("D:/sem4/A-SIN/PROJECT/Dataset.xlsx")

# View() opens a spreadsheet-style viewer; only do that when someone is
# actually at the console so the script can also run via Rscript / batch.
if (interactive()) {
  View(Dataset)
}
data <- Dataset

# Split the rows by the value of the `airline_sentiment` column.
positive <- subset(data, airline_sentiment == "positive")
negative <- subset(data, airline_sentiment == "negative")

# Extra filler words removed by analyseText() on top of tm's English stop
# words. Contractions are spelled without apostrophes ("cant", "dont", "ive")
# because analyseText() strips punctuation before it removes words.
wordsToRemove <- c(
  "get", "cant", "can", "now", "just", "will", "dont", "ive", "got", "much"
)
# Build a term-document matrix of the most frequent terms in a set of texts.
#
# Args:
#   text_to_analyse: character vector, one document per element.
#   words_to_remove: extra words to drop in addition to tm's English stop
#     words. Defaults to the script-level `wordsToRemove` vector.
#   sparsity: threshold handed to removeSparseTerms(); terms that are absent
#     from too large a share of the documents (above this value) are dropped.
#   top_n: maximum number of terms to keep, ranked by total count.
#
# Returns:
#   A tm TermDocumentMatrix (terms in rows, documents in columns) restricted
#   to at most `top_n` terms, ordered from least to most frequent.
analyseText <- function(text_to_analyse,
                        words_to_remove = wordsToRemove,
                        sparsity = 0.97,
                        top_n = 50) {
  # sub = "" drops bytes that cannot be converted instead of turning the
  # whole element into NA.
  text_utf8 <- iconv(text_to_analyse, to = "utf-8", sub = "")

  corpus <- Corpus(VectorSource(text_utf8))
  # tolower is a plain base function, not a tm transformation; wrapping it in
  # content_transformer() keeps the corpus structure intact.
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeNumbers)
  # Punctuation goes first so that "can't" becomes "cant" and can be matched
  # by the apostrophe-free entries in `words_to_remove`.
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords, words_to_remove)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))

  tdm <- TermDocumentMatrix(corpus)
  tdm <- removeSparseTerms(tdm, sparsity)

  # Rank terms by their total count over all documents and keep the top_n.
  term_totals <- sort(rowSums(as.matrix(tdm)))
  top_terms <- names(tail(term_totals, top_n))
  tdm[top_terms, ]
}
# ---- Hierarchical clustering of the most frequent terms per sentiment ----
words_neg <- analyseText(negative$text)
words_pos <- analyseText(positive$text)

# Cluster the terms of a term-document matrix and draw two dendrograms:
# a styled one with a custom height axis, then a plain one with the `k`
# clusters outlined in red.
#
# Args:
#   term_matrix: term-document matrix (terms in rows), e.g. the result of
#     analyseText().
#   title: main title used for both plots.
#   k: number of clusters to outline on the plain dendrogram.
#
# Returns:
#   The hclust fit, invisibly; NULL (with a warning) if there are fewer than
#   two terms to cluster.
plot_sentiment_clusters <- function(term_matrix, title, k = 4) {
  # dist() measures distances between ROWS. The matrix has one row per term,
  # so it is used as-is; transposing it would cluster documents instead.
  term_counts <- as.matrix(term_matrix)
  if (nrow(term_counts) < 2) {
    warning("Need at least two terms to cluster: ", title, call. = FALSE)
    return(invisible(NULL))
  }
  distances <- dist(term_counts, method = "euclidean")
  fit <- hclust(d = distances, method = "ward.D")

  # Both plots use the tinted background; restore the caller's graphics
  # settings when the function exits.
  op <- par(bg = "#DDE3CA")
  on.exit(par(op), add = TRUE)

  # fancy plot
  plot(fit,
    col = "#487AA1", col.main = "#45ADA8", col.lab = "#7C8071",
    main = title, xlab = "", col.axis = "#F38630",
    lwd = 3, lty = 3, sub = "", hang = -1, axes = FALSE
  )
  # Tick marks and their labels share one set of positions derived from the
  # actual merge heights, so they always line up.
  ticks <- pretty(c(0, max(fit$height)))
  # add axis
  axis(side = 2, at = ticks, col = "#F38630", labels = FALSE, lwd = 2)
  # add text in margin
  mtext(ticks, side = 2, at = ticks, line = 1, col = "#A38630", las = 2)

  # plain plot with cluster rectangles; plot() opens its own page, so no
  # plot.new() is needed beforehand.
  plot(fit, hang = -1, main = title, xlab = "")
  # rect.hclust() only accepts k between 2 and the number of merges.
  max_k <- length(fit$height)
  if (max_k >= 2) {
    rect.hclust(fit, k = min(k, max_k), border = "red")
  }

  invisible(fit)
}

# negative sentiment tweets
fit_neg <- plot_sentiment_clusters(words_neg, "Negative Sentiment")
# positive sentiment tweets
fit_pos <- plot_sentiment_clusters(words_pos, "Positive Sentiment")