-
Notifications
You must be signed in to change notification settings - Fork 0
/
analyzerProcessamento.R
62 lines (43 loc) · 1.74 KB
/
analyzerProcessamento.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
processarTexto <- function(textLoaded) {
  # Prepare Portuguese text for sentiment analysis: transliterate to ASCII,
  # strip URLs / e-mails / Twitter artifacts, lowercase, drop numbers,
  # punctuation and stopwords, then publish the term-document matrix and
  # helper objects as globals (dtm, m, v, dataFrame, op30, sent,
  # textPrepared) — downstream code reads these, so the global interface
  # is kept for backward compatibility.
  #
  # Args:
  #   textLoaded: character vector holding the raw UTF-8 text.

  # Transliterate accented characters to plain ASCII.
  textLoaded <- iconv(textLoaded, from = "UTF-8", to = "ASCII//TRANSLIT")
  # Build a corpus from the text.
  docs <- Corpus(VectorSource(textLoaded))

  # Transformer replacing a regex pattern with a single space.
  toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

  # Pattern removers, wrapped in content_transformer so tm_map preserves
  # the corpus structure (plain functions degrade the corpus in tm >= 0.6).
  # The original character classes contained stray spaces ("[a - zA - Z...]")
  # which broke the intended ranges; fixed here.
  urlPat <- content_transformer(function(x) {
    gsub("(ftp|http)(s?)://\\S+", "", x)
  })
  emlPat <- content_transformer(function(x) {
    gsub("\\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\\.[A-Za-z]{2,}\\b", "", x)
  })
  tun <- content_transformer(function(x) {
    # Twitter handles: @ followed by up to 15 word characters.
    gsub("@[a-zA-Z0-9_]{1,15}", "", x)
  })
  tt <- content_transformer(function(x) {
    # Word-bounded so "via" is not stripped out of words like "viagem".
    gsub("\\bRT\\b|\\bvia\\b", "", x)
  })

  # Remove URLs, e-mails and Twitter artifacts FIRST: the original ran
  # these after removePunctuation/tolower, so "http://" and "RT" could
  # never match and the steps were dead code.
  docs <- tm_map(docs, urlPat)
  docs <- tm_map(docs, emlPat)
  docs <- tm_map(docs, tt)
  docs <- tm_map(docs, tun)

  # General clean-up of remaining special characters.
  docs <- tm_map(docs, toSpace, "/")
  docs <- tm_map(docs, toSpace, "@")
  docs <- tm_map(docs, toSpace, "\\|")
  docs <- tm_map(docs, content_transformer(tolower))
  docs <- tm_map(docs, removeNumbers)
  docs <- tm_map(docs, removePunctuation)
  docs <- tm_map(docs, stripWhitespace)

  # Remove Portuguese stopwords, transliterated to match the ASCII text.
  stopWords <- iconv(stopwords("portuguese"),
                     from = "UTF-8", to = "ASCII//TRANSLIT")
  docs <- tm_map(docs, removeWords, stopWords)

  # Term-document matrix and word-frequency table (published as globals).
  # The duplicate "m <<- as.matrix(dtm)" and debug print(m) from the
  # original were removed.
  dtm <<- TermDocumentMatrix(docs)
  m <<- as.matrix(dtm)
  # Word frequencies in decreasing order.
  v <<- sort(rowSums(m), decreasing = TRUE)
  dataFrame <<- data.frame(word = names(v), freq = v)

  # Sentiment lexicons (presumably the lexiconPT datasets — these objects
  # must already be attached by the caller) and the prepared text.
  op30 <<- oplexicon_v3.0
  sent <<- sentiLex_lem_PT02
  textPrepared <<- textLoaded
}