-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWordCloud.R
93 lines (63 loc) · 2.92 KB
/
WordCloud.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#Needed <- c("tm", "SnowballCC", "RColorBrewer", "ggplot2", "wordcloud", "biclust", "cluster", "igraph", "fpc")
#install.packages(Needed, dependencies=TRUE,repos = 'https://mirrors.tuna.tsinghua.edu.cn/CRAN')
#install.packages("Rcampdf")
#install.packages("Rcampdf", repos = "http://datacube.wu.ac.at/", type = "source")
## Search PubMed with (("2019/12/01"[Date - Publication] : "2020/01/30"[Date - Publication])) AND "Science (New York, N.Y.)"[Journal]
## Query term: https://pubmed.ncbi.nlm.nih.gov/?term=%28%28%222019%2F12%2F01%22%5BDate+-+Publication%5D+%3A+%222020%2F01%2F30%22%5BDate+-+Publication%5D%29%29+AND+%22Nature%22%5BJournal%5D
## the file consists of 279 abstracts from Science.
## Note: Webpage downloaind is not supported any longger.
##E-Util has to be used nowadays. https://www.ncbi.nlm.nih.gov/books/NBK179288/
##Create a folder named with "Corpus"
setwd("/Users/jingboxia/BioNLPcourse/")
cname <- file.path("", "Users", "jingboxia", "BioNLPcourse", "corpus")
#################Load the R package for text mining and then load your texts into R.
library(NLP)
library(tm)
docs <- Corpus(DirSource(cname))
summary(docs)
#Removing punctuation:
docs <- tm_map(docs, removePunctuation)
for(j in seq(docs))
{
docs[[j]] <- gsub("/", " ", docs[[j]])
docs[[j]] <- gsub("@", " ", docs[[j]])
docs[[j]] <- gsub("\\|", " ", docs[[j]])
}
#################Removing numbers:
docs <- tm_map(docs, removeNumbers)
###################Converting to lowercase:
docs <- tm_map(docs, tolower)
############Removing “stopwords” (common words) that usually have no analytic value.
docs <- tm_map(docs, removeWords, stopwords("english"))
######Removing particular words:
docs <- tm_map(docs, removeWords, c("department", "email", "doi", "center", "sciences", "pubmed", "nature", "university", "pmid", "author", "school", "research"))
####Tell R to treat your preprocessed documents as text documents.
docs <- tm_map(docs, PlainTextDocument)
#########To proceed, create a document term matrix.
dtm <- DocumentTermMatrix(docs)
##########You’ll also need a transpose of this matrix. Create it using:
tdm <- TermDocumentMatrix(docs)
#Organize terms by their frequency:
freq <- colSums(as.matrix(dtm))
freq
names(freq)
ord <- order(freq)
###############Word Frequency
###There are lots of terms, just check some of the most and least frequently occurring words.
freq[head(ord, 10)]
freq[tail(ord, 50)]
###If you prefer to export the matrix to Excel:
m <- as.matrix(dtm)
write.csv(m, file="dtm.csv")
wf <- data.frame(word = names(freq), freq=freq)
head(wf)
############Plot words that appear at least 50 times.
library(ggplot2)
p <- ggplot(subset(wf, freq>50), aes(word, freq))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p
#########word cloud
set.seed(142)
library(wordcloud)
wordcloud(names(freq), freq, min.freq=25)