################################################################################################
#
# 01. Creation of benchmark data sets
#
################################################################################################
# 01a. Get text from Coursera quizzes
################################################################################################
quizzes <- readLines('data/quizzes.txt', encoding = 'UTF-8')
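# quick sanity check (illustrative, not part of the original pipeline):
# the file should exist and contain at least one line
stopifnot(length(quizzes) > 0)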
# 01b. Get text from randomly selected tweets
################################################################################################
# note: the CRAN package name is 'XML' (case-sensitive); data.table and
# stringi are also needed below
install.packages(c('twitteR', 'XML', 'textcat', 'data.table', 'stringi'))
library('twitteR')
library('XML')
library('textcat')
library('data.table')
library('stringi')
# To use the Twitter API, you'll need to create an application via the following link:
# https://apps.twitter.com/app/new
# Once the app has been created, paste the consumer and access details below.
setup_twitter_oauth(consumer_key    = '...',
                    consumer_secret = '...',
                    access_token    = '...',
                    access_secret   = '...')
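# An alternative sketch (not the original workflow): keep the credentials out
# of the source file by reading them from environment variables, e.g. set in
# ~/.Renviron. The TWITTER_* variable names here are illustrative.
# setup_twitter_oauth(consumer_key    = Sys.getenv('TWITTER_CONSUMER_KEY'),
#                     consumer_secret = Sys.getenv('TWITTER_CONSUMER_SECRET'),
#                     access_token    = Sys.getenv('TWITTER_ACCESS_TOKEN'),
#                     access_secret   = Sys.getenv('TWITTER_ACCESS_SECRET'))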
# extract 1000 tweets containing the word 'the'
tweets.orig <- searchTwitter('the', n = 1000)
# extract the relevant information into a data.table; the tweet text is run
# through htmlParse() to decode HTML entities, and embedded line breaks are
# replaced with spaces
tweets.dt <- data.table(
  text = sapply(tweets.orig, function(x) {
    stri_replace_all_regex(
      xpathSApply(htmlParse(x$text, asText = TRUE, encoding = 'UTF-8'),
                  '//body/p/text()',
                  xmlValue),
      '[\r\n]', ' ')
  }),
  user         = sapply(tweets.orig, function(x) { x$screenName }),
  rt.count     = sapply(tweets.orig, function(x) { x$retweetCount }),
  is.truncated = sapply(tweets.orig, function(x) { x$truncated })
)
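# quick structural check (illustrative): one row per tweet, with roughly
# character, character, numeric and logical columns expected
str(tweets.dt)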
# determine the language of each tweet (this is not very precise, but all we
# want to do here is filter out the obvious non-English tweets)
tweets.dt[, lang := textcat(text, ECIMCI_profiles)]
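# a quick tally of the detected languages (illustrative; counts vary from
# run to run):
tweets.dt[, .N, by = lang][order(-N)]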
# retain only the unique tweets fulfilling the following criteria:
# 1. they contain the string 'the' (the search API occasionally returns
#    tweets that do not contain it verbatim)
# 2. they were recognized as language 'en'
# 3. the tweet is not flagged as truncated
tweets <- unique(tweets.dt[stri_detect_fixed(text, 'the') &
                           lang == 'en' &
                           !is.truncated]$text)
# put everything into a file
writeLines(tweets, 'data/tweets.txt', useBytes = TRUE)
# make sure we can read it back in
tweets2 <- readLines('data/tweets.txt', encoding = 'UTF-8')
identical(tweets, tweets2)
# TRUE
rm(tweets2)
# 01c. Get text from randomly selected blog descriptions
################################################################################################
# Note that we are not crawling full blog content here, only short blog
# descriptions. The full list of RSS feeds is taken from
# http://www.blog-search.com/feeds/ (a programmatic way to build this vector
# is sketched after the list).
urls <- c('http://www.blog-search.com/categories/autos/recent.rss',
'http://www.blog-search.com/categories/autos/popular.rss',
'http://www.blog-search.com/categories/business/recent.rss',
'http://www.blog-search.com/categories/business/popular.rss',
'http://www.blog-search.com/categories/computers/recent.rss',
'http://www.blog-search.com/categories/computers/popular.rss',
'http://www.blog-search.com/categories/education/recent.rss',
'http://www.blog-search.com/categories/education/popular.rss',
'http://www.blog-search.com/categories/entertainment/recent.rss',
'http://www.blog-search.com/categories/entertainment/popular.rss',
'http://www.blog-search.com/categories/environment/recent.rss',
'http://www.blog-search.com/categories/environment/popular.rss',
'http://www.blog-search.com/categories/family/recent.rss',
'http://www.blog-search.com/categories/family/popular.rss',
'http://www.blog-search.com/categories/finance/recent.rss',
'http://www.blog-search.com/categories/finance/popular.rss',
'http://www.blog-search.com/categories/fitness/recent.rss',
'http://www.blog-search.com/categories/fitness/popular.rss',
'http://www.blog-search.com/categories/food/recent.rss',
'http://www.blog-search.com/categories/food/popular.rss',
'http://www.blog-search.com/categories/gardening/recent.rss',
'http://www.blog-search.com/categories/gardening/popular.rss',
'http://www.blog-search.com/categories/health/recent.rss',
'http://www.blog-search.com/categories/health/popular.rss',
'http://www.blog-search.com/categories/hobbies/recent.rss',
'http://www.blog-search.com/categories/hobbies/popular.rss',
'http://www.blog-search.com/categories/home-repair/recent.rss',
'http://www.blog-search.com/categories/home-repair/popular.rss',
'http://www.blog-search.com/categories/humor/recent.rss',
'http://www.blog-search.com/categories/humor/popular.rss',
'http://www.blog-search.com/categories/internet/recent.rss',
'http://www.blog-search.com/categories/internet/popular.rss',
'http://www.blog-search.com/categories/law/recent.rss',
'http://www.blog-search.com/categories/law/popular.rss',
'http://www.blog-search.com/categories/marketing/recent.rss',
'http://www.blog-search.com/categories/marketing/popular.rss',
'http://www.blog-search.com/categories/multimedia/recent.rss',
'http://www.blog-search.com/categories/multimedia/popular.rss',
'http://www.blog-search.com/categories/personal/recent.rss',
'http://www.blog-search.com/categories/personal/popular.rss',
'http://www.blog-search.com/categories/pets/recent.rss',
'http://www.blog-search.com/categories/pets/popular.rss',
'http://www.blog-search.com/categories/politics/recent.rss',
'http://www.blog-search.com/categories/politics/popular.rss',
'http://www.blog-search.com/categories/religion/recent.rss',
'http://www.blog-search.com/categories/religion/popular.rss',
'http://www.blog-search.com/categories/science/recent.rss',
'http://www.blog-search.com/categories/science/popular.rss',
'http://www.blog-search.com/categories/self-help/recent.rss',
'http://www.blog-search.com/categories/self-help/popular.rss',
'http://www.blog-search.com/categories/social-issues/recent.rss',
'http://www.blog-search.com/categories/social-issues/popular.rss',
'http://www.blog-search.com/categories/sports/recent.rss',
'http://www.blog-search.com/categories/sports/popular.rss',
'http://www.blog-search.com/categories/technology/recent.rss',
'http://www.blog-search.com/categories/technology/popular.rss',
'http://www.blog-search.com/categories/travel/recent.rss',
'http://www.blog-search.com/categories/travel/popular.rss',
'http://www.blog-search.com/categories/webmasters/recent.rss',
'http://www.blog-search.com/categories/webmasters/popular.rss')
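# The same vector can also be built programmatically from the category slugs
# (a sketch; the names 'categories' and 'urls2' are introduced here for
# illustration, and it assumes the slug list and URL pattern stay as above):
categories <- c('autos', 'business', 'computers', 'education', 'entertainment',
                'environment', 'family', 'finance', 'fitness', 'food',
                'gardening', 'health', 'hobbies', 'home-repair', 'humor',
                'internet', 'law', 'marketing', 'multimedia', 'personal',
                'pets', 'politics', 'religion', 'science', 'self-help',
                'social-issues', 'sports', 'technology', 'travel', 'webmasters')
urls2 <- as.vector(t(outer(categories,
                           c('recent.rss', 'popular.rss'),
                           function(cat, feed) {
                             paste0('http://www.blog-search.com/categories/',
                                    cat, '/', feed)
                           })))
identical(urls, urls2)
# TRUE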
# extract the blog descriptions from all RSS feeds; the inner htmlParse()
# decodes HTML entities inside each <description> element
blogs <- unique(unlist(
  lapply(urls, function(url) {
    xpathSApply(xmlParse(url, isURL = TRUE),
                '//description',
                function(cont) {
                  xpathSApply(htmlParse(xmlValue(cont), asText = TRUE,
                                        encoding = 'UTF-8'),
                              '//body/p/text()',
                              xmlValue)
                })
  })
))
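# A more defensive variant (a sketch; the helper name fetch.feed is made up
# here): individual feeds may be temporarily unreachable, which would make the
# lapply() above abort entirely, so tryCatch() skips failing feeds instead.
fetch.feed <- function(url) {
  tryCatch(xpathSApply(xmlParse(url, isURL = TRUE),
                       '//description',
                       function(cont) {
                         xpathSApply(htmlParse(xmlValue(cont), asText = TRUE,
                                               encoding = 'UTF-8'),
                                     '//body/p/text()',
                                     xmlValue)
                       }),
           error = function(e) character(0))
}
# blogs <- unique(unlist(lapply(urls, fetch.feed)))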
# put everything into a file
writeLines(blogs, 'data/blogs.txt', useBytes = TRUE)
# make sure we can read it back in
blogs2 <- readLines('data/blogs.txt', encoding = 'UTF-8')
identical(blogs, blogs2)
# TRUE
rm(blogs2)
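# Finally, a quick summary of the three benchmark files (illustrative; the
# tweet and blog counts depend on when the data was fetched):
sapply(c('data/quizzes.txt', 'data/tweets.txt', 'data/blogs.txt'),
       function(f) length(readLines(f, encoding = 'UTF-8')))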