-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean and analyze text.R
431 lines (317 loc) · 14.5 KB
/
clean and analyze text.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
# Load required packages.
# library() (rather than require()) fails loudly at load time when a
# package is missing, instead of returning FALSE and failing later.
library(tm)
library(SnowballC)
library(wordcloud)
library(streamR)   # parseTweets()
library(stringr)   # str_trim(), str_split(), str_extract_all()
library(dplyr)
library(memoise)
# CLEAN TEXT:
# We want to isolate only the text portion of the tweets, removing handles, hashtags, URLs, and special
# characters.
# remove twitter handles (usernames starting with @) from tweets
# remove special characters
# remove URLs
# send all text to lowercase, to standardize all capitalization
# Clean a single tweet's text for analysis.
#
# Removes @handles, newline characters, URLs, and every character other
# than letters, spaces, and '#', then lowercases and trims the result.
#
# Args:
#   tweet.text: a single character string (one tweet).
# Returns:
#   The cleaned tweet, as a lowercase STRING.
trim_tweet <- function(tweet.text){
  # remove handles (usernames starting with @) plus any trailing spaces
  clean <- gsub("@\\w+ *", "", tweet.text)
  # replace newline characters with spaces
  clean <- gsub("\n", " ", clean)
  # remove URLs BEFORE stripping punctuation, so "https://..." is still
  # recognizable as a URL; "http\\S*" also catches plain "http://" links,
  # which the original "https\\w+" pattern missed
  clean <- gsub("http\\S* *", "", clean)
  # keep only letters, spaces, and hashtags
  # (numbers are dropped because they are not text)
  clean <- gsub("[^a-zA-Z #]", "", clean)
  # send all characters to lowercase, to standardize capitalization
  clean <- tolower(clean)
  # collapse runs of spaces left behind by the removals above
  clean <- gsub(" +", " ", clean)
  # trim leading/trailing spaces (base trimws avoids the stringr dependency)
  clean <- trimws(clean, which = "both")
  # return the cleaned tweet, as a STRING
  return(clean)
}
# Clean the text of a whole set of tweets at once.
#
# Args:
#   tweets.vector: character vector of raw tweet texts
#     (e.g. parsed.tweets$text).
# Returns:
#   A one-column DATAFRAME named "text" with the cleaned tweets, or
#   NULL (invisibly, with a warning) when the input is not a vector.
clean_text <- function(tweets.vector){
  if (!is.vector(tweets.vector)){
    warning("To trim and analyze tweets, you must input a vector. Input '[*parsed tweets dataframe name*]$text' into function to continue")
    return(invisible(NULL))
  }
  # vapply (rather than sapply) guarantees a character vector even when
  # the input is empty, so the data.frame below is always well-formed
  clean_tweets <- vapply(tweets.vector, trim_tweet, character(1), USE.NAMES = FALSE)
  # return all of the cleaned tweets, as a DATAFRAME with column "text"
  data.frame(text = clean_tweets, stringsAsFactors = FALSE)
}
##########################################################################################################
# EXTRACT HASHTAGS:
# We want to extract and isolate only the hashtags to analyze what hashtags are trending
# in the dataset. These are different than regular text because there can be hashtags used
# by everyone, because they are more universal/standardized.
# Extract the hashtags from a single tweet's text.
#
# Args:
#   tweet.text: a single character string (one tweet).
# Returns:
#   A lowercase character VECTOR of the hashtags (length 0 when none).
hashtags <- function(tweet.text){
  # extract all words that start with '#'; base regmatches/gregexpr
  # replaces the stringr dependency, and dropping the trailing " *"
  # from the original pattern keeps stray spaces out of the tags
  matches <- regmatches(tweet.text, gregexpr("#\\w+", tweet.text))
  # flatten the per-string LIST into a plain vector
  clean.vector <- unlist(matches, use.names = FALSE)
  # tolower is vectorized -- no sapply loop needed;
  # lowercasing standardizes capitalization across users
  hashtags.vector <- tolower(clean.vector)
  # return the isolated hashtags, as a VECTOR
  return(hashtags.vector)
}
# Extract hashtags from a whole vector of tweets.
#
# Args:
#   tweets.vector: character vector of tweet texts.
# Returns:
#   A one-column DATAFRAME named "hashtags" holding every hashtag found,
#   or NULL (invisibly, with a warning) for non-character input.
extract_hashtags <- function(tweets.vector){
  # is.character() checks the whole vector, not just the first element
  if (!is.character(tweets.vector)){
    warning("Input must be a character vector.")
    return(invisible(NULL))
  }
  # lapply directly over the vector -- the original wrapped it in a
  # 1-row matrix and used apply(), an unnecessary detour
  htags.list <- lapply(tweets.vector, hashtags)
  # flatten to a VECTOR, dropping names and empty per-tweet results,
  # and splitting multi-hashtag entries into distinct values
  htags.vector <- unlist(htags.list, use.names = FALSE)
  # return ALL of the isolated hashtags, as a DATAFRAME
  hashtags.df <- data.frame(hashtags = htags.vector, stringsAsFactors = FALSE)
  return(hashtags.df)
}
####################################################################################
# AVERAGE NUMBER OF WORDS PER TWEET
# Count the words in one tweet.
#
# Args:
#   tweet.text: a single character string (one tweet).
# Returns:
#   The NUMBER of words (integer); runs of consecutive spaces are ignored.
num_words <- function(tweet.text){
  # split at spaces to isolate each word; base strsplit replaces the
  # stringr str_split dependency, and the result (a LIST) is flattened
  word_vector <- unlist(strsplit(tweet.text, " ", fixed = TRUE), use.names = FALSE)
  # empty entries mark consecutive spaces in the text -- do not count them
  return(sum(word_vector != ""))
}
# Descriptive statistics for the number of words per tweet.
#
# Args:
#   tweets.vector: character vector of (cleaned) tweet texts.
# Returns:
#   A one-row DATAFRAME ("num.words") with the mean, median, variance,
#   and standard deviation of words per tweet, or NULL (invisibly,
#   with a warning) for non-character input.
avg_words <- function(tweets.vector){
  if (!is.character(tweets.vector)){
    warning("Input must be a character vector.")
    return(invisible(NULL))
  }
  # vapply (not sapply) guarantees an integer vector even for empty input
  words_per_tweet <- vapply(tweets.vector, num_words, integer(1), USE.NAMES = FALSE)
  # return a DATAFRAME of descriptive statistics of words per tweet
  data.frame(
    "mean" = mean(words_per_tweet),
    "median" = median(words_per_tweet),
    "variance" = var(words_per_tweet),
    "standard.deviation" = sd(words_per_tweet),
    row.names = c("num.words")
  )
}
####################################################################################
# AVERAGE LENGTH (number of characters) OF TWEET
# Count the non-space characters in one tweet.
#
# Args:
#   tweet.text: a single character string (one tweet).
# Returns:
#   The NUMBER of characters, excluding spaces (integer).
tweet_length <- function(tweet.text){
  # delete the spaces and count what remains -- replaces the original
  # split-into-characters / filter / length pipeline in one pass;
  # sum() preserves the "total across all elements" behavior if a
  # multi-element vector is ever passed in
  return(sum(nchar(gsub(" ", "", tweet.text, fixed = TRUE))))
}
# Descriptive statistics for the length (characters) of the tweets.
#
# Args:
#   tweets.vector: character vector of (cleaned) tweet texts.
# Returns:
#   A one-row DATAFRAME ("tweet.length") with the mean, median, variance,
#   and standard deviation of tweet lengths, or NULL (invisibly,
#   with a warning) for non-character input.
avg_length <- function(tweets.vector){
  if (!is.character(tweets.vector)){
    warning("Input must be a character vector.")
    return(invisible(NULL))
  }
  # vapply (not sapply) guarantees an integer vector even for empty input
  lengths_of_tweets <- vapply(tweets.vector, tweet_length, integer(1), USE.NAMES = FALSE)
  # return a DATAFRAME of descriptive statistics of tweet length
  data.frame(
    "mean" = mean(lengths_of_tweets),
    "median" = median(lengths_of_tweets),
    "variance" = var(lengths_of_tweets),
    "standard.deviation" = sd(lengths_of_tweets),
    row.names = c("tweet.length")
  )
}
####################################################################################
# AVERAGE LENGTH OF WORD
# Here, we do NOT want to include hashtags in our analysis. We want to know the average length
# of the words that people use to communicate their meaning, whereas hashtags are TAGS for their
# content to associate it with a larger conversation.
## This is the function we will use to possibly detect Australian slang
# Lengths of the words in one tweet, excluding hashtags.
#
# Hashtags are TAGS for content rather than communicative words, so they
# are removed before measuring.
#
# Args:
#   tweet.text: a single character string (one tweet).
# Returns:
#   An integer VECTOR of word lengths (integer(0) when no words remain).
lengths_of_words <- function(tweet.text){
  # remove all hashtags (and any trailing spaces)
  no.hashtags <- gsub("#\\w+ *", "", tweet.text)
  # split at spaces to isolate each word; base strsplit replaces the
  # stringr dependency, and the LIST result is flattened to a vector
  word_vector <- unlist(strsplit(no.hashtags, " ", fixed = TRUE), use.names = FALSE)
  # drop empty entries left behind by consecutive spaces
  words <- word_vector[word_vector != ""]
  # nchar is vectorized -- no sapply loop; this also returns integer(0)
  # (not an empty list, as sapply did) when there are no words
  return(nchar(words))
}
# Descriptive statistics for word length across the dataset.
#
# Args:
#   tweets.vector: character vector of (cleaned) tweet texts.
# Returns:
#   A one-row DATAFRAME ("word.length") with the mean, median, variance,
#   and standard deviation of word lengths, or NULL (invisibly,
#   with a warning) for non-character input.
avg_wrd_lngth <- function(tweets.vector){
  if (!is.character(tweets.vector)){
    warning("Input must be a character vector.")
    return(invisible(NULL))
  }
  # lapply directly over the vector (the original routed through a
  # 1-row matrix and apply()); per-tweet results are ragged, so
  # flatten the LIST with unlist to get one VECTOR of word lengths
  lengths.vector <- unlist(lapply(tweets.vector, lengths_of_words), use.names = FALSE)
  # return a DATAFRAME of descriptive statistics of word length
  data.frame(
    "mean" = mean(lengths.vector),
    "median" = median(lengths.vector),
    "variance" = var(lengths.vector),
    "standard.deviation" = sd(lengths.vector),
    row.names = c("word.length")
  )
}
####################################################################################
# AVERAGE NUMBER OF HASHTAGS PER TWEET
# Count the hashtags in one tweet.
#
# Args:
#   tweet.text: a single character string (one tweet).
# Returns:
#   The NUMBER of hashtags (integer).
num_hashtags <- function(tweet.text){
  # extract all words that start with '#'; base regmatches/gregexpr
  # replaces the stringr dependency (the trailing " *" in the original
  # pattern never affected the count)
  htags.vector <- unlist(regmatches(tweet.text, gregexpr("#\\w+", tweet.text)), use.names = FALSE)
  # return NUMBER of hashtags
  return(length(htags.vector))
}
# Descriptive statistics for the number of hashtags per tweet.
#
# Args:
#   tweets.vector: character vector of (cleaned) tweet texts.
# Returns:
#   A one-row DATAFRAME ("num.hashtags") with the mean, median, variance,
#   and standard deviation of hashtags per tweet, or NULL (invisibly,
#   with a warning) for non-character input.
avg_num_hashtags <- function(tweets.vector){
  if (!is.character(tweets.vector)){
    warning("Input must be a character vector.")
    return(invisible(NULL))
  }
  # vapply (not sapply) guarantees an integer vector even for empty input
  num.htags <- vapply(tweets.vector, num_hashtags, integer(1), USE.NAMES = FALSE)
  # return a DATAFRAME of descriptive statistics of hashtags per tweet
  data.frame(
    "mean" = mean(num.htags),
    "median" = median(num.htags),
    "variance" = var(num.htags),
    "standard.deviation" = sd(num.htags),
    row.names = c("num.hashtags")
  )
}
####################################################################################
# COMPLETE TEXT ANALYSIS
#
# Run the whole analysis pipeline on a vector of raw tweets: clean the
# text, then compute descriptive statistics for words per tweet, tweet
# length, word length, and hashtags per tweet.
#
# Args:
#   tweets.vector: character vector of raw tweet texts.
# Returns:
#   A DATAFRAME with one row of descriptive statistics per measure.
analyze_text <- function(tweets.vector){
  if (!is.vector(tweets.vector) || !is.character(tweets.vector[1])){
    warning("Input must be a character vector.")
  }else{
    # clean the text once; every measure below works on the cleaned column
    cleaned <- clean_text(tweets.vector)
    # stack the one-row summaries from each measure into a single table
    analysis.df <- rbind(
      avg_words(cleaned$text),        # number of words per tweet
      avg_length(cleaned$text),       # length of tweets
      avg_wrd_lngth(cleaned$text),    # length of words
      avg_num_hashtags(cleaned$text)  # number of hashtags
    )
    return(analysis.df)
  }
}
##########################################################################################################
##########################################################################################################
##########################################################################################################
# If we want to conduct t-tests on the above qualities of the tweets in each city
# (number of words per tweet, length of tweet, length of word, number of hashtags per tweet),
# we need to have access to numeric vectors. The above functions return dataframes of
# descriptive statistics, so the functions below return the original vectors used
# to calculate the descriptive statistics.
# NUMBER OF WORDS PER TWEET
# Get the raw per-tweet word counts (for t-tests), rather than the
# descriptive-statistics dataframe that avg_words() returns.
#
# Args:
#   tweets.vector: character vector of (cleaned) tweet texts.
# Returns:
#   An integer VECTOR of word counts, one per tweet, or NULL (invisibly,
#   with a warning) for non-character input.
num_words.vector <- function(tweets.vector){
  if (!is.character(tweets.vector)){
    warning("Input must be a character vector.")
    return(invisible(NULL))
  }
  # vapply (not sapply) guarantees an integer vector even for empty input
  words_per_tweet <- vapply(tweets.vector, num_words, integer(1), USE.NAMES = FALSE)
  return(words_per_tweet)
}
####################################################################################
# LENGTH (number of characters) OF TWEET
# Get the raw per-tweet character counts (for t-tests), rather than the
# descriptive-statistics dataframe that avg_length() returns.
#
# Args:
#   tweets.vector: character vector of (cleaned) tweet texts.
# Returns:
#   An integer VECTOR of tweet lengths, one per tweet, or NULL (invisibly,
#   with a warning) for non-character input.
tweet_length.vector <- function(tweets.vector){
  if (!is.character(tweets.vector)){
    warning("Input must be a character vector.")
    return(invisible(NULL))
  }
  # vapply (not sapply) guarantees an integer vector even for empty input
  lengths_of_tweets <- vapply(tweets.vector, tweet_length, integer(1), USE.NAMES = FALSE)
  return(lengths_of_tweets)
}
####################################################################################
# LENGTH OF WORD
# Get the raw word lengths across the whole dataset (for t-tests),
# rather than the descriptive-statistics dataframe from avg_wrd_lngth().
#
# Args:
#   tweets.vector: character vector of (cleaned) tweet texts.
# Returns:
#   An integer VECTOR with the length of every word tweeted, or NULL
#   (invisibly, with a warning) for non-character input.
word_length.vector <- function(tweets.vector){
  if (!is.character(tweets.vector)){
    warning("Input must be a character vector.")
    return(invisible(NULL))
  }
  # lapply directly over the vector (no 1-row-matrix/apply detour);
  # per-tweet results are ragged, so flatten the LIST with unlist
  lengths.vector <- unlist(lapply(tweets.vector, lengths_of_words), use.names = FALSE)
  return(lengths.vector)
}
####################################################################################
# NUMBER OF HASHTAGS PER TWEET
# Get the raw per-tweet hashtag counts (for t-tests), rather than the
# descriptive-statistics dataframe that avg_num_hashtags() returns.
#
# Args:
#   tweets.vector: character vector of (cleaned) tweet texts.
# Returns:
#   An integer VECTOR of hashtag counts, one per tweet, or NULL
#   (invisibly, with a warning) for non-character input.
num_hashtags.vector <- function(tweets.vector){
  if (!is.character(tweets.vector)){
    warning("Input must be a character vector.")
    return(invisible(NULL))
  }
  # vapply (not sapply) guarantees an integer vector even for empty input
  num.htags <- vapply(tweets.vector, num_hashtags, integer(1), USE.NAMES = FALSE)
  return(num.htags)
}
##########################################################################################################
##########################################################################################################
##########################################################################################################
# GET TWEETS FROM JSON FILES
# parseTweets() (streamR) parses the collected raw JSON into dataframes
# NOTE(review): assumes "tweetsBoston.json" / "tweetsSydney.json" exist in
# the current working directory -- confirm before running
tweets.Boston <- parseTweets("tweetsBoston.json")
tweets.Sydney <- parseTweets("tweetsSydney.json")
# Boston tweets: cleaned text, as a dataframe with column "text"
boston.text <- clean_text(tweets.Boston$text)
# Sydney tweets: cleaned text, as a dataframe with column "text"
sydney.text <- clean_text(tweets.Sydney$text)
# Descriptive statistics (mean/median/variance/sd per measure) of Boston tweets
boston.descriptives <- analyze_text(tweets.Boston$text)
# Descriptive statistics of Sydney tweets
sydney.descriptives <- analyze_text(tweets.Sydney$text)
# Vectors of Boston tweet qualities (raw per-tweet/per-word values,
# kept as numeric vectors so t-tests can be run on them later)
b.num.words <- num_words.vector(boston.text$text)
b.tweet.length <- tweet_length.vector(boston.text$text)
b.word.length <- word_length.vector(boston.text$text)
b.num.hashtags <- num_hashtags.vector(boston.text$text)
# Vectors of Sydney tweets qualities (same measures, for comparison)
s.num.words <- num_words.vector(sydney.text$text)
s.tweet.length <- tweet_length.vector(sydney.text$text)
s.word.length <- word_length.vector(sydney.text$text)
s.num.hashtags <- num_hashtags.vector(sydney.text$text)