-
Notifications
You must be signed in to change notification settings - Fork 0
/
GetTwitterData.R
83 lines (65 loc) · 3 KB
/
GetTwitterData.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
#install.packages("rJava", dependencies = TRUE)
library(rJava)
source("FunctionsTwitterApi.R")
# Script: download tweets matching a query, clean the text columns and save
# the resulting data frame as <my_work_dir>/data/my_twitter_<my_filename>_data.RData
#
# query_string = string to be searched
# my_filename  = string to be used as a part of the filename
# For example
query_string <- "#jenkins"
my_filename <- "jenkins"

# my_work_dir is not defined in this file (presumably set by the sourced
# helper file or by the caller -- TODO confirm). It is only needed at the very
# end, so check it up front instead of failing after a long download.
if (!exists("my_work_dir")) {
  stop("my_work_dir must be defined before running this script", call. = FALSE)
}

# Clean a character vector of tweet text.
#
# Replaces hashtag markers, URLs, @mentions and punctuation with spaces,
# collapses runs of spaces/tabs, lower-cases the text and transliterates
# common accented letters to plain ASCII.
#
# text: character vector (a factor is coerced to character by gsub).
# Returns a character vector of the same length as text.
clean_tweet_text <- function(text) {
  # Hashtags: keep the word, drop the marker.
  text <- gsub("#", " ", text)
  # URLs: everything from http/https up to the next space.
  text <- gsub("(http|https)[://][^ ]*", " ", text)
  # Mentions: first those followed by a space, then one running to the end.
  text <- gsub("@.*? ", " ", text)
  text <- gsub("@.*", " ", text)
  text <- gsub("[[:punct:]]", " ", text)
  # Explicit fallback list of punctuation. The hyphen is placed last so it is
  # a literal: written as ",-:" it forms a range that also contains the
  # digits 0-9 and would silently delete every number from the text.
  text <- gsub("[\'\"/.,:;!=%~*-]", " ", text)
  text <- gsub("[ \t]{2,}", " ", text)
  # Lower-case BEFORE transliterating: the table below only lists lower-case
  # accented letters, so upper-case ones would otherwise be left untouched.
  chartr(
    "åäáàâãöóòôõúùûüéèíìïëêñý",
    "aaaaaaooooouuuueeiiieeny",
    tolower(text)
  )
}

# Convert a factor whose labels are numbers to numeric; labels that cannot be
# parsed become 0 (as.numeric emits a coercion warning for them).
factor_to_number <- function(x) {
  values <- as.numeric(as.character(x))
  values[is.na(values)] <- 0
  values
}

#get_TwitterData = function (query_string, my_filename) {
#This may take quite a long time, depending on the data
#You may test duration like this. Then compute how long it would take to get max tweets
#system.time(my_articles <- get_twitter_data(query_string, maxtweets=100))
#system.time(my_articles <- get_twitter_data(query_string, maxtweets=200))
my_articles <- get_twitter_data(query_string, maxtweets = 6000)
#save(my_articles, file="data/my_Twitter_articles_dirty.RData")

# Factor columns are turned into plain character / numeric vectors.
if (is.factor(my_articles$Abstract)) {
  my_articles$Abstract <- as.character(my_articles$Abstract)
}
if (is.factor(my_articles$AuthorName)) {
  my_articles$AuthorName <- as.character(my_articles$AuthorName)
}
if (is.factor(my_articles$Cites)) {
  my_articles$Cites <- factor_to_number(my_articles$Cites)
}
if (is.factor(my_articles$Id)) {
  # NOTE(review): if Id holds 64-bit tweet ids, a double cannot represent all
  # of them exactly -- confirm whether downstream code needs exact ids.
  my_articles$Id <- factor_to_number(my_articles$Id)
}

# Add cleaned abstracts as a new column.
# We could also replace the existing but debugging is easier if we keep both.
my_articles$Abstract_clean <- clean_tweet_text(my_articles$Abstract)
# NOTE(review): unlike the abstract, the cleaned title overwrites the
# original Title column; kept as is because later steps may rely on it.
my_articles$Title <- clean_tweet_text(my_articles$Title)

# Date is character; convert it to a Date object.
my_articles$Date <- as.Date(my_articles$Date)

# Fixed filename: <my_work_dir>/data/my_twitter_<xxx>_data.RData
my_file <- paste0(my_work_dir, "/data/my_twitter_", my_filename, "_data.RData")
# Make sure the target directory exists so save() cannot fail on a missing
# folder after the download has already been done.
dir.create(dirname(my_file), showWarnings = FALSE, recursive = TRUE)
save(my_articles, file = my_file)
# return(my_file)
#}