-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.R
103 lines (86 loc) · 3.14 KB
/
scrape.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/Rscript
require(rvest)
library(methods)
# Address of the it.wikipedia.org page that getPollTable() downloads and
# extracts the poll tables from.
url <- "https://it.wikipedia.org/wiki/Sondaggi_sul_referendum_costituzionale_del_2016_in_Italia"
# Extract one poll table and return it as a cleaned data frame.
#
# Arguments:
#   xpath          XPath expression handed to html_nodes(); the first matching
#                  node is converted with html_table(). Not used when
#                  `raw_table` is supplied.
#   drop_last_row  If TRUE, the last row is discarded in addition to the first
#                  row (which is always discarded).
#   raw_table      Optional data frame holding an already-extracted table,
#                  first (discarded) row included. When supplied, nothing is
#                  downloaded and the global `url` is not read.
#
# Returns:
#   A data frame with the twelve columns listed in `col_names`. `date` is a
#   Date (NA when the text cannot be parsed; day 15 is assumed when the text
#   carries no day), `sample` and every column from the fifth onward are
#   numeric (NA when the cell holds no digit). `pollster` and `client` are
#   left untouched.
#
# Errors:
#   Stops when `xpath` matches no node, or when the table does not have
#   exactly twelve columns.
getPollTable <- function(xpath, drop_last_row, raw_table = NULL) {
  col_names <-
    c("date", "pollster", "client", "sample",
      "yes_wtout_undecided", "no_wtout_undecided",
      "yes_wt_undecided", "no_wt_undecided",
      "novote", "undecided", "affluence", "error")

  it_month <-
    c("gennaio", "febbraio", "marzo", "aprile", "maggio", "giugno", "luglio",
      "agosto", "settembre", "ottobre", "novembre", "dicembre")
  it_month_regex <- paste(it_month, collapse = "|")
  # The first three letters identify each month uniquely, so one lookup covers
  # both full names ("ottobre") and abbreviations ("ott.", "sett.").
  month_prefix <- substr(it_month, 1, 3)
  month_prefix_regex <- paste(month_prefix, collapse = "|")
  full_date_regex <-
    paste0("^(\\d{1,2})?(-\\d{1,2})?( )?", "(", it_month_regex, ") \\d{4}")
  footnote_regex <- "\\[[^]]*\\]"

  # First match of `pattern` in each element of `x`; NA where there is none.
  extractFirst <- function(x, pattern) {
    x <- as.character(x)
    found <- rep(NA_character_, length(x))
    m <- regexpr(pattern, x, perl = TRUE)
    hit <- !is.na(m) & m > 0L
    found[hit] <- regmatches(x, m)
    found
  }

  # Turn one Italian date label into "YYYY-MM-DD", or NA when it has no
  # recognisable month or four-digit year.
  returnAsDate <- function(str) {
    date_str <- extractFirst(str, full_date_regex)
    if (is.na(date_str)) {
      # Ranges spanning two months or using abbreviated month names: keep
      # everything up to the last four-digit year. The leading day and the
      # first month mentioned are the ones used below.
      date_str <- extractFirst(str, "^(.*)\\d{4}")
    }
    if (is.na(date_str)) {
      return(NA_character_)
    }
    day <- extractFirst(date_str, "^\\d{1,2}")
    if (is.na(day)) {
      # No day given (e.g. "novembre 2016"): use the middle of the month.
      day <- "15"
    }
    month <- match(extractFirst(date_str, month_prefix_regex), month_prefix)
    year <- extractFirst(date_str, "\\d{4}$")
    if (is.na(month) || is.na(year)) {
      return(NA_character_)
    }
    # %02d on integers zero-pads; "%02s" on a string does not do so reliably.
    sprintf("%s-%02d-%02d", year, month, as.integer(day))
  }

  # Strip footnotes, percent signs, "±" and parentheses, switch the decimal
  # comma to a dot and convert to numeric; cells without any digit become NA.
  formatPerc <- function(x) {
    x <- as.character(x)
    x <- gsub(footnote_regex, "", x)
    x <- gsub("%", "", x, fixed = TRUE)
    x <- gsub(",", ".", x, fixed = TRUE)
    x <- gsub("\u00b1", "", x, fixed = TRUE)
    x <- gsub("\\(|\\)", "", x)
    x[!grepl("\\d", x)] <- NA
    as.numeric(x)
  }

  if (is.null(raw_table)) {
    tables <- url %>%
      read_html() %>%
      html_nodes(xpath = xpath) %>%
      html_table(fill = TRUE)
    if (length(tables) == 0) {
      stop("No table matched xpath: ", xpath, call. = FALSE)
    }
    tab <- tables[[1]]
  } else {
    tab <- as.data.frame(raw_table, stringsAsFactors = FALSE)
  }

  # The first row is always dropped; the last one only on request.
  tab <- tab[-1, , drop = FALSE]
  if (isTRUE(drop_last_row) && nrow(tab) > 0) {
    tab <- tab[-nrow(tab), , drop = FALSE]
  }

  if (ncol(tab) != length(col_names)) {
    stop("Expected ", length(col_names), " columns but the table has ",
         ncol(tab), call. = FALSE)
  }
  colnames(tab) <- col_names

  # Dates
  iso_dates <- vapply(as.character(tab$date), returnAsDate, character(1),
                      USE.NAMES = FALSE)
  tab$date <- as.Date(iso_dates, format = "%Y-%m-%d")

  # Sample conversion: drop footnote markers first so that "1000[1]" is read
  # as 1000 rather than 10001, then keep digits only ("1.000" -> 1000).
  sample_digits <- gsub(footnote_regex, "", as.character(tab$sample))
  sample_digits <- gsub("[^0-9]", "", sample_digits)
  sample_digits[!nzchar(sample_digits)] <- NA
  tab$sample <- as.numeric(sample_digits)

  # Percentage conversion: every column after the first four.
  perc_cols <- col_names[-(1:4)]
  tab[perc_cols] <- lapply(tab[perc_cols], formatPerc)

  tab
}
# Scrape the first three tables under the page's main content node. The last
# row is dropped for the first two tables and kept for the third.
tab1 <- getPollTable('//*[@id="mw-content-text"]/table[1]', drop_last_row = TRUE)
tab2 <- getPollTable('//*[@id="mw-content-text"]/table[2]', drop_last_row = TRUE)
tab3 <- getPollTable('//*[@id="mw-content-text"]/table[3]', drop_last_row = FALSE)

# Stack the three tables into one data frame.
polls <- rbind(tab1, tab2, tab3)

# Save to an explicit path instead of changing the working directory, so the
# script leaves the process state untouched and writes to the same file no
# matter where it is launched from.
out_file <- file.path(
  path.expand("~/public_git/referendum_ita_2016"), "data", "polls.RData"
)
save(polls, file = out_file)