-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy patharrow.R
67 lines (54 loc) · 1.66 KB
/
arrow.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# https://arrow-user2022.netlify.app/hello-arrow.html
# Time flies like an arrow, fruit flies like a banana
# do we really need all these loaded?
libs <- c("arrow",
"dplyr",
"lubridate",
"duckdb",
"stringr",
"palmerpenguins",
"tictoc",
"scales",
"janitor",
"fs",
"ggplot2",
"ggrepel",
"sf")
dkUtils::loadLibraries(libs)
# SAVE electricity use data (large dataset), UK Data Service study 8676:
# https://beta.ukdataservice.ac.uk/datacatalogue/studies/study?id=8676
path <- "~/Dropbox/data/SAVE/UKDA-SN-8676-1/save_consumption_data/"

# Open the CSV files under `path` as an Arrow dataset
save_Wh <- arrow::open_dataset(path, format = "csv")

# Quick look at the first few rows
collect(head(save_Wh))

# Reminder: `energy` is cumulative Wh within each bmg_id
# Report the total number of rows in the dataset
dkUtils::tidyNum(nrow(save_Wh), round = 1)
# Build an Arrow table holding only the 2018 records, timing the query
# with tic()/toc().
tic()
save_2018_Wh <- save_Wh %>%
  # parse the recorded timestamp column into a proper timestamp
  mutate(dateTime = cast(recorded_timestamp, timestamp())) %>%
  # derive the calendar year used by the filter below
  mutate(year = year(dateTime)) %>%
  filter(year == 2018) %>%
  compute()
toc()

# Report how many rows survived the 2018 filter
dkUtils::tidyNum(nrow(save_2018_Wh), round = 1)
# Derive per-interval consumption (`wh`) by differencing consecutive `energy`
# readings within each bmg_id (`energy` is noted earlier in this script as
# cumulative Wh within bmg_id). Timed with tic()/toc().
tic()
save_2018_Wh <- save_2018_Wh %>%
  group_by(bmg_id) %>%
  # explicit ordering so lag() sees each bmg_id's readings in time order
  arrange(bmg_id, dateTime) %>%
  # The first reading of each bmg_id has no predecessor, so its `wh` is NA.
  # Author's note: this step pulls the data into R, so memory use can become
  # an issue from here on.
  mutate(wh = energy - lag(energy)) %>%
  # drop the bmg_id grouping so it does not leak into later steps
  ungroup() %>%
  compute()
toc()

# Preview the first few rows, then summarise every column
save_2018_Wh %>%
  head() %>%
  collect()
summary(save_2018_Wh)
# Daily mean and median of `wh / 1000` (presumably Wh -> kWh, going by the
# column name).
# `wh` is NA for the first reading of each bmg_id (lag() has no previous row),
# so BOTH statistics need na.rm = TRUE; without it the median is NA for every
# date that contains such a reading and those points drop out of the plot.
# NOTE(review): `t` shares its name with base::t(); kept unchanged so any
# code relying on this object still works - consider renaming.
t <- save_2018_Wh %>%
  mutate(date = date(dateTime)) %>%
  group_by(date) %>%
  summarise(meanEnergy = mean(wh / 1000, na.rm = TRUE),
            medianEnergy = median(wh / 1000, na.rm = TRUE)) %>%
  collect()

# Scatter plot of the daily median over the year
ggplot2::ggplot(t, aes(x = date, y = medianEnergy)) +
  geom_point()