# Author: Saeesh Mangwani
# Date: 2021-06-02
# Description: A script that generates the ancillary data files from the
# hourly observation well data (scraped monthly by the obswell_scraping.py
# script)
# ==== Loading libraries ====
library(dplyr)
library(readr)
library(stringr)
library(lubridate)
# ==== Reading data ====
# Path to the hourly data (since the dataset is very large, it is read
# selectively for each step and then discarded rather than held in memory
# throughout)
path_to_dat <- 'ObsWellHourly.csv'
# Path to the directory where data archives are stored (defaults to the
# 'archive' folder in the working directory)
path_to_archive <- 'archive'
# ==== Creating a daily mean dataset ====
# Reading the hourly dataset and picking only relevant vars
obswell <- read_csv(path_to_dat,
                    # Compact column spec: T = datetime, d = double, _ = skip
                    # column, c = character (keeping only the relevant columns)
                    col_types = 'Td_c') %>%
  # Keeping only the date portion of the parsed datetime
  mutate(Time = as.Date(Time))
# Since the dataset is massive, it needs to be processed in chunks. Setting a
# chunk size of 100,000 rows
chunksize <- 100000
n <- 0
i <- 0
while (n < nrow(obswell)) {
  i <- n + 1
  # Incrementing n by the chunk size, capping it at the number of rows to
  # prevent indexing past the end of the table
  n <- min(n + chunksize, nrow(obswell))
  # For each chunk getting a mean grouped by location and date
  obswell[i:n, ] %>%
    group_by(myLocation, Time) %>%
    summarise(Value = mean(Value), ssize = n()) %>%
    # Writing to disk
    write_csv('ObsWellDailyMean-temp.csv',
              # For the first chunk, overwriting the current file (since we're
              # updating it fresh). Otherwise appending
              append = !(i == 1),
              # For the first chunk, adding column names. Otherwise just
              # appending data
              col_names = (i == 1))
  print(paste('Processed', n, 'rows'))
}
# Removing the object
rm(obswell)
gc()
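# Note: a possible alternative to the manual chunking above (a sketch only, not
# run here) is readr's built-in chunked reader, which applies a callback to
# each chunk as it is read. The column names below are the ones already used
# above (Time, Value, myLocation)
if (FALSE) {
  read_csv_chunked(
    path_to_dat,
    col_types = 'Td_c',
    chunk_size = chunksize,
    callback = SideEffectChunkCallback$new(function(chunk, pos) {
      # pos is the starting row of the chunk, so pos == 1 marks the first chunk
      chunk %>%
        mutate(Time = as.Date(Time)) %>%
        group_by(myLocation, Time) %>%
        summarise(Value = mean(Value), ssize = n()) %>%
        write_csv('ObsWellDailyMean-temp.csv',
                  append = !(pos == 1), col_names = (pos == 1))
    })
  )
}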
# Reading the mean dataset back and re-averaging any duplicated location-date
# rows (these can occur because a chunk boundary may fall in the middle of a
# day, splitting that day's observations across two chunks)
read_csv('ObsWellDailyMean-temp.csv',
         col_types = 'cDdd') %>%
  group_by(myLocation, Time) %>%
  summarise(Value = weighted.mean(Value, ssize)) %>%
  write_csv('ObsWellDailyMean.csv', append = F)
# removing the temp file
file.remove('ObsWellDailyMean-temp.csv')
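# Illustration (not run): weighting the chunk means by their sample sizes
# reproduces the overall mean exactly. E.g. for the values 1:5 split into 1:3
# and 4:5, mean(1:5) is 3, and weighted.mean(c(mean(1:3), mean(4:5)), c(3, 2))
# = (2 * 3 + 4.5 * 2) / 5 is also 3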
# ==== Creating a past 1-year dataset ====
# Specifying the cutoff date (366 days before today)
last_year <- Sys.Date() - 366
# Reading the hourly dataset
read_csv(path_to_dat, col_types = 'Tdcc') %>%
  # Filtering only values from the past year
  filter(Time > last_year) %>%
  # Writing to disk
  write_csv('ObsWellHourly1Year.csv', append = F)
# ==== Copying all datasets to the archive ====
# Hourly
file.copy('ObsWellHourly.csv',
          paste0(path_to_archive, '/ObsWellHourly_', Sys.Date(), '.csv'),
          overwrite = T)
# Daily means
file.copy('ObsWellDailyMean.csv',
          paste0(path_to_archive, '/ObsWellDailyMean_', Sys.Date(), '.csv'),
          overwrite = T)
# 1-year
file.copy('ObsWellHourly1Year.csv',
          paste0(path_to_archive, '/ObsWellHourly1Year_', Sys.Date(), '.csv'),
          overwrite = T)
# Update report
file.copy('update_report.txt',
          paste0(path_to_archive, '/update_report_', Sys.Date(), '.txt'),
          overwrite = T)
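# Sketch (not run; a possible refactor rather than part of the original
# workflow): the four copies above share one pattern, which could be wrapped
# in a small helper, e.g.
if (FALSE) {
  archive_file <- function(fname, dest_dir = path_to_archive, stamp = Sys.Date()) {
    # Insert the datestamp between the file's base name and its extension
    base <- tools::file_path_sans_ext(fname)
    ext <- tools::file_ext(fname)
    file.copy(fname,
              paste0(dest_dir, '/', base, '_', stamp, '.', ext),
              overwrite = TRUE)
  }
  archive_file('ObsWellHourly.csv')
}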
# ==== Cleaning the archive ====
# Getting available files and naming them by their datestamps
fnames <- list.files(path_to_archive) %>%
  setNames(str_extract(., "\\d{4}-\\d{2}-\\d{2}"))
# Parsing those names to get all the unique dates for which data is currently stored
dates <- names(fnames) %>%
  ymd() %>%
  unique()
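# Illustration (not run): for an archived filename like
# 'ObsWellDailyMean_2021-06-02.csv', str_extract(., "\\d{4}-\\d{2}-\\d{2}")
# returns "2021-06-02", so each file ends up named by the date it was archived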
if (length(dates) < 2) {
  print("Only 1 version of data present in the archive. No cleaning performed")
} else {
  # Sorting the dates and selecting only the most recent 2
  dates <- dates %>%
    sort() %>%
    tail(2)
  # Indexing the list to select only those files dated before these 2
  fnames <- fnames[!(ymd(names(fnames)) %in% dates)]
  # Removing these files (if there are any to remove)
  if (length(fnames) > 0) file.remove(paste0(path_to_archive, '/', fnames))
}