-
Notifications
You must be signed in to change notification settings - Fork 1
/
main.R
226 lines (184 loc) · 10.7 KB
/
main.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
# import libraries, it is assumed they've been installed already
library(data.table) # parallel data manipulation
library(gsubfn) # string templates
library(readstata13) # to read from .dta files
library(magrittr)
library(tibble)
library(dplyr)
library(tidyverse)
library(R.utils)
# set the number of threads to decrease the time of file reading/writing
data.table::getDTthreads()
data.table::setDTthreads(8)
# a list of variables in the main table
main_out_colnames = c('dhm', 'les_c4', 'laboursupplyweekly', 'out_ghqcase',
'equivaliseddisposableincomeyearl', 'atriskofpoverty')
ids = c("scenario", "run", "time")
main_loop <- function(eid_, dft) {
per_run <- get_summary_statistics(ids, dft, eid_)
per_run <- rename_final_columns(per_run, eid_)
per_all <- get_summary_statistics(c("time", "scenario"), dft, eid_)
per_all <- rename_final_columns(per_all, eid_)
bind_rows(per_run, per_all)
}
rename_final_columns <- function(data_table_name, postfix) {
# TODO this can be wrapped into a function as the number of columns increases
names(data_table_name)[names(data_table_name) == 'dhm'] <- paste("out_ghq", postfix, sep="_")
names(data_table_name)[names(data_table_name) == 'out_ghqcase'] <- paste("out_ghqcase", postfix, sep="_")
names(data_table_name)[names(data_table_name) == 'les_c4'] <- paste("out_emp", postfix, sep="_")
names(data_table_name)[names(data_table_name) == 'laboursupplyweekly'] <- paste("out_emphrs", postfix, sep="_")
names(data_table_name)[names(data_table_name) == 'equivaliseddisposableincomeyearl'] <- paste("out_income", postfix, sep="_")
names(data_table_name)[names(data_table_name) == 'atriskofpoverty'] <- paste("out_poverty", postfix, sep="_")
data_table_name
}
get_summary_statistics <- function(main_column_names, # common columns for all minor tables, run id is missing when summarize over all runs
data_table_name, experiment_stage) {
action_columns <- list(averaging = main_out_colnames)
action_list <- list(averaging = mean) # list of functions to evaluate
todo <- tibble(action_columns, action_list) # combinations of functions applied to corresponding columns
pmap_dfc(todo, ~data_table_name[experiment==experiment_stage] %>% group_by(across(all_of(main_column_names))) %>% summarise(across(all_of(.x), .y), .groups = 'keep'))
}
get_effects <-function(data_table) {
column_names <- c('ghq', 'ghqcase', 'emp', 'emphrs', 'income', 'poverty')
for (n in column_names) {
names <- c(paste("eff", n, sep="_"), paste(n, "baseline", sep="_"), paste(n, "reform", sep="_"))
data_table[names[[1]]] <- data_table[paste("out", names[[2]], sep="_")] - data_table[paste("out", names[[3]], sep="_")]
}
data_table
}
df <- data.table()
#scenario_id <- c("S1", "S2", "S3")
scenario_id <- c("S1")
experiment_id <- c("baseline", "reform")
# no parallel data reading at the moment
for (sid in scenario_id) {
for (eid in experiment_id) {
# reads the file, only one at the moment. Decompression takes some time,
# but compressed files use 5 times less space. We also read
# a selected set of columns to save memory/time
fn <- paste(paste("data", sid, eid, sep="/"), "csv.gz", sep=".")
df_scratch <- fread(file=fn,
select = c('laboursupplyweekly', 'atriskofpoverty',
'equivaliseddisposableincomeyearl',
'n_children_0', 'n_children_1',
'n_children_10', 'n_children_11', 'run',
'n_children_12','n_children_13', 'dag',
'n_children_14', 'n_children_15', 'dgn',
'n_children_16', 'n_children_17', 'deh_c3',
'n_children_2', 'n_children_3', 'time',
'n_children_4', 'n_children_5', 'dhm',
'n_children_6', 'n_children_7', 'les_c4',
'n_children_8', 'n_children_9'))
# add scenario/arm id now at the cost of memory
df_scratch[, scenario:=sid]
df_scratch[, experiment:=eid]
# do the type conversion right within the loop
# no check for NA for now
df_scratch$grp_hchild <- select(df_scratch, contains("n_children")) %>% rowSums %>% as.logical
df_scratch$grp_nchild <- !df_scratch$grp_hchild
# drop number of children columns
df_scratch <- df_scratch %>% select(-starts_with("n_children"))
# glue together
df <- rbind(df, df_scratch)
}
}
# free memory
rm(df_scratch)
gc()
# convert certain columns to categorical
df$scenario <- as.factor(df$scenario)
df$experiment <- as.factor(df$experiment)
df$dgn <- as.factor(df$dgn)
df$deh_c3 <- as.factor(df$deh_c3)
gc()
# convert initial raw data into something to work with
unique_years <- unique(df$time) # all available year values
unique_runs <- unique(df$run) # all values of run ids
unique_weekly_hours <- unique(df$laboursupplyweekly) # string values of working hours/week
unique_weekly_hours_numeric <- c(0, 20, 10, 40, 30) # integer values of unique_weekly_hours, used to convert chars to ints properly
unique_labour_status <- unique(df$les_c4) # employment status
unique_labour_status_numeric <- c(0, 1, 0, 0) # conversion to employed/unemployed as integer/boolean
unique_atriskofpoverty_status <- unique(df$atriskofpoverty) # "0" "1" "null" as no, yes, missing value (interpreted as yes)
unique_atriskofpoverty_status_numeric <- c(0, 1, 1) # integer/boolean values of poverty status
df$out_ghqcase <- ifelse(df$dhm<=24, 1, 0) # cut quasi-continuous interval into two sub-intervals
# atriskofpoverty is chars, check for empty strings, char nulls
# les_c4 is the same
# convert poverty status to ints
df$atriskofpoverty <- as.integer(format(factor(df$atriskofpoverty,
levels = unique_atriskofpoverty_status,
labels = unique_atriskofpoverty_status_numeric))) # kind of slow
# convert labour category to ints
df$laboursupplyweekly <- as.integer(format(factor(df$laboursupplyweekly,
levels = unique_weekly_hours,
labels = unique_weekly_hours_numeric))) # kind of slow
# convert employment category to ints
df$les_c4 <- as.integer(format(factor(df$les_c4,
levels = unique_labour_status,
labels = unique_labour_status_numeric))) # kind of slow
gc()
result <- lapply(experiment_id, main_loop, dft=df) %>% reduce(left_join, by = ids)
result$grp_all <- TRUE
result <- result %>% get_effects
subpop_result <- lapply(experiment_id, main_loop, dft=df[dgn=="Male"]) %>% reduce(left_join, by = ids)
subpop_result$grp_male <- TRUE
result <- do.call(rbind, list(result, subpop_result)) %>% get_effects
subpop_result <- lapply(experiment_id, main_loop, dft=df[dgn=="Female"]) %>% reduce(left_join, by = ids)
subpop_result$grp_female <- TRUE
result <- do.call(rbind, list(result, subpop_result)) %>% get_effects
subpop_result <- lapply(experiment_id, main_loop, dft=df[dag >= 25 & dag < 45]) %>% reduce(left_join, by = ids)
subpop_result$grp_age25 <- TRUE
result <- do.call(rbind, list(result, subpop_result)) %>% get_effects
subpop_result <- lapply(experiment_id, main_loop, dft=df[dag >= 45 & dag < 65]) %>% reduce(left_join, by = ids)
subpop_result$grp_age45 <- TRUE
result <- do.call(rbind, list(result, subpop_result)) %>% get_effects
subpop_result <- lapply(experiment_id, main_loop, dft=df[grp_hchild == TRUE]) %>% reduce(left_join, by = ids)
subpop_result$grp_hchild <- TRUE
result <- do.call(rbind, list(result, subpop_result)) %>% get_effects
subpop_result <- lapply(experiment_id, main_loop, dft=df[grp_nchild == TRUE]) %>% reduce(left_join, by = ids)
subpop_result$grp_nchild <- TRUE
result <- do.call(rbind, list(result, subpop_result)) %>% get_effects
subpop_result <- lapply(experiment_id, main_loop, dft=df[les_c4 == 1]) %>% reduce(left_join, by = ids)
subpop_result$grp_emp <- TRUE
result <- do.call(rbind, list(result, subpop_result)) %>% get_effects
subpop_result <- lapply(experiment_id, main_loop, dft=df[les_c4 == 0]) %>% reduce(left_join, by = ids)
subpop_result$grp_unemp <- TRUE
result <- do.call(rbind, list(result, subpop_result)) %>% get_effects
subpop_result <- lapply(experiment_id, main_loop, dft=df[les_c4 == 1 & atriskofpoverty == 1]) %>% reduce(left_join, by = ids)
subpop_result$grp_povin <- TRUE
result <- do.call(rbind, list(result, subpop_result)) %>% get_effects
subpop_result <- lapply(experiment_id, main_loop, dft=df[les_c4 == 0 & atriskofpoverty == 1]) %>% reduce(left_join, by = ids)
subpop_result$grp_povout <- TRUE
result <- do.call(rbind, list(result, subpop_result)) %>% get_effects
subpop_result <- lapply(experiment_id, main_loop, dft=df[deh_c3 == "Low"]) %>% reduce(left_join, by = ids)
subpop_result$grp_edlow <- TRUE
result <- do.call(rbind, list(result, subpop_result)) %>% get_effects
subpop_result <- lapply(experiment_id, main_loop, dft=df[deh_c3 == "Medium"]) %>% reduce(left_join, by = ids)
subpop_result$grp_edmed <- TRUE
result <- do.call(rbind, list(result, subpop_result)) %>% get_effects
subpop_result <- lapply(experiment_id, main_loop, dft=df[deh_c3 == "High"]) %>% reduce(left_join, by = ids)
subpop_result$grp_edhi <- TRUE
result <- do.call(rbind, list(result, subpop_result)) %>% get_effects
# NOTE check original subgroup restrictions, make sure everything is correct
# especially the employment status that's reduced to two possible states with loss of information
vars <- c("ghq", "emp", "emphrs", "ghqcase", "income", "poverty")
for (v in vars) {
result[paste("rank", v, "baseline", sep="_")] <- NA
result[paste("rank", v, "reform", sep="_")] <- NA
result[paste("rank", "eff", v, sep="_")] <- NA
}
for (sid in scenario_id) {
for (actual_year in unique_years) {
for (group in colnames(result)[grep('grp_', colnames(result))]) {
for (v in vars) {
data_selection = which(result$scenario == sid & # pick values for a given year and scenario
result$time == actual_year &
!is.na(result$run) & # drop NA in runs, this stage contains data for a given year and scenario and for all pop groups
!is.na(result[group])) # get all vars - must be 50 rows only
# generate ranks for every var
result[data_selection,][paste("rank", v, "baseline", sep="_")] <- rank(result[data_selection,][paste("out", v, "baseline", sep="_")])
result[data_selection,][paste("rank", v, "reform", sep="_")] <- rank(result[data_selection,][paste("out", v, "reform", sep="_")])
result[data_selection,][paste("rank", "eff", v, sep="_")] <- rank(result[data_selection,][paste("eff", v, sep="_")])
}
}
}
}