This repository has been archived by the owner on Jan 30, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimport.R
130 lines (120 loc) · 4.55 KB
/
import.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# Import full data from TresorIt ====
# Binding the files *directly* into *one* data frame does not work, because the underlying bind_rows (or something similar) inside map_dfr drops all SPSS attributes; we therefore keep a list with one data frame per file.
# Read each platform's SPSS export, patch in the hand-corrected numeric
# columns, keep only the variables of interest under readable names, and turn
# the labelled SPSS columns into R factors.
# The result is a named list with one data frame per platform.
rawdat <- purrr::map(
  .x = list(atizo = "atizo", applause = "applause", crowdguru = "crowdguru"),
  .f = function(platform) {
    df <- haven::read_sav(
      file = fs::path("../../Tresors/Crowddaten/data/", platform, ext = "sav"),
      # turns out, the spss file does not actually include properly coded
      # user-defined missings, so this won't help either way
      user_na = FALSE
    )

    # the trailing cols have unique (hashes?) column names, to avoid confusion
    # we delete them; they seem to measure some time spent *between* the
    # different survey pages, not very important to keep
    # NOTE(review): the original comment said "the last 6 cols", but this index
    # range covers 7 columns -- confirm which is intended. It does not change
    # the result as long as none of the columns selected below is among them.
    df[(ncol(df) - 6):ncol(df)] <- NULL

    # Sadly, the below columns are string inputs in the survey, and include
    # some *strings* instead of integers ("four" instead of 4 etc.).
    # These offending cells have been fixed by hand in excel, and committed in
    # the below csv.
    # It appeared easier to do this in excel than with individual subsetting
    # edits.
    corrected_cols <- c("v_35", "v_38", "v_45", "v_44")
    corrections <- readr::read_delim(
      file = fs::path("data/manual_corrections/", platform, ext = "csv"),
      delim = ";",
      col_names = TRUE,
      col_types = "iiii"
    )
    # The csv columns are matched to `corrected_cols` by position, so fail
    # loudly if the csv does not have the same shape as the target columns.
    stopifnot(
      nrow(corrections) == nrow(df),
      ncol(corrections) == length(corrected_cols)
    )
    df[, corrected_cols] <- corrections

    # dplyr is namespaced explicitly, like every other call in this file,
    # because the script never attaches it.
    df <- dplyr::select(
      .data = df,
      # there are no meaningful short variable names available anywhere in the
      # raw data, so we came up with these for easier handling.
      # before doing this, we actually tested that the cryptic names are the
      # same in all three datasets
      birth = v_1,
      gender = v_2,
      education = v_7,
      disability_care = v_12,
      children = v_17,
      employment = v_22,
      sum_employer = v_35,
      profession_dev = v_37,
      sum_platforms = v_38,
      platforms = v_39,
      h_month = v_45,
      h_platform = v_44,
      time_of_day = v_46,
      time_of_week = v_52,
      workspace = v_53,
      perm_contract = v_203,
      interesting_work = v_175,
      learning = v_176,
      cooperation = v_177,
      support = v_178,
      expectations = v_179,
      codecide = v_180,
      autonomy = v_181,
      deadline = v_182,
      quantity = v_183,
      evaluation = v_184,
      transparent_eval = v_185,
      planning = v_186,
      enough_time = v_187,
      training = v_188,
      credit_chef = v_189,
      credit_col = v_190,
      fair = v_191,
      adequate = v_192,
      safe_job = v_193,
      balance_loc = v_194,
      balance_time = v_195,
      separation = v_196,
      transparent_tasks = v_197,
      influence_eval = v_198,
      consistent_eval = v_199,
      dissent_eval = v_200,
      wage = v_201,
      wage_organisation = v_202,
      interesting_work_cw = v_119,
      learning_cw = v_120,
      cooperation_cw = v_121,
      support_cw = v_122,
      expectations_cw = v_123,
      codecide_cw = v_124,
      autonomy_cw = v_125,
      deadline_cw = v_126,
      quantity_cw = v_127,
      evaluation_cw = v_128,
      transparent_eval_cw = v_129,
      planning_cw = v_130,
      enough_time_cw = v_131,
      training_cw = v_132,
      credit_chef_cw = v_133,
      credit_col_cw = v_134,
      fair_cw = v_135,
      adequate_cw = v_136,
      safe_job_cw = v_137,
      balance_loc_cw = v_138,
      balance_time_cw = v_139,
      separation_cw = v_140,
      transparent_tasks_cw = v_141,
      influence_eval_cw = v_142,
      consistent_eval_cw = v_143,
      dissent_eval_cw = v_144,
      wage_cw = v_145,
      wage_organisation_cw = v_146
    )

    # re-write spss attributes into proper R factors
    haven::as_factor(x = df, only_labeled = TRUE)
  }
)
# Persist the imported list to disk as an RDS file.
# Both arguments are passed positionally: readr 1.4.0 renamed the second
# argument of write_rds() from `path` to `file` and deprecated `path`, so the
# positional form works without a deprecation warning on old and new readr.
readr::write_rds(rawdat, "data/rawdat.rds")
# we also ran these tests, but they are currently disabled
# # test some variables before they get thrown out, just to be sure
# # this was probably used to mark test-runs, of which there appear none left in the data
# assert_subset(x = as.character(crowddata$tester), choices = "Kein Tester")
# # this was probably the state of the user session
# assert_subset(x = as.character(x = crowddata$dispcode), choices = c("Beendet (31)", "Beendet nach Unterbrechung (32)"))
# # this was apparently the time to completion
# # there are some -1s in here, which does not make sense, but the data seems otherwise ok, so this should be NA
# crowddata[crowddata$duration < 0, "duration"] <- NA
# assert_integerish(
# x = crowddata$duration,
# lower = 1,
# upper = 20000,
# any.missing = TRUE
# )
# # all other non-questionnaire variables at the end are uninteresting or empty