-
Notifications
You must be signed in to change notification settings - Fork 2
/
data_cleaning.R
82 lines (67 loc) · 3.23 KB
/
data_cleaning.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# Read the input data set and add the four nutrient columns in one pipeline.
# This part: changes to be merged into df later.
#
# Each new column is the per-100 value divided by 100, times actual_weight,
# times actual_Kcal.
# NOTE(review): the original comment described this as "fat per Kcal", yet the
# expression multiplies by actual_Kcal -- confirm the intended unit.
df <- read_csv(
  "ERAE_2019_data_initial_version_23_may_19_noNSRL_newnames.csv"
) %>%
  mutate(
    FatKcal = (lipides_100 / 100) * actual_weight * actual_Kcal,
    SFAKcal = (ags_100 / 100) * actual_weight * actual_Kcal,
    SugarKcal = (sucres_100 / 100) * actual_weight * actual_Kcal,
    SaltKcal = (sel_100 / 100) * actual_weight * actual_Kcal
  )
# Demographic variables
# Recode education (French level codes) into three readable levels and turn
# age into a categorical variable, both in a single mutate() call.
# NOTE(review): the level "universty or >" is misspelled, and the last age band
# is labelled ">60" although it covers (59, 100] and therefore includes 60.
# Both strings are written to the output file, so they are kept verbatim
# here -- confirm with downstream users before renaming.
df <- df %>%
  mutate(
    edu_categorical = fct_recode(
      education,
      "< high school" = "aucun",
      "< high school" = "bp",
      "< high school" = "cap",
      "high school" = "bac",
      "universty or >" = "sup2",
      "universty or >" = "sup4",
      "universty or >" = "autre"
    ),
    # cut() defaults to right-closed bins: (0,29], (29,44], (44,59], (59,100];
    # ages outside (0, 100] become NA.
    age_categorical = cut(
      age,
      breaks = c(0, 29, 44, 59, 100),
      labels = c("<30", "30-44", "45-59", ">60")
    )
  )
# Computing standard of living: map every income bracket to a single number
# and fill in the missing children counts.
# Income values that match none of the brackets below become NA, because
# case_when() has no fallback branch.
# NOTE(review): brackets "1000_2000" through "7000_8000" map to their midpoint,
# while "0_1000" maps to 1000 and "8000_plus" to 8000 -- confirm this is
# intended before relying on income_numeric.
df <- df %>%
  mutate(
    income_numeric = case_when(
      income == "0_1000" ~ 1000,
      income == "1000_2000" ~ 1500,
      income == "2000_3000" ~ 2500,
      income == "3000_4000" ~ 3500,
      income == "4000_5000" ~ 4500,
      income == "5000_6000" ~ 5500,
      income == "6000_7000" ~ 6500,
      income == "7000_8000" ~ 7500,
      income == "8000_plus" ~ 8000
    ),
    # 0 children was coded as NA
    children = coalesce(children, 0)
  )
# Living standard per year, reduced to one row per distinct
# (subject, living_standard_year) pair, then split into three groups with
# ntile() and joined back onto df by subject.
#
# consumption_units = 1, plus 0.5 for every household member beyond the first
# who is not counted in children, plus 0.3 per child.
# living_standard_year = income_numeric / consumption_units * 12.
#
# `ls` is a temporary table; it is removed at the end of the script.
ls <- df %>%
  mutate(
    consumption_units = 1 + (familysize - children - 1) * 0.5 + children * 0.3,
    living_standard = income_numeric / consumption_units,
    living_standard_year = living_standard * 12
  ) %>%
  ungroup() %>%
  select(subject, living_standard_year) %>%
  distinct() %>%
  mutate(incomeclass = ntile(living_standard_year, 3))

# Adds living_standard_year and incomeclass to every row of df.
df <- left_join(df, ls, by = "subject")
## Eliminate unused variables (mostly nutritional details) in a single
## select() call. Only negative selections are used, so the columns that
## remain keep their existing order.
df <- df %>%
  select(
    # general columns
    -session, -disp, -ingr, -nutr, -remo, -Nremo, -Ndisp, -picture, -view,
    -treatcol, -FSAJulia, -lettre_NSRL, -couleur_NSRL, -income2,
    # per-100 values
    -kcal_100, -glucides_100, -proteines_100, -fibres_100,
    -kcal_100_NM, -kcal_100_NC, -kcal_100_NR,
    # every column whose name starts with "kj"
    -starts_with("kj"),
    # *_NM columns
    -portion_NM, -portion_marque_NM, -unit_portion_NM, -nb_portion_NM,
    -unit_portion_marque_NM, -kcal_portion_NM, -kj_portion_NM,
    -pourcent_kcal_portion_NM, -lipides_portion_NM,
    -pourcent_lipides_portion_NM, -ags_portion_NM, -sel_portion_NM,
    -pourcent_ags_portion_NM, -pourcent_sucres_portion_NM,
    -sucres_portion_NM, -pourcent_sel_portion_NM,
    # *_NC columns
    -sel_portion_NC, -lipides_portion_NC, -unit_portion_NC,
    -sucres_portion_NC, -pourcent_kcal_portion_NC,
    -pourcent_lipides_portion_NC, -pourcent_ags_portion_NC,
    -pourcent_sucres_portion_NC, -pourcent_sel_portion_NC,
    -portion_NC, -kcal_portion_NC, -ags_portion_NC,
    # *_NR columns
    -unit_portion_NR, -sucres_portion_NR, -portion_NR, -ags_portion_NR,
    -kcal_portion_NR, -lipides_portion_NR, -sel_portion_NR
  )
# Write the cleaned data set to a CSV file in the working directory.
write_csv(df, "Crosetto_et_al_ERAE2019_data.csv")

## cleaning up: remove the temporary living-standard table
rm(list = "ls")