From d1db74460e7fd1cfa5d789f71c98f6f16f61f53e Mon Sep 17 00:00:00 2001 From: storresrod <69353664+storresrod@users.noreply.github.com> Date: Thu, 28 Dec 2023 16:03:53 -0500 Subject: [PATCH 1/5] Updates to the imputation script These updates include: 1) including all predictors in list of imputed variables (see comments in past PR) 2) removing hisp from list of predictors 3) reporting imputed values for list of predictors and 4) adding a bar chart to look at new dad_education imputation --- NLSY/nlsy_imputation.qmd | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/NLSY/nlsy_imputation.qmd b/NLSY/nlsy_imputation.qmd index 7e8a52c..2d7b21b 100644 --- a/NLSY/nlsy_imputation.qmd +++ b/NLSY/nlsy_imputation.qmd @@ -133,7 +133,7 @@ round(m$mr/(m$mr + m$mm), 3) #looking for low proportions: where both target and #Note: For row target pincome and pnetworth respectively, predictors dad_educ_hs, ret_sav, and hown have lowest proportions impute <- impute %>% - select(-dad_educ_hs, -race) # I suspect multi-collinearity is causing singularity in the multiple imputation below, so removed race and only kept interaction race_eth for now + select(-race, -hisp) # I suspect multi-collinearity is causing singularity in the multiple imputation below, so removed race and only kept interaction race_eth for now ``` ## Multiple Imputation @@ -147,7 +147,7 @@ predM = init$predictorMatrix # Set distributions num <- c("mom_age") # numerical variables -log <- c("hisp", "ret_sav", "hown", "both_pars") # binary variables +log <- c("ret_sav", "hown", "both_pars") # binary variables poly <- c("mom_educ_hs") # ordered categorical variables poly2 <- c("race", "race_eth")# Unordered categorical variable @@ -155,8 +155,8 @@ poly2 <- c("race", "race_eth")# Unordered categorical variable predM[,c("id")] <- 0 # not using as a predictor variable predM[c("id"),] <- 0 # not using as a predictor variable predM[c("race_eth"),] <- 1 # not using as a predictor variable -predM[,c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "both_pars", "race_eth")] <- 0 # not imputing -meth[c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "both_pars", "race_eth")]= "" #not imputing missing values, just using as a predictor +#predM[,c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "both_pars", "race_eth")] <- 0 +#meth[c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "both_pars", "race_eth")]= "" # Income and Wealth imputation set.seed(25) @@ -166,33 +166,29 @@ imputed <- complete(imputed, action = "repeated", include = FALSE) final <- imputed %>% rename(id = id.1, - mom_age = mom_age.1, - ret_sav = ret_sav.1, - hown = hown.1, - mom_educ_hs = mom_educ_hs.1, - hisp = hisp.1, - both_pars = both_pars.1, race_eth = race_eth.1) %>% - select(-starts_with("id."), -starts_with("mom_age."), -starts_with("ret_sav."), -starts_with("hown."), -starts_with("mom_educ_hs."), -starts_with("dad_educ_hs."), -starts_with("both_pars."), -starts_with("hisp."), -starts_with("race_eth.")) + select(-starts_with("id."), -starts_with("race_eth")) ``` ```{r} # Join back with original income and wealth og_values <- impute %>% - select(id, pincome, pnetworth) + select(id, pincome, pnetworth, race_eth, dad_educ_hs) final <- merge(x=final,y=og_values, by="id", all.x=TRUE) income <- final %>% select(id, starts_with("pincome")) - wealth <- final %>% select(id, starts_with("pnetworth")) +dad_ed <- final %>% + select(id, starts_with("dad_")) pincome <- tidyr::gather(income, key = "imputation", value ="pincome", matches("pincome"), -id) pnetworth <- tidyr::gather(wealth, key = "imputation", value ="pnetworth", matches("pnetworth"), -id) +dad_educ <- tidyr::gather(dad_ed, key = "imputation", value ="dad_educ", matches("dad_educ"), -id) ``` ## Distribution Plots for Parent Income (non-imputed and imputed) @@ -210,6 +206,12 @@ ggplot(pnetworth, aes(x=pnetworth, fill= imputation)) + geom_density(alpha = 0.5) + facet_wrap(~imputation, scales="free") ``` +```{r} +ggplot(dad_educ, aes(x=dad_educ, fill= imputation)) + + geom_bar(alpha = 0.5) + + facet_wrap(~imputation, scales="free") +# Select a larger graphic to see the x-axis labels +``` ## Whisker Plots ```{r} From 97099981b7ff11068a1e36d66ff1a97c38c23032 Mon Sep 17 00:00:00 2001 From: storresrod <69353664+storresrod@users.noreply.github.com> Date: Fri, 29 Dec 2023 10:30:19 -0500 Subject: [PATCH 2/5] Responding to edits in imputation script --- .gitignore | 1 + NLSY/nlsy_imputation.qmd | 24 +++++++++++++++++------- NLSY/nlsy_lib.R | 28 ++-------------------------- 3 files changed, 20 insertions(+), 33 deletions(-) diff --git a/.gitignore b/.gitignore index 1a4c4f5..3e53323 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,4 @@ !*renv.activate.R .Rproj.user !*.csv +NLSY/NLSY-college-finance_imp.csv diff --git a/NLSY/nlsy_imputation.qmd b/NLSY/nlsy_imputation.qmd index 2d7b21b..d3952f3 100644 --- a/NLSY/nlsy_imputation.qmd +++ b/NLSY/nlsy_imputation.qmd @@ -46,8 +46,8 @@ test <- nlsydf %>% #Add additional variables that should be considered during mu # Making dummy or factor variables test <- test |> mutate( - mom_educ_hs = factor(nlsy_encode_educ4(mom_education)), - dad_educ_hs = factor(nlsy_encode_educ4(dad_education)), + mom_educ_hs = nlsy_encode_educ5(mom_education), + dad_educ_hs = nlsy_encode_educ5(dad_education), race = factor(race), hisp = factor(hispanic), race_eth = ifelse(hisp == "Yes" & race == "White", "White Hispanic", NA), # Race and ethnicity interaction @@ -157,7 +157,11 @@ predM[c("id"),] <- 0 # not using as a predictor variable predM[c("race_eth"),] <- 1 # not using as a predictor variable #predM[,c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "both_pars", "race_eth")] <- 0 #meth[c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "both_pars", "race_eth")]= "" +meth +predM +``` +```{r} # Income and Wealth imputation set.seed(25) imputed = mice(impute, method=meth, predictorMatrix=predM, m=5) @@ -168,7 +172,6 @@ final <- imputed %>% rename(id = id.1, race_eth = race_eth.1) %>% select(-starts_with("id."), -starts_with("race_eth")) - ``` ```{r} @@ -206,20 +209,28 @@ ggplot(pnetworth, aes(x=pnetworth, fill= imputation)) + geom_density(alpha = 0.5) + facet_wrap(~imputation, scales="free") ``` + +## Distribution Plots for Dad Education (non-imputed and imputed) + ```{r} -ggplot(dad_educ, aes(x=dad_educ, fill= imputation)) + +ggplot(dad_educ, aes(y= factor(dad_educ, level = c( + "Less than high school", + "High-school graduate", + "Some college", + "College degree", + "Graduate degree" + )), fill= imputation)) + geom_bar(alpha = 0.5) + facet_wrap(~imputation, scales="free") -# Select a larger graphic to see the x-axis labels ``` ## Whisker Plots + ```{r} boxplot(pincome$pincome ~ pincome$imputation) boxplot(pnetworth$pnetworth ~ pnetworth$imputation) ``` - ## Scatter Plots for both parent income and wealth (non-imputed is zero) ```{r} @@ -235,4 +246,3 @@ ggplot(merge, aes(x=pincome, y=pnetworth, color = imputation)) + geom_point(alpha = 0.5) + facet_wrap(~imputation, scales="free") ``` - diff --git a/NLSY/nlsy_lib.R b/NLSY/nlsy_lib.R index 16fa915..701f38d 100644 --- a/NLSY/nlsy_lib.R +++ b/NLSY/nlsy_lib.R @@ -282,34 +282,11 @@ nlsy_get_student_loans_df = function() return(sloandf) } -# Encode education variable -nlsy_encode_educ4 = function(var) -{ - return( case_when( - {{var}} %in% c( - "12TH GRADE" - ) ~ "High-school graduate", - {{var}} %in% c( - "1ST YEAR COLLEGE", - "2ND YEAR COLLEGE", - "3RD YEAR COLLEGE" - ) ~ "Some college", - {{var}} %in% c( - "4TH YEAR COLLEGE", - "5TH YEAR COLLEGE" - ) ~ "College degree", - {{var}} %in% c( - "6TH YEAR COLLEGE", - "7TH YEAR COLLEGE", - "8TH YEAR COLLEGE" - ) ~ "Graduate degree" - ) - ) -} #' Encodes education into 5 levels nlsy_encode_educ5 = function(var, factorize=FALSE) { + edlevels = c( "Less than high school", "High-school graduate", @@ -351,9 +328,8 @@ nlsy_encode_educ5 = function(var, factorize=FALSE) ) ~ edlevels[5] ) - if(factorize) { x = factor(x, levels=edlevels, ordered=TRUE) - } + return(x) } From 6058d1e46541ac8103bac29c2be69e6d271cca98 Mon Sep 17 00:00:00 2001 From: storresrod <69353664+storresrod@users.noreply.github.com> Date: Fri, 29 Dec 2023 11:34:09 -0500 Subject: [PATCH 3/5] Updates to factorization --- NLSY/nlsy_imputation.qmd | 4 ++-- NLSY/nlsy_lib.R | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/NLSY/nlsy_imputation.qmd b/NLSY/nlsy_imputation.qmd index d3952f3..4d205f2 100644 --- a/NLSY/nlsy_imputation.qmd +++ b/NLSY/nlsy_imputation.qmd @@ -46,8 +46,8 @@ test <- nlsydf %>% #Add additional variables that should be considered during mu # Making dummy or factor variables test <- test |> mutate( - mom_educ_hs = nlsy_encode_educ5(mom_education), - dad_educ_hs = nlsy_encode_educ5(dad_education), + mom_educ_hs = nlsy_encode_educ5(mom_education, factorize=TRUE), + dad_educ_hs = nlsy_encode_educ5(dad_education, factorize=TRUE), race = factor(race), hisp = factor(hispanic), race_eth = ifelse(hisp == "Yes" & race == "White", "White Hispanic", NA), # Race and ethnicity interaction diff --git a/NLSY/nlsy_lib.R b/NLSY/nlsy_lib.R index 701f38d..52d23f6 100644 --- a/NLSY/nlsy_lib.R +++ b/NLSY/nlsy_lib.R @@ -328,7 +328,9 @@ nlsy_encode_educ5 = function(var, factorize=FALSE) ) ~ edlevels[5] ) + if(factorize) { x = factor(x, levels=edlevels, ordered=TRUE) + } return(x) } From ffe0f354a746c4e066e697c8b4b4fba8a43c72c8 Mon Sep 17 00:00:00 2001 From: Damir Cosic Date: Fri, 29 Dec 2023 17:51:20 -0500 Subject: [PATCH 4/5] Creates nlsy_get_educ5_levels(). Adds plots of dad's education by race. --- NLSY/nlsy_imputation.html | 1591 +++++++++++++++++++++++++------------ NLSY/nlsy_imputation.qmd | 43 +- NLSY/nlsy_lib.R | 16 +- 3 files changed, 1116 insertions(+), 534 deletions(-) diff --git a/NLSY/nlsy_imputation.html b/NLSY/nlsy_imputation.html index 0a7ff60..ad84ebe 100644 --- a/NLSY/nlsy_imputation.html +++ b/NLSY/nlsy_imputation.html @@ -2,7 +2,7 @@ - + @@ -17,9 +17,9 @@ ul.task-list{list-style: none;} ul.task-list li input[type="checkbox"] { width: 0.8em; -margin: 0 0.8em 0.2em -1.6em; -vertical-align: middle; +margin: 0 0.8em 0.2em -1em; vertical-align: middle; } + pre > code.sourceCode { white-space: pre; position: relative; } pre > code.sourceCode > span { display: inline-block; line-height: 1.25; } pre > code.sourceCode > span:empty { height: 1.2em; } @@ -46,53 +46,23 @@ -khtml-user-select: none; -moz-user-select: none; -ms-user-select: none; user-select: none; padding: 0 4px; width: 4em; -color: #aaaaaa; } -pre.numberSource { margin-left: 3em; border-left: 1px solid #aaaaaa; padding-left: 4px; } +pre.numberSource { margin-left: 3em; padding-left: 4px; } div.sourceCode { } @media screen { pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; } } -code span.al { color: #ff0000; font-weight: bold; } -code span.an { color: #60a0b0; font-weight: bold; font-style: italic; } -code span.at { color: #7d9029; } -code span.bn { color: #40a070; } -code span.bu { color: #008000; } -code span.cf { color: #007020; font-weight: bold; } -code span.ch { color: #4070a0; } -code span.cn { color: #880000; } -code span.co { color: #60a0b0; font-style: italic; } -code span.cv { color: #60a0b0; font-weight: bold; font-style: italic; } -code span.do { color: #ba2121; font-style: italic; } -code span.dt { color: #902000; } -code span.dv { color: #40a070; } -code span.er { color: #ff0000; font-weight: bold; } -code span.ex { } -code span.fl { color: #40a070; } -code span.fu { color: #06287e; } -code span.im { color: #008000; font-weight: bold; } -code span.in { color: #60a0b0; font-weight: bold; font-style: italic; } -code span.kw { color: #007020; font-weight: bold; } -code span.op { color: #666666; } -code span.ot { color: #007020; } -code span.pp { color: #bc7a00; } -code span.sc { color: #4070a0; } -code span.ss { color: #bb6688; } -code span.st { color: #4070a0; } -code span.va { color: #19177c; } -code span.vs { color: #4070a0; } -code span.wa { color: #60a0b0; font-weight: bold; font-style: italic; } +!function(t,e){"object"==typeof exports&&"object"==typeof module?module.exports=e():"function"==typeof define&&define.amd?define([],e):"object"==typeof exports?exports.ClipboardJS=e():t.ClipboardJS=e()}(this,function(){return n={686:function(t,e,n){"use strict";n.d(e,{default:function(){return b}});var e=n(279),i=n.n(e),e=n(370),u=n.n(e),e=n(817),r=n.n(e);function c(t){try{return document.execCommand(t)}catch(t){return}}var a=function(t){t=r()(t);return c("cut"),t};function o(t,e){var n,o,t=(n=t,o="rtl"===document.documentElement.getAttribute("dir"),(t=document.createElement("textarea")).style.fontSize="12pt",t.style.border="0",t.style.padding="0",t.style.margin="0",t.style.position="absolute",t.style[o?"right":"left"]="-9999px",o=window.pageYOffset||document.documentElement.scrollTop,t.style.top="".concat(o,"px"),t.setAttribute("readonly",""),t.value=n,t);return e.container.appendChild(t),e=r()(t),c("copy"),t.remove(),e}var f=function(t){var e=1 - + - + @@ -2603,7 +3001,7 @@
#Non-significant: par_dec - dropping for now
@@ -2748,16 +3150,25 @@

drop_na() M <- cor(testing_cor) -corrplot(M, type = "upper") +cor(testing_cor) +
+
              age_resp   mom_age     ret_sav       hown   both_pars
+age_resp   1.000000000 0.2068723 0.007470059 0.02104089 -0.03181587
+mom_age    0.206872279 1.0000000 0.151656832 0.20570676  0.23089952
+ret_sav    0.007470059 0.1516568 1.000000000 0.41820682  0.26668117
+hown       0.021040887 0.2057068 0.418206822 1.00000000  0.37400195
+both_pars -0.031815873 0.2308995 0.266681165 0.37400195  1.00000000
+
+
corrplot(M, type = "upper")
-

+

Checking for multi-collinearity - VIF and Successive addition of regressors

-
summary(model <- lm(pincome ~ mom_age, data = test)) #Adj R2 = 0.057
+
summary(model <- lm(pincome ~ mom_age, data = test)) #Adj R2 = 0.057

 Call:
@@ -2779,7 +3190,7 @@ 

summary(model <- lm(pincome ~ mom_age + hisp, data = test)) #Adj R2 = 0.09

+
summary(model <- lm(pincome ~ mom_age + hisp, data = test)) #Adj R2 = 0.09

 Call:
@@ -2802,12 +3213,12 @@ 

car::vif(model)

+
car::vif(model)
 mom_age     hisp 
 1.000325 1.000325 
-
summary(model <- lm(pincome ~ mom_age + hisp + race, data = test)) # #Adj R2 = 0.157
+
summary(model <- lm(pincome ~ mom_age + hisp + race, data = test)) # #Adj R2 = 0.157

 Call:
@@ -2834,14 +3245,14 @@ 

car::vif(model)

+
car::vif(model)
            GVIF Df GVIF^(1/(2*Df))
 mom_age 1.030125  1        1.014951
 hisp    1.616650  1        1.271476
 race    1.662056  4        1.065567
-
summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs, data = test)) #Adj R2 = 0.1899
+
summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs, data = test)) #Adj R2 = 0.1899

 Call:
@@ -2849,37 +3260,38 @@ 

car::vif(model)

+
car::vif(model)
                GVIF Df GVIF^(1/(2*Df))
-mom_age     1.083580  1        1.040952
-hisp        1.516648  1        1.231523
-race        1.588827  4        1.059582
-mom_educ_hs 1.092325  3        1.014827
+mom_age 1.084882 1 1.041577 +hisp 1.767578 1 1.329503 +race 1.740222 4 1.071706 +mom_educ_hs 1.240199 4 1.027274
-
summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs, data = test)) #Adj R2 = 0.1616
+
summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs, data = test)) #Adj R2 = 0.1616

 Call:
@@ -2888,41 +3300,43 @@ 

car::vif(model)

+
car::vif(model)
                GVIF Df GVIF^(1/(2*Df))
-mom_age     1.116575  1        1.056681
-hisp        1.458563  1        1.207710
-race        1.519815  4        1.053717
-mom_educ_hs 1.373641  3        1.054336
-dad_educ_hs 1.400450  3        1.057738
+mom_age 1.104122 1 1.050772 +hisp 1.721196 1 1.311943 +race 1.664009 4 1.065723 +mom_educ_hs 1.912808 4 1.084449 +dad_educ_hs 1.915439 4 1.084635
-
summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav, data = test)) #Adj R2 = 0.199
+
summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav, data = test)) #Adj R2 = 0.199

 Call:
@@ -2931,43 +3345,45 @@ 

car::vif(model)

+
car::vif(model)
                GVIF Df GVIF^(1/(2*Df))
-mom_age     1.117451  1        1.057096
-hisp        1.470673  1        1.212713
-race        1.567470  4        1.057791
-mom_educ_hs 1.387635  3        1.056118
-dad_educ_hs 1.411949  3        1.059180
-ret_sav     1.074406  1        1.036535
+mom_age 1.107847 1 1.052543 +hisp 1.755546 1 1.324970 +race 1.716629 4 1.069879 +mom_educ_hs 1.981531 4 1.089244 +dad_educ_hs 1.981972 4 1.089274 +ret_sav 1.233499 1 1.110630
-
summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav + hown, data = test)) #Adj R2 = 0.2107
+
summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav + hown, data = test)) #Adj R2 = 0.2107

 Call:
@@ -2976,45 +3392,47 @@ 

car::vif(model)

+
car::vif(model)
                GVIF Df GVIF^(1/(2*Df))
-mom_age     1.139122  1        1.067297
-hisp        1.443144  1        1.201309
-race        1.552772  4        1.056546
-mom_educ_hs 1.391807  3        1.056647
-dad_educ_hs 1.411255  3        1.059093
-ret_sav     1.109899  1        1.053517
-hown        1.099569  1        1.048603
+mom_age 1.121945 1 1.059219 +hisp 1.758563 1 1.326108 +race 1.719804 4 1.070126 +mom_educ_hs 2.041947 4 1.093341 +dad_educ_hs 2.052989 4 1.094078 +ret_sav 1.281289 1 1.131940 +hown 1.199491 1 1.095213
-
summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav + hown + both_pars, data = test)) #Adj R2 = 0.2108
+
summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav + hown + both_pars, data = test)) #Adj R2 = 0.2108

 Call:
@@ -3023,45 +3441,47 @@ 

car::vif(model)

+
car::vif(model)
                GVIF Df GVIF^(1/(2*Df))
-mom_age     1.198064  1        1.094561
-hisp        1.445199  1        1.202164
-race        1.559232  4        1.057095
-mom_educ_hs 1.392832  3        1.056776
-dad_educ_hs 1.421260  3        1.060341
-ret_sav     1.110001  1        1.053566
-hown        1.115309  1        1.056082
-both_pars   1.115809  1        1.056319
+mom_age 1.178926 1 1.085783 +hisp 1.768026 1 1.329671 +race 1.725004 4 1.070530 +mom_educ_hs 2.044517 4 1.093513 +dad_educ_hs 2.066047 4 1.094946 +ret_sav 1.281342 1 1.131964 +hown 1.214451 1 1.102021 +both_pars 1.113247 1 1.055105
@@ -3069,33 +3489,33 @@

Checking for missings in the predictor variables

-
impute <- test %>%
-    select(-hispanic, -mom_education, -dad_education, -savings, -home, -both_parents, -par1_deceased, -par2_deceased, - mom_age_birth, -par_dec, -age_resp)
-t <- nearZeroVar(impute) # no variables are found to have near zero variance
-
-sapply(impute, function(x) sum(is.na(x))) # dad_educ_hs has 4598 missings
+
impute <- test %>%
+    select(-hispanic, -mom_education, -dad_education, -savings, -home, -both_parents, -par1_deceased, -par2_deceased, - mom_age_birth, -par_dec, -age_resp)
+t <- nearZeroVar(impute) # no variables are found to have near zero variance
+
+sapply(impute, function(x) sum(is.na(x))) # dad_educ_hs has 4598 missings
         id        race     pincome   pnetworth     mom_age mom_educ_hs 
-          0          80        2396        2365         608        2879 
+          0          80        2396        2365         608        1068 
 dad_educ_hs        hisp    race_eth     ret_sav        hown   both_pars 
-       4598          24          88        1194        1616           0 
+ 3433 24 88 1194 1616 0
-
# Calculating percent of usable cases
-m <- md.pairs(impute)
-round(m$mr/(m$mr + m$mm), 3) #looking for low proportions: where both target and predictor are missing on the same cases
+
# Calculating percent of usable cases
+m <- md.pairs(impute)
+round(m$mr/(m$mr + m$mm), 3) #looking for low proportions: where both target and predictor are missing on the same cases
             id  race pincome pnetworth mom_age mom_educ_hs dad_educ_hs  hisp
 id          NaN   NaN     NaN       NaN     NaN         NaN         NaN   NaN
-race          1 0.000   0.600     0.562   0.838       0.312       0.200 0.800
-pincome       1 0.987   0.000     0.328   0.903       0.608       0.442 0.996
-pnetworth     1 0.985   0.319     0.000   0.901       0.658       0.526 0.994
-mom_age       1 0.979   0.618     0.613   0.000       0.497       0.428 0.985
-mom_educ_hs   1 0.981   0.673     0.719   0.894       0.000       0.225 0.993
-dad_educ_hs   1 0.986   0.709     0.756   0.924       0.515       0.000 0.996
-hisp          1 0.333   0.625     0.375   0.625       0.167       0.167 0.000
-race_eth      1 0.091   0.602     0.568   0.830       0.307       0.193 0.727
-ret_sav       1 0.972   0.073     0.000   0.869       0.555       0.440 0.989
-hown          1 0.981   0.338     0.296   0.892       0.546       0.399 0.993
+race          1 0.000   0.600     0.562   0.838       0.713       0.488 0.800
+pincome       1 0.987   0.000     0.328   0.903       0.837       0.580 0.996
+pnetworth     1 0.985   0.319     0.000   0.901       0.833       0.648 0.994
+mom_age       1 0.979   0.618     0.613   0.000       0.660       0.533 0.985
+mom_educ_hs   1 0.978   0.635     0.629   0.806       0.000       0.358 0.982
+dad_educ_hs   1 0.988   0.707     0.757   0.917       0.800       0.000 0.994
+hisp          1 0.333   0.625     0.375   0.625       0.208       0.167 0.000
+race_eth      1 0.091   0.602     0.568   0.830       0.670       0.455 0.727
+ret_sav       1 0.972   0.073     0.000   0.869       0.781       0.600 0.989
+hown          1 0.981   0.338     0.296   0.892       0.811       0.584 0.993
 both_pars   NaN   NaN     NaN       NaN     NaN         NaN         NaN   NaN
             race_eth ret_sav  hown both_pars
 id               NaN     NaN   NaN       NaN
@@ -3103,222 +3523,255 @@ 

#Note: For row target pincome and pnetworth respectively, predictors dad_educ_hs, ret_sav, and hown have lowest proportions
-
-impute <- impute %>%
-    select(-dad_educ_hs, -race) # I suspect multi-collinearity is causing singularity in the multiple imputation below, so removed race and only kept interaction race_eth for now

+
#Note: For row target pincome and pnetworth respectively, predictors dad_educ_hs, ret_sav, and hown have lowest proportions
+
+impute <- impute %>%
+    select(-race, -hisp) # I suspect multi-collinearity is causing singularity in the multiple imputation below, so removed race and only kept interaction race_eth for now

Multiple Imputation

-
# Paper that uses multiple imputation: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6049093/
-# Multiple imputation in R reference: https://library.virginia.edu/data/articles/getting-started-with-multiple-imputation-in-r
-init = mice(impute, maxit=10)
+
# Paper that uses multiple imputation: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6049093/
+# Multiple imputation in R reference: https://library.virginia.edu/data/articles/getting-started-with-multiple-imputation-in-r
+init = mice(impute, maxit=5)

  iter imp variable
-  1   1  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  1   2  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  1   3  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  1   4  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  1   5  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  2   1  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  2   2  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  2   3  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  2   4  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  2   5  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  3   1  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  3   2  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  3   3  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  3   4  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  3   5  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  4   1  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  4   2  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  4   3  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  4   4  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  4   5  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  5   1  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  5   2  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  5   3  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  5   4  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  5   5  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  6   1  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  6   2  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  6   3  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  6   4  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  6   5  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  7   1  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  7   2  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  7   3  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  7   4  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  7   5  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  8   1  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  8   2  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  8   3  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  8   4  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  8   5  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  9   1  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  9   2  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  9   3  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  9   4  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  9   5  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  10   1  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  10   2  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  10   3  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  10   4  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
-  10   5  pincome  pnetworth  mom_age  mom_educ_hs  hisp  race_eth  ret_sav  hown
+ 1 1 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 1 2 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 1 3 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 1 4 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 1 5 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 2 1 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 2 2 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 2 3 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 2 4 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 2 5 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 3 1 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 3 2 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 3 3 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 3 4 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 3 5 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 4 1 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 4 2 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 4 3 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 4 4 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 4 5 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 5 1 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 5 2 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 5 3 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 5 4 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 5 5 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown
+
+
meth = init$method
+predM = init$predictorMatrix
+
+# Set distributions
+num <- c("mom_age") # numerical variables
+log <- c("ret_sav", "hown", "both_pars") # binary variables
+poly <- c("mom_educ_hs") # ordered categorical variables
+poly2 <- c("race", "race_eth")# Unordered categorical variable
+
+# Assign Variables to be imputed
+predM[,c("id")] <- 0 # not using as a predictor variable
+predM[c("id"),] <- 0 # not using as a predictor variable
+predM[c("race_eth"),] <- 1 # not using as a predictor variable
+#predM[,c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "both_pars", "race_eth")] <- 0 
+#meth[c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "both_pars", "race_eth")]= "" 
+meth
+
+
         id     pincome   pnetworth     mom_age mom_educ_hs dad_educ_hs 
+         ""       "pmm"       "pmm"       "pmm"      "polr"      "polr" 
+   race_eth     ret_sav        hown   both_pars 
+  "polyreg"       "pmm"       "pmm"          "" 
-
meth = init$method
-predM = init$predictorMatrix
-
-# Set distributions
-num <- c("mom_age") # numerical variables
-log <- c("hisp", "ret_sav", "hown", "both_pars") # binary variables
-poly <- c("mom_educ_hs") # ordered categorical variables
-poly2 <- c("race", "race_eth")# Unordered categorical variable
-
-# Assign Variables to be imputed
-predM[,c("id")] <- 0 # not using as a predictor variable
-predM[c("id"),] <- 0 # not using as a predictor variable
-predM[c("race_eth"),] <- 1 # not using as a predictor variable
-predM[,c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "both_pars", "race_eth")] <- 0 # not imputing
-meth[c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "both_pars", "race_eth")]= "" #not imputing missing values, just using as a predictor
-
-# Income and Wealth imputation
-set.seed(25)
-imputed = mice(impute, method=meth, predictorMatrix=predM, m=10)
+
predM
+
+
            id pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth
+id           0       0         0       0           0           0        0
+pincome      0       0         1       1           1           1        1
+pnetworth    0       1         0       1           1           1        1
+mom_age      0       1         1       0           1           1        1
+mom_educ_hs  0       1         1       1           0           1        1
+dad_educ_hs  0       1         1       1           1           0        1
+race_eth     1       1         1       1           1           1        1
+ret_sav      0       1         1       1           1           1        1
+hown         0       1         1       1           1           1        1
+both_pars    0       1         1       1           1           1        1
+            ret_sav hown both_pars
+id                0    0         0
+pincome           1    1         1
+pnetworth         1    1         1
+mom_age           1    1         1
+mom_educ_hs       1    1         1
+dad_educ_hs       1    1         1
+race_eth          1    1         1
+ret_sav           0    1         1
+hown              1    0         1
+both_pars         1    1         0
+
+
+
+
# Income and Wealth imputation
+set.seed(25)
+imputed = mice(impute, method=meth, predictorMatrix=predM, m=5)

  iter imp variable
-  1   1  pincome  pnetworth
-  1   2  pincome  pnetworth
-  1   3  pincome  pnetworth
-  1   4  pincome  pnetworth
-  1   5  pincome  pnetworth
-  1   6  pincome  pnetworth
-  1   7  pincome  pnetworth
-  1   8  pincome  pnetworth
-  1   9  pincome  pnetworth
-  1   10  pincome  pnetworth
-  2   1  pincome  pnetworth
-  2   2  pincome  pnetworth
-  2   3  pincome  pnetworth
-  2   4  pincome  pnetworth
-  2   5  pincome  pnetworth
-  2   6  pincome  pnetworth
-  2   7  pincome  pnetworth
-  2   8  pincome  pnetworth
-  2   9  pincome  pnetworth
-  2   10  pincome  pnetworth
-  3   1  pincome  pnetworth
-  3   2  pincome  pnetworth
-  3   3  pincome  pnetworth
-  3   4  pincome  pnetworth
-  3   5  pincome  pnetworth
-  3   6  pincome  pnetworth
-  3   7  pincome  pnetworth
-  3   8  pincome  pnetworth
-  3   9  pincome  pnetworth
-  3   10  pincome  pnetworth
-  4   1  pincome  pnetworth
-  4   2  pincome  pnetworth
-  4   3  pincome  pnetworth
-  4   4  pincome  pnetworth
-  4   5  pincome  pnetworth
-  4   6  pincome  pnetworth
-  4   7  pincome  pnetworth
-  4   8  pincome  pnetworth
-  4   9  pincome  pnetworth
-  4   10  pincome  pnetworth
-  5   1  pincome  pnetworth
-  5   2  pincome  pnetworth
-  5   3  pincome  pnetworth
-  5   4  pincome  pnetworth
-  5   5  pincome  pnetworth
-  5   6  pincome  pnetworth
-  5   7  pincome  pnetworth
-  5   8  pincome  pnetworth
-  5   9  pincome  pnetworth
-  5   10  pincome  pnetworth
+ 1 1 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 1 2 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 1 3 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 1 4 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 1 5 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 2 1 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 2 2 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 2 3 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 2 4 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 2 5 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 3 1 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 3 2 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 3 3 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 3 4 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 3 5 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 4 1 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 4 2 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 4 3 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 4 4 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 4 5 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 5 1 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 5 2 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 5 3 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 5 4 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown + 5 5 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth ret_sav hown
-
imputed <- complete(imputed, action = "repeated", include = FALSE)
-
-final <- imputed %>%
-    rename(id = id.1,
-           mom_age = mom_age.1,
-           ret_sav = ret_sav.1,
-           hown = hown.1,
-           mom_educ_hs = mom_educ_hs.1,
-           hisp = hisp.1,
-           both_pars = both_pars.1,
-           race_eth = race_eth.1) %>%
-    select(-starts_with("id."), -starts_with("mom_age."), -starts_with("ret_sav."), -starts_with("hown."), -starts_with("mom_educ_hs."), -starts_with("dad_educ_hs."), -starts_with("both_pars."), -starts_with("hisp."),  -starts_with("race_eth."))
+
imputed <- complete(imputed, action = "repeated", include = FALSE)
+
+final <- imputed %>%
+    rename(id = id.1,
+           race_eth = race_eth.1) %>%
+    select(-starts_with("id."), -starts_with("race_eth"))
-
# Join back with original income and wealth
-og_values <- impute %>%
-    select(id, pincome, pnetworth)
-
-final <- merge(x=final,y=og_values,
-                    by="id", all.x=TRUE)
-
-income <- final %>%
-    select(id, starts_with("pincome"))
-
-wealth <- final %>%
-    select(id, starts_with("pnetworth"))
-
-pincome <- tidyr::gather(income, key = "imputation", value ="pincome", matches("pincome"), -id)
-pnetworth <- tidyr::gather(wealth, key = "imputation", value ="pnetworth", matches("pnetworth"), -id)
+
# Join back with original income and wealth
+og_values <- impute %>%
+    select(id, pincome, pnetworth, race_eth, dad_educ_hs)
+
+final <- merge(x=final,y=og_values,
+                    by="id", all.x=TRUE)
+
+income <- final %>%
+    select(id, starts_with("pincome"))
+wealth <- final %>%
+    select(id, starts_with("pnetworth"))
+dad_ed <- final %>%
+    select(id, starts_with("dad_"))
+
+pincome <- tidyr::gather(income, key = "imputation", value ="pincome", matches("pincome"), -id)
+pnetworth <- tidyr::gather(wealth, key = "imputation", value ="pnetworth", matches("pnetworth"), -id)
+dad_educ  <- tidyr::gather(dad_ed, key = "imputation", value ="dad_educ", matches("dad_educ"), -id)
+dad_educ = mutate(dad_educ, dad_educ = factor(dad_educ, levels=nlsy_get_educ5_levels()))

Distribution Plots for Parent Income (non-imputed and imputed)

-
ggplot(pincome, aes(x=pincome, fill= imputation)) +
-    geom_density(alpha = 0.5) +
-    facet_wrap(~imputation, scales="free")
+
ggplot(pincome, aes(x=pincome, fill= imputation)) +
+    geom_density(alpha = 0.5) +
+    facet_wrap(~imputation, scales="free")
-

+

Distribution Plots for Parent Wealth (non-imputed and imputed)

-
ggplot(pnetworth, aes(x=pnetworth, fill= imputation)) +
-    geom_density(alpha = 0.5) +
-    facet_wrap(~imputation, scales="free")
+
ggplot(pnetworth, aes(x=pnetworth, fill= imputation)) +
+    geom_density(alpha = 0.5) +
+    facet_wrap(~imputation, scales="free")
+
+

+
+
+
+
+

Distribution Plots for Dad Education (non-imputed and imputed)

+
+
tmpdf = dad_educ |>
+    filter(if_all(everything(), ~!is.na(.) )) |>
+    mutate(imputation = if_else(imputation=="dad_educ_hs", "0", imputation),
+           imputation = gsub("dad_educ_hs.", "", imputation))
+
+
+tmpdf |>
+    group_by(imputation) |>
+    mutate(N=n()) |>
+    group_by(dad_educ, imputation) |>
+    summarise(N=max(N), share=n()/N) |>
+    ggplot() +
+    geom_col(
+        aes(y=share, x=imputation, fill=dad_educ), 
+        position = position_stack(reverse = TRUE)
+        )
+
+

+
+
tmpdf |>
+    left_join(select(test, id, race_eth), by="id") |>
+    filter(!is.na(race_eth)) |>
+    group_by(race_eth, imputation) |>
+    mutate(N=n()) |>
+    group_by(dad_educ, race_eth, imputation) |>
+    summarise(N=max(N), share=n()/N) |>
+    ggplot() +
+    geom_col(
+        aes(y=share, x=imputation, fill=dad_educ),
+        position = position_stack(reverse = TRUE)
+        ) +
+    facet_wrap(~race_eth, nrow=2)
+
+

+
+
+
+
+

Whisker Plots

+
+
boxplot(pincome$pincome ~ pincome$imputation)
+
+

+
+
boxplot(pnetworth$pnetworth ~ pnetworth$imputation)
-

+

Scatter Plots for both parent income and wealth (non-imputed is zero)

-
pincome$imputation <- gsub("pincome.", "", pincome$imputation)
-pincome$imputation <- ifelse(pincome$imputation == "pincome", 0, pincome$imputation)
-
-pnetworth$imputation <- gsub("pnetworth.", "", pnetworth$imputation)
-pnetworth$imputation <- ifelse(pnetworth$imputation == "pnetworth", 0, pnetworth$imputation)
-
-merge <- merge(pincome, pnetworth, by = c("id", "imputation"))
-
-ggplot(merge, aes(x=pincome, y=pnetworth, color = imputation)) +
-    geom_point(alpha = 0.5) +
-    facet_wrap(~imputation, scales="free")
+
pincome$imputation <- gsub("pincome.", "", pincome$imputation)
+pincome$imputation <- ifelse(pincome$imputation == "pincome", 0, pincome$imputation)
+
+pnetworth$imputation <- gsub("pnetworth.", "", pnetworth$imputation)
+pnetworth$imputation <- ifelse(pnetworth$imputation == "pnetworth", 0, pnetworth$imputation)
+
+merge <- merge(pincome, pnetworth, by = c("id", "imputation"))
+
+ggplot(merge, aes(x=pincome, y=pnetworth, color = imputation)) +
+    geom_point(alpha = 0.5) +
+    facet_wrap(~imputation, scales="free")
-

+

@@ -3352,9 +3805,23 @@

{ + const doc = window.document; + const targetCell = annoteEl.getAttribute("data-target-cell"); + const targetAnnotation = annoteEl.getAttribute("data-target-annotation"); + const annoteSpan = window.document.querySelector(selectorForAnnotation(targetCell, targetAnnotation)); + const lines = annoteSpan.getAttribute("data-code-lines").split(","); + const lineIds = lines.map((line) => { + return targetCell + "-" + line; + }) + let top = null; + let height = null; + let parent = null; + if (lineIds.length > 0) { + //compute the position of the single el (top and bottom and make a div) + const el = window.document.getElementById(lineIds[0]); + top = el.offsetTop; + height = el.offsetHeight; + parent = el.parentElement.parentElement; + if (lineIds.length > 1) { + const lastEl = window.document.getElementById(lineIds[lineIds.length - 1]); + const bottom = lastEl.offsetTop + lastEl.offsetHeight; + height = bottom - top; + } + if (top !== null && height !== null && parent !== null) { + // cook up a div (if necessary) and position it + let div = window.document.getElementById("code-annotation-line-highlight"); + if (div === null) { + div = window.document.createElement("div"); + div.setAttribute("id", "code-annotation-line-highlight"); + div.style.position = 'absolute'; + parent.appendChild(div); + } + div.style.top = top - 2 + "px"; + div.style.height = height + 4 + "px"; + let gutterDiv = window.document.getElementById("code-annotation-line-highlight-gutter"); + if (gutterDiv === null) { + gutterDiv = window.document.createElement("div"); + gutterDiv.setAttribute("id", "code-annotation-line-highlight-gutter"); + gutterDiv.style.position = 'absolute'; + const codeCell = window.document.getElementById(targetCell); + const gutter = codeCell.querySelector('.code-annotation-gutter'); + gutter.appendChild(gutterDiv); + } + gutterDiv.style.top = top - 2 + "px"; + gutterDiv.style.height = height + 4 + "px"; + } + selectedAnnoteEl = annoteEl; + } + }; + const unselectCodeLines = () => { + const elementsIds = ["code-annotation-line-highlight", "code-annotation-line-highlight-gutter"]; + elementsIds.forEach((elId) => { + const div = window.document.getElementById(elId); + if (div) { + div.remove(); + } + }); + selectedAnnoteEl = undefined; + }; + // Attach click handler to the DT + const annoteDls = window.document.querySelectorAll('dt[data-target-cell]'); + for (const annoteDlNode of annoteDls) { + annoteDlNode.addEventListener('click', (event) => { + const clickedEl = event.target; + if (clickedEl !== selectedAnnoteEl) { + unselectCodeLines(); + const activeEl = window.document.querySelector('dt[data-target-cell].code-annotation-active'); + if (activeEl) { + activeEl.classList.remove('code-annotation-active'); + } + selectCodeLines(clickedEl); + clickedEl.classList.add('code-annotation-active'); + } else { + // Unselect the line + unselectCodeLines(); + clickedEl.classList.remove('code-annotation-active'); + } + }); + } const findCites = (el) => { const parentEl = el.parentElement; if (parentEl) { diff --git a/NLSY/nlsy_imputation.qmd b/NLSY/nlsy_imputation.qmd index 4d205f2..7190472 100644 --- a/NLSY/nlsy_imputation.qmd +++ b/NLSY/nlsy_imputation.qmd @@ -192,6 +192,7 @@ dad_ed <- final %>% pincome <- tidyr::gather(income, key = "imputation", value ="pincome", matches("pincome"), -id) pnetworth <- tidyr::gather(wealth, key = "imputation", value ="pnetworth", matches("pnetworth"), -id) dad_educ <- tidyr::gather(dad_ed, key = "imputation", value ="dad_educ", matches("dad_educ"), -id) +dad_educ = mutate(dad_educ, dad_educ = factor(dad_educ, levels=nlsy_get_educ5_levels())) ``` ## Distribution Plots for Parent Income (non-imputed and imputed) @@ -213,15 +214,39 @@ ggplot(pnetworth, aes(x=pnetworth, fill= imputation)) + ## Distribution Plots for Dad Education (non-imputed and imputed) ```{r} -ggplot(dad_educ, aes(y= factor(dad_educ, level = c( - "Less than high school", - "High-school graduate", - "Some college", - "College degree", - "Graduate degree" - )), fill= imputation)) + - geom_bar(alpha = 0.5) + - facet_wrap(~imputation, scales="free") + +tmpdf = dad_educ |> + filter(if_all(everything(), ~!is.na(.) )) |> + mutate(imputation = if_else(imputation=="dad_educ_hs", "0", imputation), + imputation = gsub("dad_educ_hs.", "", imputation)) + + +tmpdf |> + group_by(imputation) |> + mutate(N=n()) |> + group_by(dad_educ, imputation) |> + summarise(N=max(N), share=n()/N) |> + ggplot() + + geom_col( + aes(y=share, x=imputation, fill=dad_educ), + position = position_stack(reverse = TRUE) + ) + + +tmpdf |> + left_join(select(test, id, race_eth), by="id") |> + filter(!is.na(race_eth)) |> + group_by(race_eth, imputation) |> + mutate(N=n()) |> + group_by(dad_educ, race_eth, imputation) |> + summarise(N=max(N), share=n()/N) |> + ggplot() + + geom_col( + aes(y=share, x=imputation, fill=dad_educ), + position = position_stack(reverse = TRUE) + ) + + facet_wrap(~race_eth, nrow=2) + ``` ## Whisker Plots diff --git a/NLSY/nlsy_lib.R b/NLSY/nlsy_lib.R index 52d23f6..627d5a5 100644 --- a/NLSY/nlsy_lib.R +++ b/NLSY/nlsy_lib.R @@ -282,18 +282,22 @@ nlsy_get_student_loans_df = function() return(sloandf) } - -#' Encodes education into 5 levels -nlsy_encode_educ5 = function(var, factorize=FALSE) +nlsy_get_educ5_levels = function() { - - edlevels = c( + return(c( "Less than high school", "High-school graduate", "Some college", "College degree", "Graduate degree" - ) + )) +} + +#' Encodes education into 5 levels +nlsy_encode_educ5 = function(var, factorize=FALSE) +{ + + edlevels = nlsy_get_educ5_levels() x = case_when( {{var}} %in% c( From 434707bc92a1f41dfb50e43ef2fbb897a81b2b54 Mon Sep 17 00:00:00 2001 From: Damir Cosic Date: Fri, 5 Jan 2024 12:04:46 -0500 Subject: [PATCH 5/5] Refactors NLSY code. --- NLSY/nlsy_imputation.html | 473 +++++++++++++++++++------------------- NLSY/nlsy_imputation.qmd | 97 ++++---- NLSY/nlsy_lib.R | 102 ++++---- modeling_college_ed.qmd | 8 +- 4 files changed, 334 insertions(+), 346 deletions(-) diff --git a/NLSY/nlsy_imputation.html b/NLSY/nlsy_imputation.html index ad84ebe..8c4ed61 100644 --- a/NLSY/nlsy_imputation.html +++ b/NLSY/nlsy_imputation.html @@ -2,7 +2,7 @@ - + @@ -70,7 +70,52 @@ composed: false, }); +const layoutMarginEls = () => { + // Find any conflicting margin elements and add margins to the + // top to prevent overlap + const marginChildren = window.document.querySelectorAll( + ".column-margin.column-container > * " + ); + + let lastBottom = 0; + for (const marginChild of marginChildren) { + if (marginChild.offsetParent !== null) { + // clear the top margin so we recompute it + marginChild.style.marginTop = null; + const top = marginChild.getBoundingClientRect().top + window.scrollY; + console.log({ + childtop: marginChild.getBoundingClientRect().top, + scroll: window.scrollY, + top, + lastBottom, + }); + if (top < lastBottom) { + const margin = lastBottom - top; + marginChild.style.marginTop = `${margin}px`; + } + const styles = window.getComputedStyle(marginChild); + const marginTop = parseFloat(styles["marginTop"]); + + console.log({ + top, + height: marginChild.getBoundingClientRect().height, + marginTop, + total: top + marginChild.getBoundingClientRect().height + marginTop, + }); + lastBottom = top + marginChild.getBoundingClientRect().height + marginTop; + } + } +}; + window.document.addEventListener("DOMContentLoaded", function (_event) { + // Recompute the position of margin elements anytime the body size changes + if (window.ResizeObserver) { + const resizeObserver = new window.ResizeObserver( + throttle(layoutMarginEls, 50) + ); + resizeObserver.observe(window.document.body); + } + const tocEl = window.document.querySelector('nav.toc-active[role="doc-toc"]'); const sidebarEl = window.document.getElementById("quarto-sidebar"); const leftTocEl = window.document.getElementById("quarto-sidebar-toc-left"); @@ -496,32 +541,6 @@ }; }; - // Find any conflicting margin elements and add margins to the - // top to prevent overlap - const marginChildren = window.document.querySelectorAll( - ".column-margin.column-container > * " - ); - - const layoutMarginEls = () => { - let lastBottom = 0; - for (const marginChild of marginChildren) { - if (marginChild.offsetParent !== null) { - // clear the top margin so we recompute it - marginChild.style.marginTop = null; - const top = marginChild.getBoundingClientRect().top + window.scrollY; - if (top < lastBottom) { - const margin = lastBottom - top; - marginChild.style.marginTop = `${margin}px`; - } - const styles = window.getComputedStyle(marginChild); - const marginTop = parseFloat(styles["marginTop"]); - lastBottom = - top + marginChild.getBoundingClientRect().height + marginTop; - } - } - }; - nexttick(layoutMarginEls); - const tabEls = document.querySelectorAll('a[data-bs-toggle="tab"]'); for (const tabEl of tabEls) { const id = tabEl.getAttribute("data-bs-target"); @@ -531,7 +550,6 @@ ); if (columnEl) tabEl.addEventListener("shown.bs.tab", function (event) { - const el = event.srcElement; if (el) { const visibleCls = `${el.id}-margin-content`; @@ -2992,7 +3010,7 @@ .bi-tencent-qq::before { content: "\f8cb"; } .bi-wikipedia::before { content: "\f8cc"; } - + @@ -3047,52 +3065,22 @@

Data Cleaning

source(here::here("NLSY/nlsy_lib.R")) # Read data -nlsydf = readRDS(paste0(here::here(), "/NLSY/NLSY-college-finance.rds")) +# nlsydf = readRDS(paste0(here::here(), "/NLSY/NLSY-college-finance.rds"))
-
test <- nlsydf %>% #Add additional variables that should be considered during multiple imputation
-    select(id = "PUBID_1997",
-           mom_age_birth = "CV_BIO_MOM_AGE_YOUTH_1997", # numeric
-           age_resp =  "CV_AGE_12/31/96_1997",
-           hispanic = "KEY_ETHNICITY_1997", # binary
-           race = "KEY_RACE_1997", # categorical
-           mom_education = "CV_HGC_RES_MOM_1997", # categorical - highest grade completed in 1997 by mom
-           dad_education = "CV_HGC_RES_DAD_1997", # categorical - highest grade completed in 1997 by dad
-           savings = "P5-130_1997", #binary - yes/no retirement savings
-           home = "P5-101_1997", # binary - homeownership or renting
-           both_parents = "YOUTH_BOTHBIO.01_1997",
-           par1_deceased = "YOUTH_NONR1DEAD.01_1997",
-           par2_deceased = "YOUTH_NONR2DEAD.01_1997",
-           pincome = "CV_INCOME_GROSS_YR_1997",
-           pnetworth = "CV_HH_NET_WORTH_P_1997",
-           # Also, it seems that this represents the wealth of the household in which a respondent lives. That means that for a respondent who lives with only one parent, the net-worth variable represents the wealth of that parent.
-           # In the case of an independent respondent, contains respondent's, rather than parents', wealth.
-    ) %>%
-    mutate(mom_age = mom_age_birth + age_resp)
-
-# Making dummy or factor variables
-test <- test |>
-    mutate(
-        mom_educ_hs = nlsy_encode_educ5(mom_education, factorize=TRUE),
-        dad_educ_hs = nlsy_encode_educ5(dad_education, factorize=TRUE),
-        race = factor(race),
-        hisp = factor(hispanic),
-        race_eth = ifelse(hisp == "Yes" & race == "White", "White Hispanic",  NA), # Race and ethnicity interaction
-        race_eth = ifelse(hisp == "No" & race == "White", "White NonHispanic", race_eth),
-        race_eth = ifelse(hisp == "Yes" & race == "Black or African American", "Black Hispanic", race_eth),
-        race_eth = ifelse(hisp == "No" & race == "Black or African American", "Black NonHispanic", race_eth),
-        race_eth = ifelse(hisp == "Yes" & race == "American Indian, Eskimo, or Aleut", "AIAN Hispanic", race_eth),
-        race_eth = ifelse(hisp == "Yes" & race == "Asian or Pacific Islander", "AAPI Hispanic", race_eth),
-        race_eth = ifelse(hisp == "Yes" & race == "Something else? (SPECIFY)", "Other Race Hispanic", race_eth),
-        race_eth = ifelse(hisp == "No" & race == "American Indian, Eskimo, or Aleut", "AIAN NonHispanic", race_eth),
-        race_eth = ifelse(hisp == "No" & race == "Asian or Pacific Islander", "AAPI NonHispanic", race_eth),
-        race_eth = ifelse(hisp == "No" & race == "Something else? (SPECIFY)", "Other Race NonHispanic", race_eth),
-        race_eth = factor(race_eth))
-
-test$ret_sav <- ifelse(test$savings == "YES", 1, 0)
-test$hown <- ifelse(test$home == "OWNS OR IS BUYING; LAND CONTRACT", 1, 0)
-test$both_pars <- ifelse(test$both_parents == "Yes", 1, 0)
-test$par_dec <- ifelse(test$par1_deceased == "Yes" | test$par2_deceased == "Yes", 1, 0) # Either parent deceased
+
basedf = nlsy_get_base_df()
+
+# Making dummy or factor variables
+test <- basedf |>
+    mutate(
+        mom_educ_hs = nlsy_encode_educ5(mom_res_ed, factorize=TRUE),
+        dad_educ_hs = nlsy_encode_educ5(dad_res_ed, factorize=TRUE),
+        race_eth = nlsy_recode_race_and_eth10(race, hisp),
+        race_eth = factor(race_eth),
+        race = factor(race),
+        hisp = factor(hisp),
+        par_dec = ifelse(par1_deceased == "Yes" | par2_deceased == "Yes", 1, 0) # Either parent deceased
+    )
# Looking for significance with non-missing income and wealth
@@ -3100,12 +3088,13 @@ 

Data Cleaning

filter(!is.na(pincome), !is.na(pnetworth)) -summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav + hown + both_pars + par_dec, data = non_missing))
+summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + has_retsav + owns_home + both_parents + par_dec, data = non_missing))

 Call:
 lm(formula = pincome ~ mom_age + hisp + race + mom_educ_hs + 
-    dad_educ_hs + ret_sav + hown + both_pars + par_dec, data = non_missing)
+    dad_educ_hs + has_retsav + owns_home + both_parents + par_dec, 
+    data = non_missing)
 
 Residuals:
     Min      1Q  Median      3Q     Max 
@@ -3128,9 +3117,9 @@ 

Data Cleaning

dad_educ_hs.Q 1917.6 1883.6 1.018 0.308742 dad_educ_hs.C 1188.6 1537.0 0.773 0.439388 dad_educ_hs^4 -396.1 1353.0 -0.293 0.769705 -ret_sav 16691.9 1481.8 11.265 < 2e-16 *** -hown 12547.9 1628.5 7.705 1.77e-14 *** -both_pars -1774.0 1703.6 -1.041 0.297808 +has_retsav 16691.9 1481.8 11.265 < 2e-16 *** +owns_home 12547.9 1628.5 7.705 1.77e-14 *** +both_parents -1774.0 1703.6 -1.041 0.297808 par_dec -2656.9 5251.9 -0.506 0.612968 --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 @@ -3145,23 +3134,23 @@

Data Cleaning

Checking for multi-collinearity - correlation plot for numeric variables

-
testing_cor <- test %>%
-    select(-id, -hispanic, -race, -mom_education, -dad_education, -savings, -home, -both_parents, -par1_deceased, -par2_deceased, - mom_age_birth, -pincome, -pnetworth, -par_dec, -mom_educ_hs,-dad_educ_hs, -hisp,-race_eth) %>%
+
testing_cor <- test |>
+    select(age_resp, mom_age, has_retsav, owns_home, both_parents) |>
     drop_na()
 
 M <- cor(testing_cor)
 cor(testing_cor)
-
              age_resp   mom_age     ret_sav       hown   both_pars
-age_resp   1.000000000 0.2068723 0.007470059 0.02104089 -0.03181587
-mom_age    0.206872279 1.0000000 0.151656832 0.20570676  0.23089952
-ret_sav    0.007470059 0.1516568 1.000000000 0.41820682  0.26668117
-hown       0.021040887 0.2057068 0.418206822 1.00000000  0.37400195
-both_pars -0.031815873 0.2308995 0.266681165 0.37400195  1.00000000
+
                 age_resp   mom_age  has_retsav  owns_home both_parents
+age_resp      1.000000000 0.2068723 0.007470059 0.02104089  -0.03181587
+mom_age       0.206872279 1.0000000 0.151656832 0.20570676   0.23089952
+has_retsav    0.007470059 0.1516568 1.000000000 0.41820682   0.26668117
+owns_home     0.021040887 0.2057068 0.418206822 1.00000000   0.37400195
+both_parents -0.031815873 0.2308995 0.266681165 0.37400195   1.00000000
corrplot(M, type = "upper")
-

+

@@ -3336,12 +3325,12 @@

summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav, data = test)) #Adj R2 = 0.199

+
summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + has_retsav, data = test)) #Adj R2 = 0.199

 Call:
 lm(formula = pincome ~ mom_age + hisp + race + mom_educ_hs + 
-    dad_educ_hs + ret_sav, data = test)
+    dad_educ_hs + has_retsav, data = test)
 
 Residuals:
     Min      1Q  Median      3Q     Max 
@@ -3364,7 +3353,7 @@ 

summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav + hown, data = test)) #Adj R2 = 0.2107

+
summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + has_retsav + owns_home, data = test)) #Adj R2 = 0.2107

 Call:
 lm(formula = pincome ~ mom_age + hisp + race + mom_educ_hs + 
-    dad_educ_hs + ret_sav + hown, data = test)
+    dad_educ_hs + has_retsav + owns_home, data = test)
 
 Residuals:
     Min      1Q  Median      3Q     Max 
@@ -3411,8 +3400,8 @@ 

summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav + hown + both_pars, data = test)) #Adj R2 = 0.2108

+
summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + has_retsav + owns_home + both_parents, data = test)) #Adj R2 = 0.2108

 Call:
 lm(formula = pincome ~ mom_age + hisp + race + mom_educ_hs + 
-    dad_educ_hs + ret_sav + hown + both_pars, data = test)
+    dad_educ_hs + has_retsav + owns_home + both_parents, data = test)
 
 Residuals:
     Min      1Q  Median      3Q     Max 
@@ -3460,9 +3449,9 @@ 

car::vif(model)

-
                GVIF Df GVIF^(1/(2*Df))
-mom_age     1.178926  1        1.085783
-hisp        1.768026  1        1.329671
-race        1.725004  4        1.070530
-mom_educ_hs 2.044517  4        1.093513
-dad_educ_hs 2.066047  4        1.094946
-ret_sav     1.281342  1        1.131964
-hown        1.214451  1        1.102021
-both_pars   1.113247  1        1.055105
+
                 GVIF Df GVIF^(1/(2*Df))
+mom_age      1.178926  1        1.085783
+hisp         1.768026  1        1.329671
+race         1.725004  4        1.070530
+mom_educ_hs  2.044517  4        1.093513
+dad_educ_hs  2.066047  4        1.094946
+has_retsav   1.281342  1        1.131964
+owns_home    1.214451  1        1.102021
+both_parents 1.113247  1        1.055105
@@ -3489,52 +3478,64 @@

Checking for missings in the predictor variables

-
impute <- test %>%
-    select(-hispanic, -mom_education, -dad_education, -savings, -home, -both_parents, -par1_deceased, -par2_deceased, - mom_age_birth, -par_dec, -age_resp)
-t <- nearZeroVar(impute) # no variables are found to have near zero variance
-
-sapply(impute, function(x) sum(is.na(x))) # dad_educ_hs has 4598 missings
+
impute <- test |>
+    select(
+        id, race, pincome, pnetworth, mom_age, mom_educ_hs, dad_educ_hs, 
+        hisp, race_eth, has_retsav, owns_home, both_parents
+    )
+
+# no variables are found to have near zero variance
+t <- nearZeroVar(impute) 
+
+# dad_educ_hs has 4598 missings
+# TODO: dad_educ_hs has 3433 missings in data-cleaning
+# and 771 missings in data-cleaning-refactor
+sapply(impute, function(x) sum(is.na(x))) 
-
         id        race     pincome   pnetworth     mom_age mom_educ_hs 
-          0          80        2396        2365         608        1068 
-dad_educ_hs        hisp    race_eth     ret_sav        hown   both_pars 
-       3433          24          88        1194        1616           0 
+
          id         race      pincome    pnetworth      mom_age  mom_educ_hs 
+           0           80         2396         2365          608         1068 
+ dad_educ_hs         hisp     race_eth   has_retsav    owns_home both_parents 
+        3433           24           88         1194         1616            0 
# Calculating percent of usable cases
 m <- md.pairs(impute)
-round(m$mr/(m$mr + m$mm), 3) #looking for low proportions: where both target and predictor are missing on the same cases
+ +# looking for low proportions: where both target and predictor are missing on the same cases +# Note: For row target pincome and pnetworth respectively, predictors dad_educ_hs, has_retsav, +# and owns_home have lowest proportions +round(m$mr/(m$mr + m$mm), 3)
-
             id  race pincome pnetworth mom_age mom_educ_hs dad_educ_hs  hisp
-id          NaN   NaN     NaN       NaN     NaN         NaN         NaN   NaN
-race          1 0.000   0.600     0.562   0.838       0.713       0.488 0.800
-pincome       1 0.987   0.000     0.328   0.903       0.837       0.580 0.996
-pnetworth     1 0.985   0.319     0.000   0.901       0.833       0.648 0.994
-mom_age       1 0.979   0.618     0.613   0.000       0.660       0.533 0.985
-mom_educ_hs   1 0.978   0.635     0.629   0.806       0.000       0.358 0.982
-dad_educ_hs   1 0.988   0.707     0.757   0.917       0.800       0.000 0.994
-hisp          1 0.333   0.625     0.375   0.625       0.208       0.167 0.000
-race_eth      1 0.091   0.602     0.568   0.830       0.670       0.455 0.727
-ret_sav       1 0.972   0.073     0.000   0.869       0.781       0.600 0.989
-hown          1 0.981   0.338     0.296   0.892       0.811       0.584 0.993
-both_pars   NaN   NaN     NaN       NaN     NaN         NaN         NaN   NaN
-            race_eth ret_sav  hown both_pars
-id               NaN     NaN   NaN       NaN
-race           0.000   0.588 0.613         1
-pincome        0.985   0.538 0.553         1
-pnetworth      0.984   0.495 0.519         1
-mom_age        0.975   0.742 0.714         1
-mom_educ_hs    0.973   0.755 0.713         1
-dad_educ_hs    0.986   0.861 0.804         1
-hisp           0.000   0.458 0.542         1
-race_eth       0.000   0.591 0.625         1
-ret_sav        0.970   0.000 0.105         1
-hown           0.980   0.338 0.000         1
-both_pars        NaN     NaN   NaN       NaN
+
              id  race pincome pnetworth mom_age mom_educ_hs dad_educ_hs  hisp
+id           NaN   NaN     NaN       NaN     NaN         NaN         NaN   NaN
+race           1 0.000   0.600     0.562   0.838       0.713       0.488 0.800
+pincome        1 0.987   0.000     0.328   0.903       0.837       0.580 0.996
+pnetworth      1 0.985   0.319     0.000   0.901       0.833       0.648 0.994
+mom_age        1 0.979   0.618     0.613   0.000       0.660       0.533 0.985
+mom_educ_hs    1 0.978   0.635     0.629   0.806       0.000       0.358 0.982
+dad_educ_hs    1 0.988   0.707     0.757   0.917       0.800       0.000 0.994
+hisp           1 0.333   0.625     0.375   0.625       0.208       0.167 0.000
+race_eth       1 0.091   0.602     0.568   0.830       0.670       0.455 0.727
+has_retsav     1 0.972   0.073     0.000   0.869       0.781       0.600 0.989
+owns_home      1 0.981   0.338     0.296   0.892       0.811       0.584 0.993
+both_parents NaN   NaN     NaN       NaN     NaN         NaN         NaN   NaN
+             race_eth has_retsav owns_home both_parents
+id                NaN        NaN       NaN          NaN
+race            0.000      0.588     0.613            1
+pincome         0.985      0.538     0.553            1
+pnetworth       0.984      0.495     0.519            1
+mom_age         0.975      0.742     0.714            1
+mom_educ_hs     0.973      0.755     0.713            1
+dad_educ_hs     0.986      0.861     0.804            1
+hisp            0.000      0.458     0.542            1
+race_eth        0.000      0.591     0.625            1
+has_retsav      0.970      0.000     0.105            1
+owns_home       0.980      0.338     0.000            1
+both_parents      NaN        NaN       NaN          NaN
-
#Note: For row target pincome and pnetworth respectively, predictors dad_educ_hs, ret_sav, and hown have lowest proportions
-
+
# I suspect multi-collinearity is causing singularity in the multiple imputation below, 
+# so removed race and only kept interaction race_eth for now
 impute <- impute %>%
-    select(-race, -hisp) # I suspect multi-collinearity is causing singularity in the multiple imputation below, so removed race and only kept interaction race_eth for now
+ select(-race, -hisp)
@@ -3546,38 +3547,38 @@

Multiple Imputation

  iter imp variable
-  1   1  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  1   2  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  1   3  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  1   4  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  1   5  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  2   1  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  2   2  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  2   3  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  2   4  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  2   5  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  3   1  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  3   2  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  3   3  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  3   4  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  3   5  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  4   1  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  4   2  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  4   3  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  4   4  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  4   5  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  5   1  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  5   2  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  5   3  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  5   4  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  5   5  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
+ 1 1 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 1 2 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 1 3 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 1 4 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 1 5 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 2 1 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 2 2 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 2 3 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 2 4 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 2 5 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 3 1 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 3 2 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 3 3 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 3 4 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 3 5 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 4 1 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 4 2 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 4 3 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 4 4 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 4 5 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 5 1 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 5 2 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 5 3 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 5 4 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 5 5 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home
meth = init$method
 predM = init$predictorMatrix
 
 # Set distributions
 num <- c("mom_age") # numerical variables
-log <- c("ret_sav", "hown", "both_pars") # binary variables
+log <- c("has_retsav", "owns_home", "both_parents") # binary variables
 poly <- c("mom_educ_hs") # ordered categorical variables
 poly2 <- c("race", "race_eth")# Unordered categorical variable
 
@@ -3585,39 +3586,39 @@ 

Multiple ImputationpredM[,c("id")] <- 0 # not using as a predictor variable predM[c("id"),] <- 0 # not using as a predictor variable predM[c("race_eth"),] <- 1 # not using as a predictor variable -#predM[,c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "both_pars", "race_eth")] <- 0 -#meth[c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "both_pars", "race_eth")]= "" +#predM[,c("hisp", "mom_age", "has_retsav", "owns_home", "mom_educ_hs", "both_parents", "race_eth")] <- 0 +#meth[c("hisp", "mom_age", "has_retsav", "owns_home", "mom_educ_hs", "both_parents", "race_eth")]= "" meth

-
         id     pincome   pnetworth     mom_age mom_educ_hs dad_educ_hs 
-         ""       "pmm"       "pmm"       "pmm"      "polr"      "polr" 
-   race_eth     ret_sav        hown   both_pars 
-  "polyreg"       "pmm"       "pmm"          "" 
+
          id      pincome    pnetworth      mom_age  mom_educ_hs  dad_educ_hs 
+          ""        "pmm"        "pmm"        "pmm"       "polr"       "polr" 
+    race_eth   has_retsav    owns_home both_parents 
+   "polyreg"        "pmm"        "pmm"           "" 
predM
-
            id pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth
-id           0       0         0       0           0           0        0
-pincome      0       0         1       1           1           1        1
-pnetworth    0       1         0       1           1           1        1
-mom_age      0       1         1       0           1           1        1
-mom_educ_hs  0       1         1       1           0           1        1
-dad_educ_hs  0       1         1       1           1           0        1
-race_eth     1       1         1       1           1           1        1
-ret_sav      0       1         1       1           1           1        1
-hown         0       1         1       1           1           1        1
-both_pars    0       1         1       1           1           1        1
-            ret_sav hown both_pars
-id                0    0         0
-pincome           1    1         1
-pnetworth         1    1         1
-mom_age           1    1         1
-mom_educ_hs       1    1         1
-dad_educ_hs       1    1         1
-race_eth          1    1         1
-ret_sav           0    1         1
-hown              1    0         1
-both_pars         1    1         0
+
             id pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth
+id            0       0         0       0           0           0        0
+pincome       0       0         1       1           1           1        1
+pnetworth     0       1         0       1           1           1        1
+mom_age       0       1         1       0           1           1        1
+mom_educ_hs   0       1         1       1           0           1        1
+dad_educ_hs   0       1         1       1           1           0        1
+race_eth      1       1         1       1           1           1        1
+has_retsav    0       1         1       1           1           1        1
+owns_home     0       1         1       1           1           1        1
+both_parents  0       1         1       1           1           1        1
+             has_retsav owns_home both_parents
+id                    0         0            0
+pincome               1         1            1
+pnetworth             1         1            1
+mom_age               1         1            1
+mom_educ_hs           1         1            1
+dad_educ_hs           1         1            1
+race_eth              1         1            1
+has_retsav            0         1            1
+owns_home             1         0            1
+both_parents          1         1            0
@@ -3627,31 +3628,31 @@

Multiple Imputation

  iter imp variable
-  1   1  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  1   2  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  1   3  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  1   4  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  1   5  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  2   1  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  2   2  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  2   3  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  2   4  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  2   5  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  3   1  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  3   2  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  3   3  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  3   4  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  3   5  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  4   1  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  4   2  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  4   3  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  4   4  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  4   5  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  5   1  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  5   2  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  5   3  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  5   4  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
-  5   5  pincome  pnetworth  mom_age  mom_educ_hs  dad_educ_hs  race_eth  ret_sav  hown
+ 1 1 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 1 2 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 1 3 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 1 4 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 1 5 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 2 1 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 2 2 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 2 3 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 2 4 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 2 5 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 3 1 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 3 2 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 3 3 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 3 4 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 3 5 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 4 1 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 4 2 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 4 3 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 4 4 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 4 5 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 5 1 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 5 2 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 5 3 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 5 4 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home + 5 5 pincome pnetworth mom_age mom_educ_hs dad_educ_hs race_eth has_retsav owns_home

imputed <- complete(imputed, action = "repeated", include = FALSE)
 
@@ -3688,7 +3689,7 @@ 

geom_density(alpha = 0.5) + facet_wrap(~imputation, scales="free")

-

+

@@ -3699,7 +3700,7 @@

geom_density(alpha = 0.5) + facet_wrap(~imputation, scales="free")
-

+

@@ -3723,7 +3724,7 @@

position = position_stack(reverse = TRUE) )
-

+

tmpdf |>
     left_join(select(test, id, race_eth), by="id") |>
@@ -3739,7 +3740,7 @@ 

) + facet_wrap(~race_eth, nrow=2)

-

+

@@ -3748,11 +3749,11 @@

Whisker Plots

boxplot(pincome$pincome ~ pincome$imputation)
-

+

boxplot(pnetworth$pnetworth ~ pnetworth$imputation)
-

+

@@ -3771,7 +3772,7 @@

geom_point(alpha = 0.5) + facet_wrap(~imputation, scales="free")
-

+

diff --git a/NLSY/nlsy_imputation.qmd b/NLSY/nlsy_imputation.qmd index 7190472..d024131 100644 --- a/NLSY/nlsy_imputation.qmd +++ b/NLSY/nlsy_imputation.qmd @@ -19,53 +19,24 @@ librarian::shelf(dplyr, mice, car, corrplot, caret, tidyr) source(here::here("NLSY/nlsy_lib.R")) # Read data -nlsydf = readRDS(paste0(here::here(), "/NLSY/NLSY-college-finance.rds")) +# nlsydf = readRDS(paste0(here::here(), "/NLSY/NLSY-college-finance.rds")) ``` ```{r} -test <- nlsydf %>% #Add additional variables that should be considered during multiple imputation - select(id = "PUBID_1997", - mom_age_birth = "CV_BIO_MOM_AGE_YOUTH_1997", # numeric - age_resp = "CV_AGE_12/31/96_1997", - hispanic = "KEY_ETHNICITY_1997", # binary - race = "KEY_RACE_1997", # categorical - mom_education = "CV_HGC_RES_MOM_1997", # categorical - highest grade completed in 1997 by mom - dad_education = "CV_HGC_RES_DAD_1997", # categorical - highest grade completed in 1997 by dad - savings = "P5-130_1997", #binary - yes/no retirement savings - home = "P5-101_1997", # binary - homeownership or renting - both_parents = "YOUTH_BOTHBIO.01_1997", - par1_deceased = "YOUTH_NONR1DEAD.01_1997", - par2_deceased = "YOUTH_NONR2DEAD.01_1997", - pincome = "CV_INCOME_GROSS_YR_1997", - pnetworth = "CV_HH_NET_WORTH_P_1997", - # Also, it seems that this represents the wealth of the household in which a respondent lives. That means that for a respondent who lives with only one parent, the net-worth variable represents the wealth of that parent. - # In the case of an independent respondent, contains respondent's, rather than parents', wealth. - ) %>% - mutate(mom_age = mom_age_birth + age_resp) + +basedf = nlsy_get_base_df() # Making dummy or factor variables -test <- test |> +test <- basedf |> mutate( - mom_educ_hs = nlsy_encode_educ5(mom_education, factorize=TRUE), - dad_educ_hs = nlsy_encode_educ5(dad_education, factorize=TRUE), + mom_educ_hs = nlsy_encode_educ5(mom_res_ed, factorize=TRUE), + dad_educ_hs = nlsy_encode_educ5(dad_res_ed, factorize=TRUE), + race_eth = nlsy_recode_race_and_eth10(race, hisp), + race_eth = factor(race_eth), race = factor(race), - hisp = factor(hispanic), - race_eth = ifelse(hisp == "Yes" & race == "White", "White Hispanic", NA), # Race and ethnicity interaction - race_eth = ifelse(hisp == "No" & race == "White", "White NonHispanic", race_eth), - race_eth = ifelse(hisp == "Yes" & race == "Black or African American", "Black Hispanic", race_eth), - race_eth = ifelse(hisp == "No" & race == "Black or African American", "Black NonHispanic", race_eth), - race_eth = ifelse(hisp == "Yes" & race == "American Indian, Eskimo, or Aleut", "AIAN Hispanic", race_eth), - race_eth = ifelse(hisp == "Yes" & race == "Asian or Pacific Islander", "AAPI Hispanic", race_eth), - race_eth = ifelse(hisp == "Yes" & race == "Something else? (SPECIFY)", "Other Race Hispanic", race_eth), - race_eth = ifelse(hisp == "No" & race == "American Indian, Eskimo, or Aleut", "AIAN NonHispanic", race_eth), - race_eth = ifelse(hisp == "No" & race == "Asian or Pacific Islander", "AAPI NonHispanic", race_eth), - race_eth = ifelse(hisp == "No" & race == "Something else? (SPECIFY)", "Other Race NonHispanic", race_eth), - race_eth = factor(race_eth)) - -test$ret_sav <- ifelse(test$savings == "YES", 1, 0) -test$hown <- ifelse(test$home == "OWNS OR IS BUYING; LAND CONTRACT", 1, 0) -test$both_pars <- ifelse(test$both_parents == "Yes", 1, 0) -test$par_dec <- ifelse(test$par1_deceased == "Yes" | test$par2_deceased == "Yes", 1, 0) # Either parent deceased + hisp = factor(hisp), + par_dec = ifelse(par1_deceased == "Yes" | par2_deceased == "Yes", 1, 0) # Either parent deceased + ) ``` @@ -75,7 +46,7 @@ non_missing <- test %>% filter(!is.na(pincome), !is.na(pnetworth)) -summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav + hown + both_pars + par_dec, data = non_missing)) +summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + has_retsav + owns_home + both_parents + par_dec, data = non_missing)) #Non-significant: par_dec - dropping for now ``` @@ -83,8 +54,8 @@ summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs ```{r} -testing_cor <- test %>% - select(-id, -hispanic, -race, -mom_education, -dad_education, -savings, -home, -both_parents, -par1_deceased, -par2_deceased, - mom_age_birth, -pincome, -pnetworth, -par_dec, -mom_educ_hs,-dad_educ_hs, -hisp,-race_eth) %>% +testing_cor <- test |> + select(age_resp, mom_age, has_retsav, owns_home, both_parents) |> drop_na() M <- cor(testing_cor) @@ -108,32 +79,46 @@ car::vif(model) summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs, data = test)) #Adj R2 = 0.1616 car::vif(model) -summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav, data = test)) #Adj R2 = 0.199 +summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + has_retsav, data = test)) #Adj R2 = 0.199 car::vif(model) -summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav + hown, data = test)) #Adj R2 = 0.2107 +summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + has_retsav + owns_home, data = test)) #Adj R2 = 0.2107 car::vif(model) -summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + ret_sav + hown + both_pars, data = test)) #Adj R2 = 0.2108 +summary(model <- lm(pincome ~ mom_age + hisp + race + mom_educ_hs + dad_educ_hs + has_retsav + owns_home + both_parents, data = test)) #Adj R2 = 0.2108 car::vif(model) ``` ## Checking for missings in the predictor variables ```{r} -impute <- test %>% - select(-hispanic, -mom_education, -dad_education, -savings, -home, -both_parents, -par1_deceased, -par2_deceased, - mom_age_birth, -par_dec, -age_resp) -t <- nearZeroVar(impute) # no variables are found to have near zero variance -sapply(impute, function(x) sum(is.na(x))) # dad_educ_hs has 4598 missings +impute <- test |> + select( + id, race, pincome, pnetworth, mom_age, mom_educ_hs, dad_educ_hs, + hisp, race_eth, has_retsav, owns_home, both_parents + ) + +# no variables are found to have near zero variance +t <- nearZeroVar(impute) + +# dad_educ_hs has 4598 missings +# TODO: dad_educ_hs has 3433 missings in data-cleaning +# and 771 missings in data-cleaning-refactor +sapply(impute, function(x) sum(is.na(x))) # Calculating percent of usable cases m <- md.pairs(impute) -round(m$mr/(m$mr + m$mm), 3) #looking for low proportions: where both target and predictor are missing on the same cases -#Note: For row target pincome and pnetworth respectively, predictors dad_educ_hs, ret_sav, and hown have lowest proportions +# looking for low proportions: where both target and predictor are missing on the same cases +# Note: For row target pincome and pnetworth respectively, predictors dad_educ_hs, has_retsav, +# and owns_home have lowest proportions +round(m$mr/(m$mr + m$mm), 3) + +# I suspect multi-collinearity is causing singularity in the multiple imputation below, +# so removed race and only kept interaction race_eth for now impute <- impute %>% - select(-race, -hisp) # I suspect multi-collinearity is causing singularity in the multiple imputation below, so removed race and only kept interaction race_eth for now + select(-race, -hisp) ``` ## Multiple Imputation @@ -147,7 +132,7 @@ predM = init$predictorMatrix # Set distributions num <- c("mom_age") # numerical variables -log <- c("ret_sav", "hown", "both_pars") # binary variables +log <- c("has_retsav", "owns_home", "both_parents") # binary variables poly <- c("mom_educ_hs") # ordered categorical variables poly2 <- c("race", "race_eth")# Unordered categorical variable @@ -155,8 +140,8 @@ poly2 <- c("race", "race_eth")# Unordered categorical variable predM[,c("id")] <- 0 # not using as a predictor variable predM[c("id"),] <- 0 # not using as a predictor variable predM[c("race_eth"),] <- 1 # not using as a predictor variable -#predM[,c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "both_pars", "race_eth")] <- 0 -#meth[c("hisp", "mom_age", "ret_sav", "hown", "mom_educ_hs", "both_pars", "race_eth")]= "" +#predM[,c("hisp", "mom_age", "has_retsav", "owns_home", "mom_educ_hs", "both_parents", "race_eth")] <- 0 +#meth[c("hisp", "mom_age", "has_retsav", "owns_home", "mom_educ_hs", "both_parents", "race_eth")]= "" meth predM ``` diff --git a/NLSY/nlsy_lib.R b/NLSY/nlsy_lib.R index 627d5a5..a1737d5 100644 --- a/NLSY/nlsy_lib.R +++ b/NLSY/nlsy_lib.R @@ -11,38 +11,43 @@ nlsy_get_base_df = function() basedf = nlsydf |> select( - id = "PUBID_1997", - age0 = "CV_AGE_12/31/96_1997", - sex = "KEY_SEX_1997", - bdate_m = "KEY_BDATE_M_1997", - bdate_y = "KEY_BDATE_Y_1997", - hisp = "KEY_ETHNICITY_1997", - race = "KEY_RACE_1997", - pincome = "CV_INCOME_GROSS_YR_1997", - pnetworth = "CV_HH_NET_WORTH_P_1997", - has_retsav= "P5-130_1997", - retsav1 = "P5-131_1997", - retsav2 = "P5-132_1997", - dad_bio_ed ="CV_HGC_BIO_DAD_1997", - dad_res_ed ="CV_HGC_BIO_MOM_1997", - mom_bio_ed ="CV_HGC_RES_DAD_1997", - mom_res_ed ="CV_HGC_RES_MOM_1997", + id = "PUBID_1997", + age_resp = "CV_AGE_12/31/96_1997", + sex = "KEY_SEX_1997", + bdate_m = "KEY_BDATE_M_1997", + bdate_y = "KEY_BDATE_Y_1997", + hisp = "KEY_ETHNICITY_1997", + race = "KEY_RACE_1997", + pincome = "CV_INCOME_GROSS_YR_1997", + pnetworth = "CV_HH_NET_WORTH_P_1997", + has_retsav = "P5-130_1997", + retsav1 = "P5-131_1997", + retsav2 = "P5-132_1997", + owns_home = "P5-101_1997", + both_parents = "YOUTH_BOTHBIO.01_1997", + par1_deceased = "YOUTH_NONR1DEAD.01_1997", + par2_deceased = "YOUTH_NONR2DEAD.01_1997", + dad_bio_ed = "CV_HGC_BIO_DAD_1997", + dad_res_ed = "CV_HGC_RES_DAD_1997", + mom_bio_ed = "CV_HGC_BIO_MOM_1997", + mom_res_ed = "CV_HGC_RES_MOM_1997", + mom_age_birth = "CV_BIO_MOM_AGE_YOUTH_1997", wt ) basedf = basedf |> mutate( race = droplevels(race), - hisp = fct_recode( - hisp, - 'Hispanic' = 'Yes', - 'Non-Hispanic' = 'No' - ), has_retsav = case_when( has_retsav=='YES' ~ 1, has_retsav=='NO' ~ 0, TRUE ~ NA - ) + ), + owns_home = if_else( + owns_home == "OWNS OR IS BUYING; LAND CONTRACT", 1, 0 + ), + both_parents = if_else(both_parents=="Yes", 1, 0), + mom_age = mom_age_birth + age_resp ) basedf = basedf |> @@ -82,40 +87,37 @@ nlsy_get_base_df = function() return(basedf) } -#' Recodes race -nlsy_recode_race = function(data) +#' Recodes race and ethnicity +nlsy_recode_race_and_eth5 = function(race, hisp) { - data = data |> - mutate( - race = fct_recode( - race, - 'Other' = "Something else? (SPECIFY)", - 'Black' = "Black or African American", - 'Other' = "Asian or Pacific Islander", - 'Other' = "American Indian, Eskimo, or Aleut" - ) + return(case_when( + {{hisp}} == 'Yes' ~ 'Hispanic', + {{race}} == 'Black or African American' ~ 'Black', + {{race}} == 'Asian or Pacific Islander' ~ 'Asian', + {{race}} == 'American Indian, Eskimo, or Aleut' ~ 'Other', + {{race}} == 'White' ~ 'White', + {{race}} == 'Something else? (SPECIFY)' ~ 'Other', + TRUE ~ {{race}} ) - return(data) + ) } -#' Recodes race and ethnicity -nlsy_recode_race_and_ethn = function(data) + +nlsy_recode_race_and_eth10 = function(race, hisp) { - data = data |> - mutate( - race = case_when( - hisp == 'Hispanic' ~ 'Hispanic', - race == 'Black or African American' ~ 'Black', - race == 'Asian or Pacific Islander' ~ 'Asian', - race == 'American Indian, Eskimo, or Aleut' ~ 'Other', - race == 'White' ~ 'White', - race == 'Something else? (SPECIFY)' ~ 'Other', - TRUE ~ race - ) + return(case_when( + {{hisp}} == "Yes" & {{race}} == "White" ~ "White Hispanic", + {{hisp}} == "No" & {{race}} == "White" ~ "White NonHispanic", + {{hisp}} == "Yes" & {{race}} == "Black or African American" ~ "Black Hispanic", + {{hisp}} == "No" & {{race}} == "Black or African American" ~ "Black NonHispanic", + {{hisp}} == "Yes" & {{race}} == "American Indian, Eskimo, or Aleut" ~ "AIAN Hispanic", + {{hisp}} == "Yes" & {{race}} == "Asian or Pacific Islander" ~ "AAPI Hispanic", + {{hisp}} == "Yes" & {{race}} == "Something else? (SPECIFY)" ~ "Other Race Hispanic", + {{hisp}} == "No" & {{race}} == "American Indian, Eskimo, or Aleut" ~ "AIAN NonHispanic", + {{hisp}} == "No" & {{race}} == "Asian or Pacific Islander" ~ "AAPI NonHispanic", + {{hisp}} == "No" & {{race}} == "Something else? (SPECIFY)" ~ "Other Race NonHispanic" ) - - # stopifnot(all(!is.na(data$race))) - return(data) + ) } diff --git a/modeling_college_ed.qmd b/modeling_college_ed.qmd index f025706..c8d425b 100644 --- a/modeling_college_ed.qmd +++ b/modeling_college_ed.qmd @@ -39,12 +39,12 @@ colstdf = left_join( # Create a base frame basedf = nlsy_get_base_df() |> select(id, sex, race, hisp, pnetworth, wt) |> + filter(!is.na(race) & !is.na(hisp) & race!='No information') |> mutate( pnetworth_na = as.integer(is.na(pnetworth)), - pnetworth = if_else(is.na(pnetworth), 0, pnetworth) - ) |> - filter(!is.na(race) & !is.na(hisp) & race!='No information') |> - nlsy_recode_race_and_ethn() |> + pnetworth = if_else(is.na(pnetworth), 0, pnetworth), + race = nlsy_recode_race_and_eth5(race, hisp) + ) |> # There are not enough Asians in the dataset for a separate model # We group them with whites and use a dummy variable mutate(