diff --git a/exercises/06-intro-pscores-exercises.qmd b/exercises/06-intro-pscores-exercises.qmd
index aa2b769..3f2668a 100644
--- a/exercises/06-intro-pscores-exercises.qmd
+++ b/exercises/06-intro-pscores-exercises.qmd
@@ -80,7 +80,7 @@ dagify(
     "\n(one year ago)",
     "\n(6 months ago)",
     "\n(3 months ago)",
-    "5pm - 6pm\n(Today)"
+    "9am - 10am\n(Today)"
   )
 )
 ```
@@ -93,24 +93,24 @@ First we need to subset the data to only include average wait times between 9 an
 
 ```{r}
 seven_dwarfs <- seven_dwarfs_train_2018 |>
-  filter(hour == 9)
+  filter(wait_hour == 9)
 ```
 
 Here's a data dictionary of the variables we need in the `seven_dwarfs` data set:
 
 | Variable                       | Column in `seven_dwarfs`   |
 |--------------------------------|----------------------------|
-| Posted Wait Time (outcome)     | `avg_spostmin`             |
-| Extra Magic Morning (exposure) | `extra_magic_morning`      |
-| Ticket Season                  | `wdw_ticket_season`        |
-| Closing Time                   | `close`                    |
-| Historic Temperature           | `weather_wdwhigh`          |
+| Posted Wait Time (outcome)     | `wait_minutes_posted_avg`  |
+| Extra Magic Morning (exposure) | `park_extra_magic_morning` |
+| Ticket Season                  | `park_ticket_season`       |
+| Closing Time                   | `park_close`               |
+| Historic Temperature           | `park_temperature_high`    |
 
 ## Your Turn
 
 *After updating the code chunks below, change `eval: true` before rendering*
 
-Now, fit a propensity score model for `extra_magic_morning` using the above proposed confounders.
+Now, fit a propensity score model for `park_extra_magic_morning` using the above proposed confounders.
 
 ```{r}
 #| eval: false
@@ -131,7 +131,7 @@ df <- propensity_model |>
 
 Stretch Goal 1:
 
-Examine two histograms of the propensity scores, one days with Extra Magic Morning (`extra_magic_morning == 1`) and one for days without it (`extra_magic_morning == 0`).
+Examine two histograms of the propensity scores, one for days with Extra Magic Morning (`park_extra_magic_morning == 1`) and one for days without it (`park_extra_magic_morning == 0`).
 How do these compare?
 
 ```{r}
diff --git a/exercises/07-pscores-using-exercises.qmd b/exercises/07-pscores-using-exercises.qmd
index c9ba2cd..9fac195 100644
--- a/exercises/07-pscores-using-exercises.qmd
+++ b/exercises/07-pscores-using-exercises.qmd
@@ -18,10 +18,10 @@ Below is the propensity score model you created in the previous exercise.
 
 ```{r}
 seven_dwarfs <- seven_dwarfs_train_2018 |>
-  filter(hour == 9)
+  filter(wait_hour == 9)
 
 propensity_model <- glm(
-  extra_magic_morning ~ wdw_ticket_season + close + weather_wdwhigh,
+  park_extra_magic_morning ~ park_ticket_season + park_close + park_temperature_high,
   data = seven_dwarfs,
   family = binomial()
 )
@@ -82,7 +82,7 @@ Update the code below to examine the distribution of the weighted sample. **HINT
 #| warning: false
 ggplot(
   seven_dwarfs_prop,
-  aes(.fitted, fill = factor(extra_magic_morning))
+  aes(.fitted, fill = factor(park_extra_magic_morning))
 ) +
   geom_mirror_histogram(bins = 50, alpha = .5) +
   geom_mirror_histogram(aes(weight = ____), alpha = .5, bins = 50) +
diff --git a/exercises/08-pscores-diagnostics-exercises.qmd b/exercises/08-pscores-diagnostics-exercises.qmd
index 04a7bbb..bdc2798 100644
--- a/exercises/08-pscores-diagnostics-exercises.qmd
+++ b/exercises/08-pscores-diagnostics-exercises.qmd
@@ -19,17 +19,17 @@ Below is the propensity score model and weights you created in the previous exer
 
 ```{r}
 seven_dwarfs <- seven_dwarfs_train_2018 |>
-  filter(hour == 9)
+  filter(wait_hour == 9)
 
 propensity_model <- glm(
-  extra_magic_morning ~ wdw_ticket_season + close + weather_wdwhigh,
+  park_extra_magic_morning ~ park_ticket_season + park_close + park_temperature_high,
   data = seven_dwarfs,
   family = binomial()
 )
 
 seven_dwarfs_ps <- propensity_model |>
   augment(type.predict = "response", data = seven_dwarfs) |>
-  mutate(w_ate = wt_ate(.fitted, extra_magic_morning))
+  mutate(w_ate = wt_ate(.fitted, park_extra_magic_morning))
 ```
 
 ## Your Turn 1
@@ -41,7 +41,7 @@ Calculate the standardized mean differences with and without weights
 ```{r}
 #| eval: false
 smds <- seven_dwarfs_ps |>
-  mutate(close = as.numeric(close)) |>
+  mutate(park_close = as.numeric(park_close)) |>
   tidy_smd(
     .vars = ____,
     .group = ____,
@@ -62,7 +62,7 @@ ggplot(
 
 ## Your Turn 2
 
-Create an unweighted ECDF for `weather_wdwhigh` by whether or not the day had Extra Magic Hours.
+Create an unweighted ECDF for `park_temperature_high` by whether or not the day had Extra Magic Hours.
 
 ```{r}
 #| eval: false
@@ -77,7 +77,7 @@ ggplot(seven_dwarfs_ps, aes(x = ____, group = ____, color = factor(____))) +
   ylab("Proportion <= x")
 ```
 
-Create an weighted ECDF for `weather_wdwhigh` by whether or not the day had Extra Magic Hours.
+Create a weighted ECDF for `park_temperature_high` by whether or not the day had Extra Magic Hours.
 
 ```{r}
 #| eval: false
diff --git a/exercises/09-outcome-model-exercises.qmd b/exercises/09-outcome-model-exercises.qmd
index e462a54..d53b3d0 100644
--- a/exercises/09-outcome-model-exercises.qmd
+++ b/exercises/09-outcome-model-exercises.qmd
@@ -13,7 +13,7 @@ library(rsample)
 library(propensity)
 
 seven_dwarfs <- seven_dwarfs_train_2018 |>
-  filter(hour == 9)
+  filter(wait_hour == 9)
 ```
 
 We are interested in examining the relationship between whether there were "Extra Magic Hours" in the morning (the **exposure**) and the average wait time for the Seven Dwarfs Mine Train the same day between 9am and 10am (the **outcome**).
@@ -57,9 +57,9 @@ ipw_results |>
   mutate(
     estimate = map_dbl(
       boot_fits,
-      # pull the `estimate` for `extra_magic_morning` for each fit
+      # pull the `estimate` for `park_extra_magic_morning` for each fit
       \(.fit) .fit |>
-        filter(term == "extra_magic_morning") |>
+        filter(term == "park_extra_magic_morning") |>
         pull(estimate)
     )
   ) |>
diff --git a/exercises/10-continuous-g-computation-exercises.qmd b/exercises/10-continuous-g-computation-exercises.qmd
index fa8d64b..6f5f0ae 100644
--- a/exercises/10-continuous-g-computation-exercises.qmd
+++ b/exercises/10-continuous-g-computation-exercises.qmd
@@ -13,7 +13,7 @@ library(splines)
 
 For this set of exercises, we'll use g-computation to calculate a causal effect for continuous exposures.
 
-In the touringplans data set, we have information about the posted waiting times for rides. We also have a limited amount of data on the observed, actual times. The question that we will consider is this: Do posted wait times (`avg_spostmin`) for the Seven Dwarves Mine Train at 8 am affect actual wait times (`avg_sactmin`) at 9 am? Here’s our DAG:
+In the touringplans data set, we have information about the posted waiting times for rides. We also have a limited amount of data on the observed, actual times. The question that we will consider is this: Do posted wait times (`wait_minutes_posted_avg`) for the Seven Dwarfs Mine Train at 8 am affect actual wait times (`wait_minutes_actual_avg`) at 9 am? Here’s our DAG:
 
 ```{r}
 #| echo: false
@@ -83,29 +83,29 @@ dagify(
 )
 ```
 
-First, let’s wrangle our data to address our question: do posted wait times at 8 affect actual weight times at 9? We’ll join the baseline data (all covariates and posted wait time at 8) with the outcome (average actual time). We also have a lot of missingness for `avg_sactmin`, so we’ll drop unobserved values for now.
+First, let’s wrangle our data to address our question: do posted wait times at 8 affect actual wait times at 9? We’ll join the baseline data (all covariates and posted wait time at 8) with the outcome (average actual time). We also have a lot of missingness for `wait_minutes_actual_avg`, so we’ll drop unobserved values for now.
 
 You don't need to update any code here, so just run this.
 
 ```{r}
 eight <- seven_dwarfs_train_2018 |>
-  filter(hour == 8) |>
-  select(-avg_sactmin)
+  filter(wait_hour == 8) |>
+  select(-wait_minutes_actual_avg)
 
 nine <- seven_dwarfs_train_2018 |>
-  filter(hour == 9) |>
-  select(date, avg_sactmin)
+  filter(wait_hour == 9) |>
+  select(park_date, wait_minutes_actual_avg)
 
 wait_times <- eight |>
-  left_join(nine, by = "date") |>
-  drop_na(avg_sactmin)
+  left_join(nine, by = "park_date") |>
+  drop_na(wait_minutes_actual_avg)
 ```
 
 # Your Turn 1
 
-For the parametric G-formula, we'll use a single model to fit a causal model of Posted Waiting Times (`avg_spostmin`) on Actual Waiting Times (`avg_sactmin`) where we include all covariates, much as we normally fit regression models. However, instead of interpreting the coefficients, we'll calculate the estimate by predicting on cloned data sets.
+For the parametric G-formula, we'll use a single model to fit a causal model of Posted Waiting Times (`wait_minutes_posted_avg`) on Actual Waiting Times (`wait_minutes_actual_avg`) where we include all covariates, much as we normally fit regression models. However, instead of interpreting the coefficients, we'll calculate the estimate by predicting on cloned data sets.
 
-Two additional differences in our model: we'll use a natural cubic spline on the exposure, `avg_spostmin`, using `ns()` from the splines package, and we'll include an interaction term between `avg_spostmin` and `extra_magic_mornin g`. These complicate the interpretation of the coefficient of the model in normal regression but have virtually no downside (as long as we have a reasonable sample size) in g-computation, because we still get an easily interpretable result.
+Two additional differences in our model: we'll use a natural cubic spline on the exposure, `wait_minutes_posted_avg`, using `ns()` from the splines package, and we'll include an interaction term between `wait_minutes_posted_avg` and `park_extra_magic_morning`. These complicate the interpretation of the coefficient of the model in normal regression but have virtually no downside (as long as we have a reasonable sample size) in g-computation, because we still get an easily interpretable result.
 
 First, let's fit the model.
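As a compact reference for the workflow these hunks rename, here is a minimal sketch of the parametric g-formula on simulated data. The `sim` data frame and its values are fabricated for illustration; only the column names mirror the renamed touringplans variables:

```{r}
# Minimal g-computation sketch on simulated data (not seven_dwarfs_train_2018)
library(dplyr)
library(broom)
library(splines)

set.seed(1)
sim <- tibble(
  wait_minutes_posted_avg = runif(500, 10, 90),
  park_extra_magic_morning = rbinom(500, 1, 0.3),
  park_temperature_high = rnorm(500, 80, 5),
  wait_minutes_actual_avg = 5 + 0.5 * wait_minutes_posted_avg +
    3 * park_extra_magic_morning + rnorm(500, sd = 4)
)

# Step 1: one flexible outcome model (spline on the exposure + interaction)
fit <- lm(
  wait_minutes_actual_avg ~ ns(wait_minutes_posted_avg, df = 5) *
    park_extra_magic_morning + park_temperature_high,
  data = sim
)

# Step 2: clone the data, setting everyone's exposure to each level of interest
thirty <- sim |> mutate(wait_minutes_posted_avg = 30)
sixty <- sim |> mutate(wait_minutes_posted_avg = 60)

# Step 3: predict on each clone and contrast the mean predicted outcomes
mean(augment(fit, newdata = sixty)$.fitted) -
  mean(augment(fit, newdata = thirty)$.fitted)
```

The final difference estimates the average effect of raising posted wait times from 30 to 60 minutes, which is exactly the contrast Your Turn 2 builds with the `thirty` and `sixty` clones.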
@@ -114,14 +114,14 @@ First, let's fit the model.
 
 ```{r}
 _______ ___ _______(
-  avg_sactmin ~ ns(_______, df = 5)*extra_magic_morning + _______ + _______ + _______,
+  wait_minutes_actual_avg ~ ns(_______, df = 5)*park_extra_magic_morning + _______ + _______ + _______,
   data = seven_dwarfs
 )
 ```
 
 # Your Turn 2
 
-Now that we've fit a model, we need to clone our data set. To do this, we'll simply mutate it so that in one set, all participants have `avg_spostmin` set to 30 minutes and in another, all participants have `avg_spostmin` set to 60 minutes.
+Now that we've fit a model, we need to clone our data set. To do this, we'll simply mutate it so that in one set, all participants have `wait_minutes_posted_avg` set to 30 minutes and in another, all participants have `wait_minutes_posted_avg` set to 60 minutes.
 
 1. Create the cloned data sets, called `thirty` and `sixty`.
 2. For both data sets, use `standardized_model` and `augment()` to get the predicted values. Use the `newdata` argument in `augment()` with the relevant cloned data set. Then, select only the fitted value. Rename `.fitted` to either `thirty_posted_minutes` or `sixty_posted_minutes` (use the pattern `select(new_name = old_name)`).
diff --git a/exercises/14-bonus-continuous-pscores-exercises.qmd b/exercises/14-bonus-continuous-pscores-exercises.qmd
index bdbddb7..08981d8 100644
--- a/exercises/14-bonus-continuous-pscores-exercises.qmd
+++ b/exercises/14-bonus-continuous-pscores-exercises.qmd
@@ -13,7 +13,7 @@ library(propensity)
 
 For this set of exercises, we'll use propensity scores for continuous exposures.
 
-In the touringplans data set, we have information about the posted waiting times for rides. We also have a limited amount of data on the observed, actual times. The question that we will consider is this: Do posted wait times (`avg_spostmin`) for the Seven Dwarves Mine Train at 8 am affect actual wait times (`avg_sactmin`) at 9 am? Here’s our DAG:
+In the touringplans data set, we have information about the posted waiting times for rides. We also have a limited amount of data on the observed, actual times. The question that we will consider is this: Do posted wait times (`wait_minutes_posted_avg`) for the Seven Dwarfs Mine Train at 8 am affect actual wait times (`wait_minutes_actual_avg`) at 9 am? Here’s our DAG:
 
 ```{r}
 #| echo: false
@@ -83,31 +83,31 @@ dagify(
 )
 ```
 
-First, let’s wrangle our data to address our question: do posted wait times at 8 affect actual weight times at 9? We’ll join the baseline data (all covariates and posted wait time at 8) with the outcome (average actual time). We also have a lot of missingness for `avg_sactmin`, so we’ll drop unobserved values for now.
+First, let’s wrangle our data to address our question: do posted wait times at 8 affect actual wait times at 9? We’ll join the baseline data (all covariates and posted wait time at 8) with the outcome (average actual time). We also have a lot of missingness for `wait_minutes_actual_avg`, so we’ll drop unobserved values for now.
 
 You don't need to update any code here, so just run this.
 
 ```{r}
 eight <- seven_dwarfs_train_2018 |>
-  filter(hour == 8) |>
-  select(-avg_sactmin)
+  filter(wait_hour == 8) |>
+  select(-wait_minutes_actual_avg)
 
 nine <- seven_dwarfs_train_2018 |>
-  filter(hour == 9) |>
-  select(date, avg_sactmin)
+  filter(wait_hour == 9) |>
+  select(park_date, wait_minutes_actual_avg)
 
 wait_times <- eight |>
-  left_join(nine, by = "date") |>
-  drop_na(avg_sactmin)
+  left_join(nine, by = "park_date") |>
+  drop_na(wait_minutes_actual_avg)
 ```
 
 # Your Turn 1
 
-First, let’s calculate the propensity score model, which will be the denominator in our stabilized weights (more to come on that soon). We’ll fit a model using `lm()` for `avg_spostmin` with our covariates, then use the fitted predictions of `avg_spostmin` (`.fitted`, `.sigma`) to calculate the density using `dnorm()`.
+First, let’s calculate the propensity score model, which will be the denominator in our stabilized weights (more to come on that soon). We’ll fit a model using `lm()` for `wait_minutes_posted_avg` with our covariates, then use the fitted predictions of `wait_minutes_posted_avg` (`.fitted`, `.sigma`) to calculate the density using `dnorm()`.
 
-1. Fit a model using `lm()` with `avg_spostmin` as the outcome and the confounders identified in the DAG.
+1. Fit a model using `lm()` with `wait_minutes_posted_avg` as the outcome and the confounders identified in the DAG.
 2. Use `augment()` to add model predictions to the data frame.
-3. In `wt_ate()`, calculate the weights using `avg_postmin`, `.fitted`, and `.sigma`.
+3. In `wt_ate()`, calculate the weights using `wait_minutes_posted_avg`, `.fitted`, and `.sigma`.
 
 ```{r}
 post_time_model <- lm(
@@ -169,7 +169,7 @@ Now, let's fit the outcome model!
 
 ```{r}
 lm(___ ~ ___, weights = ___, data = wait_times_swts) |>
   tidy() |>
-  filter(term == "avg_spostmin") |>
+  filter(term == "wait_minutes_posted_avg") |>
   mutate(estimate = estimate * 10)
 ```
 
diff --git a/slides/raw/06-pscores.qmd b/slides/raw/06-pscores.qmd
index 5d73192..2d439d7 100644
--- a/slides/raw/06-pscores.qmd
+++ b/slides/raw/06-pscores.qmd
@@ -181,7 +181,7 @@ dagify(
     "\n(one year ago)",
     "\n(6 months ago)",
     "\n(3 months ago)",
-    "5pm - 6pm\n(Today)"
+    "9am - 10am\n(Today)"
   )
 )
 ```
@@ -190,7 +190,7 @@ dagify(
 
 `r countdown::countdown(minutes = 10)`
 
-### Using the **confounders** identified in the previous DAG, fit a propensity score model for `extra_magic_morning`
+### Using the **confounders** identified in the previous DAG, fit a propensity score model for `park_extra_magic_morning`
 
 ### *Stretch*: Create two histograms, one of the propensity scores for days with extra morning magic hours and one for those without
 
diff --git a/slides/raw/08-pscore-diagnostics.qmd b/slides/raw/08-pscore-diagnostics.qmd
index 7c7c7f6..03064f8 100644
--- a/slides/raw/08-pscore-diagnostics.qmd
+++ b/slides/raw/08-pscore-diagnostics.qmd
@@ -178,8 +178,8 @@ ggplot(df, aes(x = wt71, color = factor(qsmk))) +
 
 `r countdown::countdown(minutes = 10)`
 
-### Create an unweighted ECDF examining the `weather_wdwhigh` confounder by whether or not the day had Extra Magic Hours.
-### Create a weighted ECDF examining the `weather_wdwhigh` confounder
+### Create an unweighted ECDF examining the `park_temperature_high` confounder by whether or not the day had Extra Magic Hours.
+### Create a weighted ECDF examining the `park_temperature_high` confounder
 
 ## {background-color="#23373B" .center .huge}
 
diff --git a/slides/raw/09-outcome-model.qmd b/slides/raw/09-outcome-model.qmd
index 8b0d7be..8e572ed 100644
--- a/slides/raw/09-outcome-model.qmd
+++ b/slides/raw/09-outcome-model.qmd
@@ -143,7 +143,7 @@ boot_estimate <- int_t(ipw_results, boot_fits) |>
 
 `r countdown::countdown(minutes = 12)`
 
-### Create a function called `ipw_fit` that fits the propensity score model and the weighted outcome model for the effect between `extra_magic_morning` and `avg_spostmin`
+### Create a function called `ipw_fit` that fits the propensity score model and the weighted outcome model for the effect between `park_extra_magic_morning` and `wait_minutes_posted_avg`
 
 ### Using the `bootstraps()` and `int_t()` functions to estimate the final effect.
 
diff --git a/slides/raw/14-bonus-continuous-pscores.qmd b/slides/raw/14-bonus-continuous-pscores.qmd
index a04172d..38cd2cc 100644
--- a/slides/raw/14-bonus-continuous-pscores.qmd
+++ b/slides/raw/14-bonus-continuous-pscores.qmd
@@ -192,9 +192,9 @@ dagify(
 
 ## *Your Turn 1*
 
-### Fit a model using `lm()` with `avg_spostmin` as the outcome and the confounders identified in the DAG.
+### Fit a model using `lm()` with `wait_minutes_posted_avg` as the outcome and the confounders identified in the DAG.
 ### Use `augment()` to add model predictions to the data frame
-### In `wt_ate()`, calculate the weights using `avg_postmin`, `.fitted`, and `.sigma`
+### In `wt_ate()`, calculate the weights using `wait_minutes_posted_avg`, `.fitted`, and `.sigma`
 
 `r countdown::countdown(minutes = 5)`
 
@@ -203,23 +203,23 @@
 
 ```{r}
 #| include: false
 eight <- seven_dwarfs_train_2018 |>
-  filter(hour == 8) |>
-  select(-avg_sactmin)
+  filter(wait_hour == 8) |>
+  select(-wait_minutes_actual_avg)
 
 nine <- seven_dwarfs_train_2018 |>
-  filter(hour == 9) |>
-  select(date, avg_sactmin)
+  filter(wait_hour == 9) |>
+  select(park_date, wait_minutes_actual_avg)
 
 wait_times <- eight |>
-  left_join(nine, by = "date") |>
-  drop_na(avg_sactmin)
+  left_join(nine, by = "park_date") |>
+  drop_na(wait_minutes_actual_avg)
 ```
 
 ```{r}
 post_time_model <- lm(
-  avg_spostmin ~
-    close + extra_magic_morning +
-    weather_wdwhigh + wdw_ticket_season,
+  wait_minutes_posted_avg ~
+    park_close + park_extra_magic_morning +
+    park_temperature_high + park_ticket_season,
   data = wait_times
 )
 ```
@@ -230,7 +230,7 @@ post_time_model <- lm(
 wait_times_wts <- post_time_model |>
   augment(data = wait_times) |>
   mutate(wts = wt_ate(
-    avg_spostmin, .fitted, .sigma = .sigma
+    wait_minutes_posted_avg, .fitted, .sigma = .sigma
   ))
 ```
 
@@ -289,7 +289,7 @@ ggplot(nhefs_swts, aes(swts)) +
 wait_times_swts <- post_time_model |>
   augment(data = wait_times) |>
   mutate(swts = wt_ate(
-    avg_spostmin,
+    wait_minutes_posted_avg,
     .fitted,
     .sigma = .sigma,
     stabilize = TRUE
@@ -324,12 +324,12 @@ lm(
 
 ```{r}
 lm(
-  avg_sactmin ~ avg_spostmin,
+  wait_minutes_actual_avg ~ wait_minutes_posted_avg,
   weights = swts,
   data = wait_times_swts
 ) |>
   tidy() |>
-  filter(term == "avg_spostmin") |>
+  filter(term == "wait_minutes_posted_avg") |>
   mutate(estimate = estimate * 10)
 ```
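For reference, here is a minimal sketch of the stabilized-weight construction that the slides above delegate to `wt_ate(..., stabilize = TRUE)`, written out with `dnorm()` on simulated data. The `sim` data frame is fabricated; only the column names mirror the renamed touringplans variables, and this approximates the density-ratio idea for continuous exposures rather than reproducing the propensity package's exact implementation:

```{r}
# Hand-rolled stabilized weights for a continuous exposure (simulated data)
library(dplyr)
library(broom)

set.seed(1)
sim <- tibble(
  park_temperature_high = rnorm(300, 80, 5),
  wait_minutes_posted_avg = 20 + 0.4 * park_temperature_high + rnorm(300, sd = 6)
)

# Denominator model: the exposure conditional on the confounders
post_time_model <- lm(
  wait_minutes_posted_avg ~ park_temperature_high,
  data = sim
)

wait_times_swts <- post_time_model |>
  augment(data = sim) |>
  mutate(
    # Denominator: density of the observed exposure given the confounders
    denominator = dnorm(wait_minutes_posted_avg, .fitted, .sigma),
    # Numerator: marginal density of the exposure, which stabilizes the weights
    numerator = dnorm(
      wait_minutes_posted_avg,
      mean(wait_minutes_posted_avg),
      sd(wait_minutes_posted_avg)
    ),
    swts = numerator / denominator
  )

# Stabilized weights should center near 1 with a much shorter right tail
summary(wait_times_swts$swts)
```

The weighted outcome model then passes `swts` to the `weights` argument of `lm()`, as in the final hunk above, and multiplying the coefficient by 10 reports the effect per 10 minutes of posted wait time.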