Skip to content

Commit 76811b7

Browse files
create post-strat estimator; fix #19; fix #48
1 parent bc11e9c commit 76811b7

26 files changed

+554
-32
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,4 @@
1313

1414
README.html
1515
docs
16+
inst/doc

DESCRIPTION

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
Package: bbw
22
Type: Package
33
Title: Blocked Weighted Bootstrap
4-
Version: 0.2.4.9000
4+
Version: 0.2.5.9000
55
Authors@R: c(
66
person("Mark", "Myatt",
77
email = "[email protected]", role = c("aut", "cph")),

NAMESPACE

+1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ export(boot_bw_sample_clusters)
1010
export(boot_bw_sample_within_clusters)
1111
export(boot_bw_sequential)
1212
export(boot_bw_weight)
13+
export(estimate_total)
1314
export(recode)
1415
importFrom(car,bcPower)
1516
importFrom(car,powerTransform)

R/bootProbit.R

+1-1
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ bootPROBIT <- function(x, params, threshold = THRESHOLD) {
4646
d <- car::bcPower(d, lambda)
4747
threshold <- car::bcPower(threshold, lambda)
4848
m <- mean(d, na.rm = TRUE)
49-
s <- stats::sd(d, na.rm = T)
49+
s <- stats::sd(d, na.rm = TRUE)
5050

5151
## PROBIT estimate ----
5252
x <- stats::pnorm(q = threshold, mean = m, sd = s)

R/boot_bw_estimate.R

+44
Original file line numberDiff line numberDiff line change
@@ -51,5 +51,49 @@ boot_bw_estimate <- function(boot_df) {
5151
row.names(est) <- NULL
5252

5353
## Return est ----
54+
est
55+
}
56+
57+
58+
#'
59+
#' Boot estimate
60+
#'
61+
#' @keywords internal
62+
#'
63+
64+
boot_percentile <- function(boot_df) {
65+
if (is.data.frame(boot_df)) {
66+
est <- lapply(
67+
X = boot_df,
68+
FUN = stats::quantile,
69+
probs = c(0.5, 0.025, 0.975),
70+
na.rm = TRUE
71+
) |>
72+
do.call(rbind, args = _) |>
73+
as.data.frame()
74+
75+
se <- lapply(
76+
X = boot_df,
77+
FUN = stats::sd,
78+
na.rm = TRUE
79+
) |>
80+
do.call(rbind, args = _) |>
81+
as.data.frame()
82+
83+
est <- data.frame(est, se)
84+
85+
names(est) <- c("est", "lcl", "ucl", "se")
86+
} else {
87+
est <- stats::quantile(
88+
x = boot_df, probs = c(0.5, 0.025, 0.975), na.rm = TRUE
89+
) |>
90+
rbind() |>
91+
data.frame(
92+
se = stats::sd(x = boot_df, na.rm = TRUE)
93+
)
94+
95+
names(est) <- c("est", "lcl", "ucl", "se")
96+
}
97+
5498
est
5599
}

R/data.R

+31
Original file line numberDiff line numberDiff line change
@@ -139,3 +139,34 @@
139139
#' @source Mother and child health and nutrition survey in 3 regions of Somalia
140140
#'
141141
"indicatorsCH2"
142+
143+
144+
#'
145+
#' Somalia regional population in 2022
146+
#'
147+
#' A data.frame with 19 rows and 18 columns:
148+
#'
149+
#' **Variable** | **Description**
150+
#' :--- | :---
151+
#' `region` | Region name
152+
#' `total` | Total population
153+
#' `urban` | Total urban population
154+
#' `rural` | Total rural population
155+
#' `idp` | Total IDP population
156+
#' `urban_stressed` | Total urban population - stressed
157+
#' `rural_stressed` | Total rural population - stressed
158+
#' `idp_stressed` | Total IDP population - stressed
159+
#' `urban_crisis` | Total urban population - crisis
160+
#' `rural_crisis` | Total rural population - crisis
161+
#' `idp_crisis` | Total IDP population - crisis
162+
#' `urban_emergency` | Total urban population - emergency
163+
#' `rural_emergency` | Total rural population - emergency
164+
#' `idp_emergency` | Total IDP population - emergency
165+
#' `urban_catastrophe` | Total urban population - catastrophe
166+
#' `rural_catastrophe` | Total rural population - catastrophe
167+
#' `idp_catastrophe` | Total IDP population - catastrophe
168+
#' `percent_at_least_crisis` | Percentage of population that are at least in crisis
169+
#'
170+
#' @source <https://fsnau.org/downloads/2022-Gu-IPC-Population-Tables-Current.pdf>
171+
#'
172+
"somalia_population"

R/post_strat_estimation.R

+127
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
#'
2+
#' Post-stratification analysis
3+
#'
4+
#' @param est_df A [data.frame()] of stratified indicator estimates to get
5+
#' overall estimates of. `est_df` should have a variable named `est` for
6+
#' the values of the indicator estimate, a variable named `strata` for
7+
#' information on the stratification or grouping of the estimates, and a
8+
#' variable named `se` for the standard errors for the values of the
9+
#' indicator estimate. This is usually produced via a call to
10+
#' [boot_bw_estimate()].
11+
#' @param pop_df A [data.frame()] with at least two variables: `strata` for the
12+
#' stratification/grouping information that matches `strata` in `est_df` and
13+
#' `pop` for information on population for the given `strata`.
14+
#'
15+
#' @returns A vector of values for the overall estimate, overall 95% lower
16+
#' confidence limit, and overall 95% upper confidence limit for each of the
17+
#' `strata` in `est_df`.
18+
#'
19+
#' @examples
20+
#' est_df <- boot_bw(
21+
#' x = indicatorsHH, w = villageData, statistic = bootClassic,
22+
#' params = "anc1", strata = "region", replicates = 9
23+
#' ) |>
24+
#' boot_bw_estimate()
25+
#'
26+
#' ## Add population ----
27+
#' pop_df <- somalia_population |>
28+
#' subset(select = c(region, total))
29+
#'
30+
#' names(pop_df) <- c("strata", "pop")
31+
#'
32+
#' estimate_total(est_df, pop_df)
33+
#'
34+
#' @export
35+
#'
36+
37+
estimate_total <- function(est_df, pop_df) {
38+
## Check the data ----
39+
check_est_df(est_df)
40+
check_pop_df(pop_df)
41+
42+
## Merge estimates with population data ----
43+
est_pop_df <- merge(pop_df, est_df, by = "strata", all.y = TRUE)
44+
45+
## Get total estimates ----
46+
if (length(unique(est_pop_df$indicator)) > 1) {
47+
est_pop_df <- split(x = est_pop_df, f = est_pop_df$indicator)
48+
49+
total_est_df <- lapply(
50+
X = est_pop_df,
51+
FUN = estimate_total_
52+
) |>
53+
do.call(rbind, args = _) |>
54+
as.data.frame()
55+
} else {
56+
total_est_df <- estimate_total_(est_pop_df)
57+
}
58+
59+
## Return estimates ----
60+
total_est_df
61+
}
62+
63+
64+
#'
65+
#' Estimate post-stratification weighted totals
66+
#'
67+
#' @keywords internal
68+
#'
69+
70+
estimate_total_ <- function(est_pop_df) {
71+
with(est_pop_df, {
72+
data.frame(
73+
strata = "Overall",
74+
indicator = unique(indicator),
75+
est = calc_total_estimate(est, pop),
76+
lcl = calc_total_ci(est, pop, se, "lcl"),
77+
ucl = calc_total_ci(est, pop, se, "ucl"),
78+
se = calc_total_sd(se, pop)
79+
)
80+
})
81+
}
82+
83+
84+
#'
85+
#' Calculate total estimate
86+
#'
87+
#' @keywords internal
88+
#'
89+
90+
calc_total_estimate <- function(est, pop) {
91+
sum(est * pop, na.rm = TRUE) / sum(pop, na.rm = TRUE)
92+
}
93+
94+
95+
#'
96+
#' Calculate total sd
97+
#'
98+
#' @keywords internal
99+
#'
100+
101+
calc_total_sd <- function(se, pop) {
102+
sum(se ^ 2 * pop / sum(pop, na.rm = TRUE), na.rm = TRUE)
103+
}
104+
105+
106+
#'
107+
#' Calculate confidence limits
108+
#'
109+
#' @keywords internal
110+
#'
111+
112+
calc_total_ci <- function(est, pop, se, ci = c("lcl", "ucl")) {
113+
ci <- match.arg(ci)
114+
115+
operator <- ifelse(ci == "lcl", "-", "+")
116+
117+
str2expression(
118+
paste0(
119+
"sum(est * pop, na.rm = TRUE) / sum(pop, na.rm = TRUE) ",
120+
operator,
121+
" 1.96 * sqrt(sum(se ^ 2 * pop / sum(pop, na.rm = TRUE), na.rm = TRUE))"
122+
)
123+
) |>
124+
eval()
125+
}
126+
127+

R/utils.R

+46-20
Original file line numberDiff line numberDiff line change
@@ -120,32 +120,58 @@ tidy_boot <- function(boot, w, strata, outputColumns) {
120120

121121

122122
#'
123-
#' Boot estimate
123+
#' Check est_df
124124
#'
125125
#' @keywords internal
126126
#'
127127

128-
boot_percentile <- function(boot_df) {
129-
if (is.data.frame(boot_df)) {
130-
est <- lapply(
131-
X = boot_df,
132-
FUN = stats::quantile,
133-
probs = c(0.5, 0.025, 0.975),
134-
na.rm = TRUE
135-
) |>
136-
do.call(rbind, args = _) |>
137-
data.frame()
128+
check_est_df <- function(est_df) {
129+
data_name_check <- c("strata", "est", "se") %in% names(est_df)
130+
data_name_in <- c("strata", "est", "se")[which(data_name_check)]
131+
data_name_out <- c("strata", "est", "se")[which(!data_name_check)]
132+
133+
arg_name <- deparse(substitute(est_df))
134+
135+
message_out <- ifelse(
136+
length(data_name_out) == 1,
137+
"{.strong {.val {arg_name}}} doesn't have a {.strong {.val {data_name_out}}} variable or has a different name",
138+
"{.strong {.val {arg_name}}} doesn't have {.strong {.val {data_name_out}}} variables or have different names"
139+
)
138140

139-
names(est) <- c("est", "lcl", "ucl")
141+
if (all(data_name_check)) {
142+
cli::cli_alert_success(
143+
"{.arg est_df} has the appropriate/expected variables"
144+
)
140145
} else {
141-
est <- stats::quantile(
142-
x = boot_df, probs = c(0.5, 0.025, 0.975), na.rm = TRUE
143-
) |>
144-
rbind() |>
145-
data.frame()
146-
147-
names(est) <- c("est", "lcl", "ucl")
146+
cli::cli_abort(message_out)
148147
}
148+
}
149+
150+
151+
#'
152+
#' Check pop_df
153+
#'
154+
#' @keywords internal
155+
#'
156+
157+
check_pop_df <- function(pop_df) {
158+
data_name_check <- c("strata", "pop") %in% names(pop_df)
159+
data_name_in <- c("strata", "pop")[which(data_name_check)]
160+
data_name_out <- c("strata", "pop")[which(!data_name_check)]
161+
162+
arg_name <- deparse(substitute(pop_df))
149163

150-
est
164+
message_out <- ifelse(
165+
length(data_name_out) == 1,
166+
"{.strong {.val {arg_name}}} doesn't have a {.strong {.val {data_name_out}}} variable or has a different name",
167+
"{.strong {.val {arg_name}}} doesn't have {.strong {.val {data_name_out}}} variables or have different names"
168+
)
169+
170+
if (all(data_name_check)) {
171+
cli::cli_alert_success(
172+
"{.arg pop_df} has the appropriate/expected variables"
173+
)
174+
} else {
175+
cli::cli_abort(message_out)
176+
}
151177
}

README.Rmd

+4-4
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ knitr::opts_chunk$set(
2929
<!-- badges: end -->
3030

3131
## Overview
32-
The **blocked weighted bootstrap** is an estimation technique for use with data from two-stage cluster sampled surveys in which either prior weighting (e.g. *population-proportional sampling* or *PPS* as used in **Standardized Monitoring and Assessment of Relief and Transitions** or **SMART** surveys) or *posterior weighting* (e.g. as used in **Rapid Assessment Method** or **RAM** and **Simple Spatial Sampling Method** or **S3M** surveys) is implemented.
32+
The **blocked weighted bootstrap** is an estimation technique for use with data from two-stage cluster sampled surveys in which either prior weighting (e.g. *population-proportional sampling* or *PPS* as used in [Standardized Monitoring and Assessment of Relief and Transitions (SMART)](https://smartmethodology.org/) surveys) or *posterior weighting* (e.g. as used in [Rapid Assessment Method (RAM)](https://rapidsurveys.io/ramOPmanual/) and [Simple Spatial Sampling Method (S3M)](https://researchonline.lshtm.ac.uk/id/eprint/2572543) surveys) is implemented.
3333

3434
## Installation
3535

@@ -61,16 +61,16 @@ With RAM and S3M surveys, the sample is complex in the sense that it is an unwei
6161

6262
* **Blocked**: The block corresponds to the primary sampling unit (*PSU = cluster*). PSUs are resampled with replacement. Observations within the resampled PSUs are also sampled with replacement.
6363

64-
* **Weighted**: RAM and S3M samples do not use population proportional sampling (PPS) to weight the sample prior to data collection (e.g. as is done with SMART surveys). This means that a posterior weighting procedure is required. `{bbw}` uses a *"roulette wheel"* algorithm (see [illustration below](#FIG1)) to weight (i.e. by population) the selection probability of PSUs in bootstrap replicates.
64+
* **Weighted**: RAM and S3M samples do not use population proportional sampling (PPS) to weight the sample prior to data collection (e.g. as is done with SMART surveys). This means that a posterior weighting procedure is required. `{bbw}` uses a *"roulette wheel"* algorithm (see [illustration below](#fig1)) to weight (i.e. by population) the selection probability of PSUs in bootstrap replicates.
6565

66-
<a name="FIG1"></a>
66+
<a name="fig1"></a>
6767
<p align="center">
6868
```{r, echo = FALSE, eval = TRUE, out.width = "50%", fig.alt = "Roulette wheel algorithm", fig.align = "center"}
6969
knitr::include_graphics("man/figures/rouletteWheel.png")
7070
```
7171
</p>
7272

73-
In the case of prior weighting by PPS all clusters are given the same weight. With posterior weighting (as in RAM or S3M) the weight is the population of each PSU. This procedure is very similar to the *fitness proportional selection* technique used in *evolutionary computing*.
73+
In the case of prior weighting by PPS all clusters are given the same weight. With posterior weighting (as in RAM or S3M) the weight is the population of each PSU. This procedure is very similar to the [*fitness proportional selection*](https://en.wikipedia.org/wiki/Fitness_proportionate_selection) technique used in *evolutionary computing*.
7474

7575
A total of `m` PSUs are sampled with replacement for each bootstrap replicate (where `m` is the number of PSUs in the survey sample).
7676

data-raw/population.R

+34
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
# Somalia regional population estimates 2022
2+
3+
.url <- "https://fsnau.org/downloads/2022-Gu-IPC-Population-Tables-Current.pdf"
4+
5+
somalia_population <- pdftools::pdf_text(.url) |>
6+
stringr::str_split(pattern = "\n") |>
7+
(\(x) x[[1]])() |>
8+
(\(x) x[c(8:15, 18:19, 22:29, 31)])() |>
9+
stringr::str_remove_all(pattern = ",") |>
10+
stringr::str_split(pattern = "[ ]{2,}") |>
11+
do.call(rbind, args = _) |>
12+
data.frame()
13+
14+
names(somalia_population) <- c(
15+
"region", "total", "urban", "rural", "idp",
16+
"urban_stressed", "rural_stressed", "idp_stressed",
17+
"urban_crisis", "rural_crisis", "idp_crisis",
18+
"urban_emergency", "rural_emergency", "idp_emergency",
19+
"urban_catastrophe", "rural_catastrophe", "idp_catastrophe",
20+
"percent_at_least_crisis"
21+
)
22+
23+
somalia_population[ , c("total", "urban", "rural", "idp",
24+
"urban_stressed", "rural_stressed", "idp_stressed",
25+
"urban_crisis", "rural_crisis", "idp_crisis",
26+
"urban_emergency", "rural_emergency", "idp_emergency",
27+
"urban_catastrophe", "rural_catastrophe", "idp_catastrophe",
28+
"percent_at_least_crisis")] <- lapply(
29+
X = subset(somalia_population, select = -region),
30+
FUN = as.numeric
31+
) |>
32+
do.call(cbind, args = _)
33+
34+
usethis::use_data(somalia_population, overwrite = TRUE, compress = "xz")

data/somalia_population.rda

1.21 KB
Binary file not shown.

0 commit comments

Comments
 (0)