Skip to content

Commit

Permalink
Merge pull request #85 from worldbank/nov-2021-la
Browse files Browse the repository at this point in the history
Nov 2021 - descriptive analysis and spatial data
  • Loading branch information
luisesanmartin authored Jan 13, 2022
2 parents 574777c + 48f1566 commit 500f015
Show file tree
Hide file tree
Showing 7 changed files with 468 additions and 145 deletions.
101 changes: 101 additions & 0 deletions DataWork/descriptive-statistics.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# Load packages ################################################################

library(here)
library(tidyverse)
library(skimr)
library(lfe)
library(huxtable)
library(openxlsx)

# Read the final census data set; the path is assembled with here()
census <- read_rds(
  here("DataWork", "DataSets", "Final", "census.RDS")
)

# Exploring the data ###########################################################

# See summary statistics for the whole data set:
summary(census)
# Same call with 0 passed as a second positional argument.
# NOTE(review): for a data frame this presumably sets `maxsum` -- confirm
# that this is the intended argument.
summary(census, 0)

# See summary statistics for a single variable:
summary(census$region)
summary(census$death)

# Frequency table using table():
# one-way by region, then two-way with state in rows and region in columns
table(census$region)
table(census$state, census$region)

# Now with skimr: skim() the whole data set
skim(census)

# Creating custom summary statistics tables ####################################

# Custom skim function: for numeric columns, compute only the statistics
# listed in sfl(). The name on the left of each `=` labels the statistic.
summary_stats <- skim_with(
  numeric = sfl(
    Mean   = mean,
    Median = median,
    SD     = sd,
    Min    = min,
    Max    = max
  ),
  append = FALSE # Replace the default statistics instead of adding to them
)

# Apply the custom skimmer to the census data, keep only the numeric part,
# drop the completion rate and give the missing-count column a readable name
summary_stats_table <- census %>%
  summary_stats() %>%
  yank("numeric") %>%
  select(-complete_rate) %>%
  rename(Missings = n_missing)

# Exporting tables #############################################################

# Export the summary statistics table to an Excel workbook.
# The object to export is `summary_stats_table`; the bare name `table`
# refers to base R's table() function, not to the summary data.
quick_xlsx(summary_stats_table,
           file = here("DataWork",
                       "Output",
                       "Raw",
                       "summary-statistics.xlsx"))

## Add variable names --------------------------------------------
# Extract variable labels from data frame: one row per column of `census`,
# pairing the "var.labels" attribute with the corresponding column name
census_dictionary <-
  data.frame("Variable" = attributes(census)$var.labels,
             "name"     = names(census))

# Replace the variable names in the summary table with their labels
summary_stats_table <-
  summary_stats_table %>%
  # Rename var with var names so we can merge the datasets
  rename(name = skim_variable) %>%
  # Merge to variable labels; the key is stated explicitly so the join does
  # not depend on whichever columns the two tables happen to share
  left_join(census_dictionary, by = "name") %>%
  # Keep only variable labels instead of names
  select(-name) %>%
  # Convert it into a huxtable object
  as_hux()

# Format the huxtable for export
summary_stats_table <-
  summary_stats_table %>%
  # Make variable labels the first column
  relocate(Variable) %>%
  # Use stats name as table header
  set_header_rows(1, TRUE) %>%
  # Use variable name as row header
  set_header_cols("Variable", TRUE) %>%
  # Don't round large numbers: applied to every row of columns 2 onwards
  set_number_format(everywhere, 2:ncol(.), "%9.0f") %>%
  # Set a theme for quick formatting
  theme_basic()

# Export the formatted table to an Excel workbook
quick_xlsx(summary_stats_table,
           file = here("DataWork",
                       "Output",
                       "Raw",
                       "summary-statistics-formatted.xlsx"))

# Aggregate tables ###########################################################

# One row per region: number of distinct states and summed population
census_region <- census %>%
  group_by(region) %>%
  summarise(
    `Number of States` = n_distinct(state),
    `Total Population` = sum(pop)
  )


# Run regression ######################################################

# Linear regression of divorce on population, urban population and marriage,
# fitted on the census data
reg1 <- lm(
  divorce ~ pop + popurban + marriage,
  census
)

104 changes: 54 additions & 50 deletions Presentations/05-descriptive-analysis.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,30 @@ But that's enough of me talking. Let's get you all to run some code

# Setting the stage


Load the packages that we will use today
```{r, eval = F}
# Install new packages
install.packages("skimr")
install.packages("lfe")
install.packages("huxtable")
install.packages("openxlsx")
```

```{r, warning = FALSE}
# Load packages
library(here)
library(tidyverse)
library(skimr)
library(lfe)
library(huxtable)
library(openxlsx)
```

---

# Setting the stage

Load the data that we will use today: Stata's `census` dataset
```{r}
# Load data
Expand Down Expand Up @@ -161,27 +185,6 @@ glimpse(census)

---

# Setting the stage


Load the packages that we will use today
```{r, eval = F}
# Install new packages
install.packages("skimr")
install.packages("lfe")
install.packages("huxtable")
```

```{r, warning = FALSE}
# Load packages
library(tidyverse)
library(skimr)
library(lfe)
library(huxtable)
```

---

class: inverse, center, middle
name: exploring

Expand Down Expand Up @@ -214,8 +217,8 @@ Use the `summary()` function to describe the `census` data frame.

# Exploring a dataset

```{r, echo = FALSE}
include_app("https://luizaandrade.shinyapps.io/learnr/")
```{r}
summary(census)
```

---
Expand All @@ -238,11 +241,6 @@ include_app("https://luizaandrade.shinyapps.io/learnr/")
Use the `summary()` function to display summary statistics for a continuous variable in the `census` data frame.
]

```{r, echo = FALSE}
include_app("https://luizaandrade.shinyapps.io/learnr/")
```


---

# Summarizing categorical variables
Expand All @@ -268,8 +266,10 @@ Use the `table()` function to display frequency tables for:

## One way tabulation

```{r, echo = FALSE}
include_app("https://luizaandrade.shinyapps.io/learnr/")
--

```{r}
table(census$region)
```


Expand All @@ -279,8 +279,10 @@ include_app("https://luizaandrade.shinyapps.io/learnr/")

## Two way tabulation

--

```{r, echo = FALSE}
include_app("https://luizaandrade.shinyapps.io/learnr/")
table(census$state, census$region)
```


Expand Down Expand Up @@ -402,12 +404,10 @@ census %>%
```

Here are a few functions that can be used within `sfl()`:
- Center: `mean()`, `median()`
- Spread: `sd()`, `IQR()`, `mad()`
- Range: `min()`, `max()`, `quantile()`
- Position: `first()`, `last()`, `nth()`,
- Count: `n()`, `n_distinct()`
- Logical: `any()`, `all()`
- Center: `mean`, `median`
- Spread: `sd`, `IQR`, `var`
- Range: `min`, `max`, `quantile`
- Count: `n_complete`, `n_missing`, `length`

---

Expand Down Expand Up @@ -574,7 +574,7 @@ summary_stats_table <- #<<
relocate(Variable) %>% # Make variable labels the first column #<<
set_header_rows(1, TRUE) %>% # Use stats name as table header #<<
set_header_cols("Variable", TRUE) %>% # Use variable name as row header #<<
set_number_format("\"%9.0f\"" ) %>% # Don't round large numbers #<<
set_number_format(everywhere, 2:ncol(.), "%9.0f") %>% # Don't round large numbers
theme_basic() # Set a theme for quick formatting #<<
```
Expand All @@ -591,7 +591,7 @@ summary_stats_table <- #<<
# Beautifying tables

.small[
```{r, message = FALSE, eval = F}
```{r, message = FALSE}
# Extract variable labels from data frame
census_dictionary <-
Expand All @@ -611,12 +611,15 @@ summary_stats_table <-
relocate(Variable) %>% # Make variable labels the first column
set_header_rows(1, TRUE) %>% # Use stats name as table header
set_header_cols("Variable", TRUE) %>% # Use variable name as row header
set_number_format("\"%9.0f\"" ) %>% # Don't round large numbers
set_number_format(everywhere, 2:ncol(.), "%9.0f") %>% # Don't round large numbers
theme_basic() # Set a theme for quick formatting
# Now export it #<<
quick_xlsx(summary_stats_table, #<<
file = file.path(rawOutput, "summary-stats-basic.xlsx")) #<<
file = here("DataWork", #<<
"Output", #<<
"Raw", #<<
"summary-stats-basic.xlsx")) #<<
quick_latex(summary_stats_table, #<<
file = here("DataWork", #<<
Expand Down Expand Up @@ -706,11 +709,6 @@ The "name-value" pairs mentioned under `...` look like this: `new_variable = sta
Recreate the `region_stats` data set, now including the average and the standard deviation of the population.
]


```{r, echo = FALSE}
include_app("https://luizaandrade.shinyapps.io/learnr/")
```

---

# Aggregating observations
Expand Down Expand Up @@ -820,8 +818,10 @@ Using the `census` data, run a regression of the number of divorces on populatio
]


```{r, echo = FALSE}
include_app("https://luizaandrade.shinyapps.io/learnr/")
```{r}
reg1 <-
lm(divorce ~ pop + popurban + marriage,
census)
```


Expand Down Expand Up @@ -897,8 +897,12 @@ Using the `census` data, run a regression of the number of divorces on populatio
Using the `census` data, run a regression of divorce on population, urban population and number of marriages controlling for region fixed effects.
]

```{r, echo = FALSE}
include_app("https://luizaandrade.shinyapps.io/learnr/")
```{r, eval = FALSE}
reg2 <-
felm(divorce ~ pop + popurban + marriage | region | 0 | 0,
census)
summary(reg2)
```

---
Expand Down
Loading

0 comments on commit 500f015

Please sign in to comment.