diff --git a/.gitignore b/.gitignore index 1094ee1..53ff6ae 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ *Data/* -*.pdf *.do *.Rhistory *.xlsx @@ -7,12 +6,12 @@ *.out *.snm *.nav -*.tex *.toc *.md *.vrb *.aux *.png +*.DS_Store *.csv *.dbf *.cpg @@ -20,4 +19,3 @@ *.shp !Presentations/img/* -*.pdf diff --git a/DataWork/Code/Lab 3 - Data Processing.R b/DataWork/Code/Lab 3 - Data Processing.R deleted file mode 100644 index fce293b..0000000 --- a/DataWork/Code/Lab 3 - Data Processing.R +++ /dev/null @@ -1,37 +0,0 @@ -# ------------------------------------------------------------------------------ # -# # -# DIME # -# Data Processing in R # -# # -# ------------------------------------------------------------------------------ # - - # List variables we want in the final data set - id_vars <- c("hh_code", "wave", "year", "treatment_hh", - "treatment_site", "site_code") - demographic_vars <- c("gender_hhh", "age_hhh", "num_dependents", "read_and_write") - yield_vars <- c("w_gross_yield_a", "w_gross_yield_b") - food_vars <- c("expend_food_yearly", "expend_food_lastweek", "wdds_score") - income_vars <- income_vars <- names(lwh_panel)[29:38] - - names(lwh_panel)[grep("area", names(lwh_panel))] - plot_area_vars <- - - - # Subset data set - lwh <- lwh_panel[,c(id_vars, - demographic_vars, - yield_vars, - food_vars, - income_vars, - plot_area_vars)] - - - # A customized function to winsorize observations at the 90th percentile - winsor <- function(x) { - x[x > quantile(x, 0.9, na.rm = T)] <- - quantile(x, 0.9, na.rm = T) - return(x) - } - - - \ No newline at end of file diff --git a/DataWork/Code/Lab 2 - Coding for Reproducible Research.R b/DataWork/Code/Lab 4 - Coding for Reproducible Research.R similarity index 87% rename from DataWork/Code/Lab 2 - Coding for Reproducible Research.R rename to DataWork/Code/Lab 4 - Coding for Reproducible Research.R index 172abb4..b5eff50 100644 --- a/DataWork/Code/Lab 2 - Coding for Reproducible Research.R +++ b/DataWork/Code/Lab 4 - 
Coding for Reproducible Research.R @@ -2,22 +2,18 @@ # DIME # Introduction to R for Stata users # - MASTER DO_FILE # + MASTER SCRIPT # # ------------------------------------------------------------------------------ # # PURPOSE: Set-up configurations and run scripts that are part of DIME's R # Training -# NOTES: Version 1 +# NOTES: Version 3 # WRITTEN BY: Luiza Cardoso de Andrade, Robert A. Marty, Leonardo Viotti -# Last modified in May 2018 - -# PART 0: Clear boiler plate - - rm(list=ls()) +# Last modified in Dec 2018 # PART 1: Select sections to run @@ -25,13 +21,12 @@ # PART 2: Load packages # If you selected the option to install packages, install them - install.packages(c("swirl","stargazer"), dep = TRUE) + install.packages(c("stargazer", "swirl"), dependencies = TRUE) # Load all packages -- this is equivalent to using library(package) for each # package listed before library(swirl) - library(stargazer) # PART 3: Set folder folder paths @@ -59,5 +54,6 @@ finalOutput <- file.path(Output,"Final") - -# PART 4: Run selected sections \ No newline at end of file +# PART 4: Load data + +# PART 5: Run selected sections \ No newline at end of file diff --git a/DataWork/Code/Lab 4 - Descriptive Analysis.R b/DataWork/Code/Lab 4 - Descriptive Analysis.R deleted file mode 100644 index a083cfc..0000000 --- a/DataWork/Code/Lab 4 - Descriptive Analysis.R +++ /dev/null @@ -1,40 +0,0 @@ -# ------------------------------------------------------------------------------ # -# # -# DIME # -# Descriptive Staticstics in R # -# # -# ------------------------------------------------------------------------------ # - - -# Load data ---------------------------------------------------------------------- - - lwh <- read.csv(file.path(finalData,"lwh_clean.csv"), - header = T) - -# EXERCISE 1: Summary statistics ------------------------------------------------ - -# EXERCISE 2: Frequencies tables ------------------------------------------------ - -# EXERCISE 3: Stargazer 
--------------------------------------------------------- - -# EXERCISE 4: Export tables to LaTeX -------------------------------------------- - - # Vector with covariates to be kept - covariates <- c("age_hhh", - "num_dependents", - "income_total_win", - "expend_food_yearly") - # subset lwh - lwh_simp <- lwh[, covariates] - - -# EXERCISE 5: Aggregate --------------------------------------------------------- - - -# EXERCISE 6: Reshape ----------------------------------------------------------- - - -# EXERCISE 7: Create table from scratch ----------------------------------------- - - -# EXERCISE 8: Export table to Excel --------------------------------------------- \ No newline at end of file diff --git a/DataWork/Code/Lab 5 - Data Visualization.R b/DataWork/Code/Lab 5 - Data Visualization.R deleted file mode 100644 index ef99d4e..0000000 --- a/DataWork/Code/Lab 5 - Data Visualization.R +++ /dev/null @@ -1,45 +0,0 @@ -# ------------------------------------------------------------------------------ # -# # -# DIME # -# Data Visualization # -# # -# ------------------------------------------------------------------------------ # - -# Load data ---------------------------------------------------------------------- - - # Load LWH data - lwh <- read.csv(file.path(finalData,"lwh_clean.csv"), - header = T) - - # Subset data - covariates <- c("age_hhh", - "num_dependents", - "income_total_trim", - "expend_food_yearly", - "w_area_plots_a") - lwh_simp <- lwh[,covariates] - -# EXERCISE 1: Plot a whole data set ----------------------------------------------- - -# EXERCISE 2: Create a histogram -------------------------------------------------- - -# EXERCISE 3: Create a boxplot ---------------------------------------------------- - -# EXERCISE 4: ggpplot ------------------------------------------------------------- - - plot1 <- ggplot(data = lwh, - aes(y = w_area_plots_b, - x = w_gross_yield_b)) - -# EXERCISE 5: Plot an aggregated data set ----------------------------------------- 
- - # Aggregate anual income by year - anualInc <- - aggregate(x = lwh["income_total_trim"], # data.frame - by = list(year = lwh$year), #list - FUN = mean, na.rm = T) # function - -# EXERCISE 6: ggplot aesthectics --------------------------------------------------- - - # Subset lwh to 2018 and Rwamangana 35 - lwh_s <- lwh[lwh$year == 2018 & lwh$site_code == "Rwamangana 35", ] \ No newline at end of file diff --git a/DataWork/Code/Lab 5 - Spatial Data.R b/DataWork/Code/Lab 5 - Spatial Data.R new file mode 100644 index 0000000..6c32252 --- /dev/null +++ b/DataWork/Code/Lab 5 - Spatial Data.R @@ -0,0 +1,248 @@ +# Spatial Data Analysis with R + +# Setup ------------------------------------------------------------------------ +library(maptools) +library(rgdal) +library(ggplot2) +library(broom) +library(ggmap) +library(raster) +library(leaflet) + +# Define the filepath to the Final data folder +finalData <- "~/Documents/GitHub/dime-r-training/DataWork/DataSets/Final" + +# *** Load a Shapefile --------------------------------------------------------- +worldmap <- readOGR(dsn=finalData, layer="worldmap") + +# If the above line doesn't work, try this: +# library(maptools) +# worldmap <- readShapeSpatial(file.path(finalData,"worldmap.shp")) +# crs(worldmap) <- CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0") + +# If this fails, try running this: +# load(file.path(finalData, "spatialdata.Rda")) + +#### Plot Shapefile +plot(worldmap) + +#### Plot Subset of Shapefile +plot(worldmap[1:42,]) + +#### Look inside spatial dataframe + +# Coordinate Reference System +worldmap@proj4string + +# Dataframe +worldmap@data + +# *** Mapping with ggplot: Polygons -------------------------------------------- + +#### Make map of the world +ggplot() + + geom_polygon(data=worldmap, aes(x=long, y=lat, group=group)) + +#### Make tidy dataframe of world map +# Make tidy dataframe +worldmap$id <- row.names(worldmap) +worldmap_tidy <- tidy(worldmap) + +# Merge our variables back 
in +worldmap_tidy <- merge(worldmap_tidy, worldmap@data, by="id") + +#### Map happiness scores +ggplot() + + geom_polygon(data=worldmap_tidy, aes(x=long, y=lat, + group=group, + fill = hppy_sc)) + + theme_void() + +#### Make a prettier map with ggplot +ggplot() + + geom_polygon(data=worldmap_tidy, aes(x=long, y=lat, + group=group, + fill = hppy_sc)) + + theme_void() + + coord_quickmap() + # Removes distortions + labs(fill="Happiness\nScore") + + scale_fill_gradient(low = "firebrick4", + high = "chartreuse2") + +##### Exercise 1: Make a map of just Africa ##### + +# WRITE YOUR CODE HERE! + + + + + +# *** Mapping with ggplot: Points ---------------------------------------------- +#### Load World Bank data +wb_projects <- read.csv(file.path(finalData, "wb_projects.csv")) + +##### Exercise 2: Add World Bank Projects to Map ##### + +# WRITE YOUR CODE HERE! + + + + + +# *** Mapping with ggplot: Lines ----------------------------------------------- + +##### Exercise 3: Add trunk roads to map ##### +trunk_roads <- readOGR(dsn=finalData, layer="troads") + +# If the above line doesn't work, try this: +# library(maptools) +# trunk_roads <- readShapeSpatial(file.path(finalData,"troads.shp")) +# crs(trunk_roads) <- CRS("+proj=utm +zone=30 +a=6378249.145 +b=6356514.96582849 +units=m +no_defs") + +# WRITE YOUR CODE HERE! 
+ + + + + +#### Check Projections +# World Map Projection +worldmap@proj4string + +# Trunk Roads Projection +trunk_roads@proj4string + +#### Reproject Data +trunk_roads_reprj <- spTransform(trunk_roads, CRS("+init=epsg:4326")) + +#### Add trunk roads to map +ggplot() + + geom_polygon(data=africa_tidy, aes(x=long, y=lat, + group=group, + fill = hppy_sc)) + + geom_path(data=trunk_roads_reprj, aes(x=long, y=lat, + group=group)) + + geom_point(data=wb_projects, aes(x=longitude, + y=latitude), + size=.1) + + theme_void() + + coord_quickmap() + # Removes distortions + labs(fill="Happiness\nScore") + + scale_fill_gradient(low = "firebrick4", + high = "chartreuse2") + +#### Add Roads to Legend +ggplot() + + geom_polygon(data=africa_tidy, aes(x=long, y=lat, + group=group, + fill = hppy_sc)) + + geom_point(data=wb_projects, aes(x=longitude, + y=latitude), + size=.1) + + geom_path(data=trunk_roads_reprj, aes(x=long, y=lat, + group=group, + color="Trunk Roads")) + + theme_void() + + coord_quickmap() + # Removes distortions + labs(fill="Happiness\nScore", + color="") + # REMOVE TITLE ABOVE TRUNK ROAD IN LEGEND + scale_fill_gradient(low = "firebrick4", + high = "chartreuse2") + + scale_color_manual(values=c("blue")) # MANUALY DEFINE COLOR HERE + +# *** Basemap ------------------------------------------------------------------ +#### Basemap of Nairobi +nairobi <- c(left = 36.66, + bottom = -1.44, + right = 37.10, + top = -1.15) +nairobi_map <- get_stamenmap(nairobi, + zoom = 12, + maptype = "toner-lite") + +ggmap(nairobi_map) + +#### World Bank Projects in Nairobi +kenya_projects <- subset(wb_projects, recipients == "Kenya") + +ggmap(nairobi_map) + + geom_point(data=wb_projects, aes(x=longitude, + y=latitude), + color="orange") + + theme_void() + +#### Exercise 4: Make Size of Point Correspond with Commitment ##### + +# WRITE YOUR CODE HERE! 
+ +# *** Interactive Map ---------------------------------------------------------- + +#### Create Spatial Points Dataframe + +# Create Spatial Points Dataframe +coordinates(kenya_projects) <- ~longitude+latitude + +# Define Projection +crs(kenya_projects) <- CRS("+init=epsg:4326") + +# Plot +plot(kenya_projects, pch=16, cex=.5) + +#### Interactive Map +leaflet() %>% + addCircles(data=kenya_projects) %>% + addTiles() + +#### Better Interactive Map +leaflet() %>% + addCircles(data=kenya_projects, + popup = ~project_title, + radius = ~sqrt(commitments)*5, + weight = 1, + color = "orange") %>% + addProviderTiles(providers$Stamen.Terrain) + +##### Exercise 5: Add Two Layers on Map ##### + +# WRITE YOUR CODE HERE! + +#### Add Layers and Buttons to Interactive Map +leaflet() %>% + addCircles(data=subset(kenya_projects, + ad_sector_names == "Transport and storage"), + popup = ~project_title, + radius = ~sqrt(commitments)*5, + weight = 2.5, + color = "orange", + group = "Transport") %>% + addCircles(data=subset(kenya_projects, + ad_sector_names == "Water supply and sanitation"), + popup = ~project_title, + radius = ~sqrt(commitments)*5, + weight = 2.5, + color = "blue", + group = "Water") %>% + addProviderTiles(providers$Stamen.Terrain) %>% + addLayersControl(overlayGroups = c("Transport", "Water"), + options = layersControlOptions(collapsed = FALSE)) + +# *** Spatial Operations ------------------------------------------------------- +#### GADM Data +ken_adm1 <- getData('GADM', country='KEN', level=1) + +#### Crop Roads to Kenya (rgeos is not attached in Setup, so call it by namespace) +trunk_roads_kenya <- rgeos::gIntersection(trunk_roads_reprj, ken_adm1) + +# Convert back to spatial lines dataframe (instead of spatial lines) +trunk_roads_kenya$id <- seq_len(length(trunk_roads_kenya)) + +#### Calculate Distance to Road +distance_matrix <- rgeos::gDistance(kenya_projects, trunk_roads_kenya, byid=TRUE) +kenya_projects$dist_road <- apply(distance_matrix, 2, min) + +###### Exercise 6: Map of World Bank Projects Near Roads ##### +kenya_projects_df <- 
kenya_projects@data +kenya_projects_df$long <- kenya_projects@coords[,1] +kenya_projects_df$lat <- kenya_projects@coords[,2] + +# WRITE YOUR CODE HERE! diff --git a/DataWork/Code/Lab 6 - Spatial Data.R b/DataWork/Code/Lab 6 - Spatial Data.R deleted file mode 100644 index 65bc996..0000000 --- a/DataWork/Code/Lab 6 - Spatial Data.R +++ /dev/null @@ -1,225 +0,0 @@ -# Spatial Data in R -# DIME FC Training -# June 2018 - -# FIRST RUN MASTER FILE TO DEFINE FILEPATHS - -# *** HOUSEHOLD DATA **** ------------------------------------------------------ - -# Load Household Data ---------------------------------------------------------- -library(broom) -library(sp) -library(RColorBrewer) -library(rgdal) -library(rgeos) -library(ggrepel) -library(raster) -library(ggplot2) -library(ggmap) -library(rgeos) -library(leaflet) -library(dplyr) -library(htmlwidgets) - -hh_data <- read.csv(file.path(finalData,"HH_data.csv")) - -# Make a Basic Map ------------------------------------------------------------- - - -# Make a Better Map ------------------------------------------------------------ -hh_map1 <- ggplot() + - - # Points - geom_point(data=hh_data, - aes(x=longitude_scramble, y=latitude_scramble, color=food_security), - size=.7) + - - # Other elements to improve map - coord_quickmap() + # make sure the map doesn't look distorted. Can also use - # coord_map(), but sometimes makes the process slow - theme_void() + - scale_color_manual(values=c("green", "orange", "red")) + - labs(title="Household Food Security", color="Food Security") + - theme(plot.title = element_text(hjust = 0.5, face="bold")) # center and bold title -hh_map1 - -# Add a Basemap ---------------------------------------------------------------- -basemap <- get_map(location = c(lon=mean(hh_data$longitude_scramble), - lat=mean(hh_data$latitude_scramble)), - zoom=10, - maptype="roadmap") # roadmap, satellite, etc. 
See help(get_map) - -hh_map2 <- ggmap(basemap) + - geom_point(data=hh_data, - aes(x=longitude_scramble, - y=latitude_scramble, - color=food_security), - size=.7) + - coord_quickmap() + # make sure the map doesn't look distorted. Can also use - # coord_map(), but sometimes makes the process slow - theme_void() + - labs(title="Household Food Security", color="Food Security") + - theme(plot.title = element_text(hjust = 0.5, face="bold")) + - scale_color_manual(values=c("green", "orange", "red")) -hh_map2 - -# Trim Basemap ----------------------------------------------------------------- -hh_map2 + - scale_x_continuous(limits = c(min(hh_data$longitude_scramble), - max(hh_data$longitude_scramble)), - expand = c(.03, .03)) + - scale_y_continuous(limits = c(min(hh_data$latitude_scramble), - max(hh_data$latitude_scramble)), - expand = c(.03, .03)) - -# Spatial Dataframe ------------------------------------------------------------ -crs(hh_data_sdf) <- CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0") - - -# Spatial Operations ----------------------------------------------------------- - -# Buffer - -# Reproject -hh_data_sdf_newproj <- spTransform(hh_data_sdf, CRS("+init=epsg:3857")) - -# Calculating distance -dist_matrix <- gDistance(hh_data_sdf_newproj, byid=T) - -# Interactive Maps ------------------------------------------------------------- - -# Save Interactive Map --------------------------------------------------------- - -# Better Interactive Map ------------------------------------------------------- -pal <- colorFactor( - palette = c("Green","Yellow","Red"), - domain = hh_data_sdf$food_security) - -imap_2 <- leaflet() %>% - addProviderTiles("Stamen.Terrain") %>% - addCircleMarkers(data=hh_data_sdf, - radius=3, - fillOpacity=1, - color=~pal(food_security), - stroke=F, # remove outer-circle around circle - popup=~food_security) %>% # variable to display when click feature - # Add legend for points layer - addLegend(pal = pal, - values = 
hh_data_sdf$food_security, - title = "Food Security") -imap_2 - -# *** POLGON DATA **** --------------------------------------------------------- - -# Load and Prep Plot-Level Data ------------------------------------------------ -setwd(finalData) # set filepath to folder with shapefile -ag_fields <- readOGR(dsn=".", layer="allsitessubset", verbose = FALSE) - -# Plot File Projection - -# HH Survey Location Projection - -# Reproject plot data to have same projection as HH survey data - -# Examine Agricultural Fields Spatial Dataframe -------------------------------- - -# Aggregate/Dissolve Polygon --------------------------------------------------- - -# Buffer Polygon by Very Small Amount (get rid of weird tic) ------------------- - -# Merge Variables with Polygon ------------------------------------------------- -ag_fields_data <- read.csv(file.path(finalData, "plot_data.csv")) - -# Convert Spatial Dataframe to Dataframe for ggplot ---------------------------- - -# Make a Map ------------------------------------------------------------------- - -# Make a Better Map ------------------------------------------------------------ - -ag_map1 <- ggplot() + - geom_polygon(data = ag_fields_df, aes(x = long, y=lat, group=group, - fill=expend_food_yearly_mean), - color="black", size=0.25) + # Borders of polygons - - coord_quickmap() + # Make sure map doesn't look distorted - theme_void() + # Set theme of plot - scale_fill_gradient(low = "orangered1", high = "green1") + # Color Gradient - labs(fill="Food\nExpenditure", title="Annual Food Expenditure") + # Labels - theme(plot.title = element_text(hjust = 0.5, face="bold")) # Center title -ag_map1 - -# Color Palettes --------------------------------------------------------------- -RColorBrewer::display.brewer.all() - -# Map with Different Color Palette --------------------------------------------- -ag_map2 <- ggplot() + - geom_polygon(data = ag_fields_df, aes(x = long, y=lat, group=group, - fill=expend_food_yearly_mean), - 
color="black", size=0.25) + # Borders of polygons - - coord_quickmap() + - theme_void() + - scale_fill_distiller(palette = "Spectral", direction = 1) + # Color Gradient - labs(fill="Food\nExpenditure", title="Annual Food Expenditure") + - theme(plot.title = element_text(hjust = 0.5, face="bold")) -ag_map2 - -# Add Text to Map [Way 1] ------------------------------------------------------ - -# Create dataframe of center of each plot with site name as variable -ag_fields_center <- gCentroid(ag_fields, byid=T) -ag_fields_center <- as.data.frame(coordinates(ag_fields_center)) - -ag_fields_center$site <- ag_fields$site -names(ag_fields_center) <- c("longitude","latitude","site") - -ag_map2 + geom_text(data=ag_fields_center, aes(label=site, x=longitude, y=latitude), - check_overlap = TRUE) # makes sure text doesn't overlap - -# Add Text to Map [Way 2] ------------------------------------------------------ - - -# Add Households to Map -------------------------------------------------------- -ag_map2 <- ggplot() + - geom_polygon(data = ag_fields_df, aes(x = long, y=lat, group=group, - fill=expend_food_yearly_mean), - color="black", size=0.25) + # Borders of polygons - geom_point(data=hh_data, - aes(x=longitude_scramble, - y=latitude_scramble, - color="HH Location"), # Name an aesthetic want you want to - # appear on legend - size=.1, alpha=.6) + - scale_color_manual(values="black") + # Manually define color of points - coord_quickmap() + - theme_void() + - scale_fill_gradient(low = "orangered1", high = "green1") + - labs(fill="Food\nExpenditure", title="Annual Food Expenditure", - color="") + # Setting color="" makes the title above the legend item blank - theme(plot.title = element_text(hjust = 0.5, face="bold")) -ag_map2 - -# Interactive Map -------------------------------------------------------------- - -# Better Interactive Map ------------------------------------------------------- -pal_foodexpend <- colorNumeric("RdYlGn", ag_fields$expend_food_yearly_mean) - -imap_3 
<- leaflet() %>% - addProviderTiles("OpenStreetMap") %>% - addPolygons(data=ag_fields, - fillColor = ~pal_foodexpend(expend_food_yearly_mean), - fillOpacity = 0.6, - color="black", # color of line around polygons - weight=1, # width of line around polygons - popup=~site) -imap_3 - -# *** EXERCISE *** ------------------------------------------------------------- -# Import Rwanda Administrative Zone 3 Polygon - -# Import Vegetation Data -rwa_veg <- read.csv(file.path(finalData, "rwanda_vegetation.csv")) - -# Simplify the polygon - -# Resulting layer from gSimplify has no variables, so need to add a variable back in diff --git a/DataWork/DataSets/Intermediate/placeholder.txt b/DataWork/DataSets/Intermediate/placeholder.txt deleted file mode 100644 index e69de29..0000000 diff --git a/DataWork/DataSets/Raw/placeholder.txt b/DataWork/DataSets/Raw/placeholder.txt deleted file mode 100644 index e69de29..0000000 diff --git a/DataWork/Documentation/placeholder.txt b/DataWork/Documentation/placeholder.txt deleted file mode 100644 index e69de29..0000000 diff --git a/DataWork/Output/Final/placeholder.txt b/DataWork/Output/Final/placeholder.txt deleted file mode 100644 index e69de29..0000000 diff --git a/DataWork/Output/Raw/desc_table.tex b/DataWork/Output/Raw/desc_table.tex new file mode 100644 index 0000000..4141e8f --- /dev/null +++ b/DataWork/Output/Raw/desc_table.tex @@ -0,0 +1,18 @@ + +% Table created by stargazer v.5.2.2 by Marek Hlavac, Harvard University. E-mail: hlavac at fas.harvard.edu +% Date and time: Thu, Dec 20, 2018 - 2:20:26 PM +\begin{table}[!htbp] \centering + \caption{} + \label{} +\begin{tabular}{@{\extracolsep{5pt}}lccccc} +\\[-1.8ex]\hline +\hline \\[-1.8ex] +Statistic & \multicolumn{1}{c}{N} & \multicolumn{1}{c}{Mean} & \multicolumn{1}{c}{St. 
Dev.} & \multicolumn{1}{c}{Min} & \multicolumn{1}{c}{Max} \\ +\hline \\[-1.8ex] +Happy score & 470 & 5.37 & 1.14 & 2.69 & 7.59 \\ +GDP per capita & 470 & 0.93 & 0.42 & 0.00 & 1.87 \\ +Freedom & 470 & 0.40 & 0.15 & 0.00 & 0.67 \\ +Trust in government and corruption & 470 & 0.13 & 0.11 & 0.00 & 0.55 \\ +\hline \\[-1.8ex] +\end{tabular} +\end{table} diff --git a/DataWork/Output/Raw/placeholder.txt b/DataWork/Output/Raw/placeholder.txt deleted file mode 100644 index e69de29..0000000 diff --git a/Exercise solutions/Data cleaning/statsYearBook_processing.R b/Exercise solutions/Data cleaning/statsYearBook_processing.R new file mode 100644 index 0000000..edb7d26 --- /dev/null +++ b/Exercise solutions/Data cleaning/statsYearBook_processing.R @@ -0,0 +1,133 @@ +#------------------------------------------------------------------------------# + +# UN stats year book data processing + +#------------------------------------------------------------------------------# + + + +if (Sys.getenv("USERNAME") == "Leonardo"){ + projectFolder <- "C:/Users/Leonardo/Documents/GitHub/dime-r-training" + dropbox <- "C:/Users/Leonardo/Dropbox/Work/WB/R Training/R training" + +} + +if (Sys.getenv("USERNAME") == "WB519128"){ + projectFolder <- file.path("C:/Users/WB519128/Documents/GitHub/dime-r-training") + dropbox <- "C:/Users/WB519128/Dropbox/Work/WB/R Training/R training" + +} + +UNstatsDat <- file.path(dropbox,"Data/World/Raw/UN stats yearbook") + + +#------------------------------------------------------------------------------# +#### Load data #### + +pop <- read.csv(file.path(UNstatsDat, "T02_Pop.csv"), header = T, stringsAsFactors = F) +int <- read.csv(file.path(UNstatsDat,"T29_Internet.csv"), header = T, stringsAsFactors = F) +crime <- read.csv(file.path(UNstatsDat,"T12_Crime.csv"), header = T, stringsAsFactors = F) +#gdp <- read.csv(file.path(UNstatsDat,"T13_Gdp.csv"), header = T, stringsAsFactors = F) +#educ <- read.csv(file.path(UNstatsDat,"T07_Education.csv"), header = T, 
stringsAsFactors = F) +teach <- read.csv(file.path(UNstatsDat,"T08_Teachers.csv"), header = T, stringsAsFactors = F) +med <- read.csv(file.path(UNstatsDat,"T10_Health_Personnel.csv"), header = T, stringsAsFactors = F) + +#------------------------------------------------------------------------------# +#### Basic cleaning #### + +treat <- + function(x){ + names(x) <- c("code", + "country", + "year", + "var", + "value", + "comments", + "source") + + nonCountries <- c("Total, all countries or areas", + "Sub-Saharan Africa", + "Latin America & the Caribbean", + "South-central Asia", + "Australia and New Zealand", + "Oceania", + "Africa", + "Northern Africa", + "Eastern Africa", + "Middle Africa", + "Southern Africa", + "Western Africa", + "Americas", + "Northern America", + "Caribbean", + "Central America", + "South America", + "Asia", + "Central Asia", + "Eastern Asia", + "South-eastern Asia", + "Southern Asia", + "Western Asia", + "Europe", + "Eastern Europe", + "Northern Europe", + "Southern Europe", + "Western Europe", + "Melanesia") + + + keepVars <- c("Population mid-year estimates (millions)", + "Sex ratio (males per 100 females)", + "Health personnel: Physicians (number)", + "Teachers at primary level (thousands)", + "Teachers at secondary level (thousands)", + "Intentional homicide rates per 100,000", + "Percentage of individuals using the internet") + + + x$comments <- NULL + x$source <- NULL + + + x <- x[!(x$country %in% nonCountries),] + x <- x[x$year > 2014,] + + x <- x[x$var %in% keepVars,] + + x <- spread(x, + key = var, + value = value) + + + + return(x) + } + + + +pop <- treat(pop) +int <- treat(int) +crime <- treat(crime) +#gdp <- treat(gdp) +#educ <- treat(educ) +teach <-treat(teach) +med <- treat(med) + + +#------------------------------------------------------------------------------# +#### Merge #### + +teach$year <- as.integer(teach$year) +teach$code <- as.integer(teach$code) + + +x1 <- pop %>% + full_join(int, by = c("code", "year", 
"country")) %>% + full_join(crime, by = c("code", "year", "country")) %>% + full_join(teach, by = c("code", "year", "country")) %>% + full_join(med, by = c("code", "year", "country")) + +whr <- +rename(whr, code = coutry_code) + +foo <- left_join(whr, x1, by = c("code", "year")) diff --git a/Exercise solutions/Data cleaning/whr_processing.R b/Exercise solutions/Data cleaning/whr_processing.R new file mode 100644 index 0000000..0ecff48 --- /dev/null +++ b/Exercise solutions/Data cleaning/whr_processing.R @@ -0,0 +1,127 @@ +#------------------------------------------------------------------------------# + +# R training - World happiness report data processing + +#------------------------------------------------------------------------------# + + +# Load the package that contains the ToothGrowth dataset +library(datasets) + +# File paths + +if (Sys.getenv("USERNAME") == "WB501238"){ + projectFolder <- "C:/Users/WB501238/Documents/GitHub/dime-r-training" + +} + +if (Sys.getenv("USERNAME") == "Leonardo"){ + projectFolder <- "C:/Users/Leonardo/Documents/GitHub/dime-r-training" + +} + +if (Sys.getenv("USERNAME") == "WB519128"){ + projectFolder <- file.path("C:/Users/WB519128/Documents/GitHub/dime-r-training") +} + +# File paths +dataWorkFolder <- file.path(projectFolder,"DataWork") + +Data <- file.path(dataWorkFolder,"DataSets") +finalData <- file.path(Data,"Final") + + +#------------------------------------------------------------------------------# +# Load CSV data +whr15 <- read.csv(file.path(Data,"Raw/WHR2015.csv"), header = T) +whr16 <- read.csv(file.path(Data,"Raw/WHR2016.csv"), header = T) +whr17 <- read.csv(file.path(Data,"Raw/WHR2017.csv"), header = T) + +codes <- read.csv(file.path(Data,"Raw/country_codes.csv"), header = T) + + +#### Fix countries and regions in 2017 +whr17$Region <- "" + +whr17$Country <- as.character(whr17$Country) +whr17$Country[whr17$Country == "Taiwan Province of China"] <- "Taiwan" +whr17$Country[whr17$Country == "Hong Kong S.A.R., China"] <- 
"Hong Kong" + + + + + +whr17$Region <- whr16$Region[match(whr17$Country, whr16$Country)] + +# countries added in 2017 + +whr17$Region[whr17$Country == "Mozambique"] <- "Sub-Saharan Africa" +whr17$Region[whr17$Country == "Lesotho"] <- "Sub-Saharan Africa" +whr17$Region[whr17$Country == "Central African Republic"] <- "Sub-Saharan Africa" + + + + +#### Standardize variables + +#Year variable +whr15$year <- 2015 +whr16$year <- 2016 +whr17$year <- 2017 + + +keepVars <- c("Country", + "Region", + "year", + "Happiness.Rank", + "Happiness.Score", + "Economy..GDP.per.Capita.", + "Family", + "Health..Life.Expectancy.", + "Freedom", + "Trust..Government.Corruption.", + "Generosity", + "Dystopia.Residual") + +whr15 <- whr15[,keepVars] +whr16 <- whr16[,keepVars] +whr17 <- whr17[,keepVars] + + + + +#------------------------------------------------------------------------------# +# Append +whr <- rbind(whr17, whr16, whr15) + +# Rename variables +newVar_names <- c("country", + "region", + "year", + "happy_rank", + "happy_score", + "gdp_pc", + "family", + "health", + "freedom", + "trust_gov_corr", + "generosity", + "dystopia_res") +names(whr) <- newVar_names + +#------------------------------------------------------------------------------# +#### Match codes #### + +whr$country[whr$country == "Somaliland region"] <- "Somaliland Region" + +whr$coutry_code <- codes$code[match(whr$country, codes$country)] + +whr <- dplyr::select(whr, coutry_code, country, dplyr::everything()) + +#------------------------------------------------------------------------------# +#### Export #### + +write.csv(whr, + file.path(Data, "whr_panel.csv"), + na = "", + row.names = F) diff --git a/Exercise solutions/Lab 2 - Challenge.R b/Exercise solutions/Lab 2 - Challenge.R new file mode 100644 index 0000000..db1bc92 --- /dev/null +++ b/Exercise solutions/Lab 2 - Challenge.R @@ -0,0 +1,80 @@ +library(stargazer) +library(tidyverse) + +# File paths + +if (Sys.getenv("USERNAME") == "luiza"){ + projectFolder <- 
"C:/Users/luiza/Documents/GitHub/dime-r-training" + +} + +if (Sys.getenv("USERNAME") == "WB501238"){ + projectFolder <- "C:/Users/WB501238/Documents/GitHub/dime-r-training" + +} + +if (Sys.getenv("USERNAME") == "Leonardo"){ + projectFolder <- "C:/Users/Leonardo/Documents/GitHub/dime-r-training" + +} + +if (Sys.getenv("USERNAME") == "WB519128"){ + projectFolder <- file.path("C:/Users/WB519128/Documents/GitHub/dime-r-training") +} + +# File paths +dataWorkFolder <- file.path(projectFolder,"DataWork") + +Data <- file.path(dataWorkFolder,"DataSets") +finalData <- file.path(Data,"Final") +rawOutput <- file.path(dataWorkFolder,"Output","Raw") + +# Load CSV data +whr <- read.csv(file.path(finalData,"whr_panel.csv"), + header = T, + stringsAsFactors = F) + +#### Construct a duplicated df +x1 <- whr +x1$stat <- "mean" + +x2 <- whr +x2$stat <- "N" + + +x12 <- rbind(x1,x2) + +#### Collapse +happy_table2 <- + aggregate(happy_score ~ stat + region + year, + data = x12, + FUN = mean) + +happy_table2 <- + rename(happy_table2, value = happy_score) + +#### Freq df + +freqdf <- + as.data.frame(table(whr$region, whr$year)) + +#### replace values +happy_table2$value[happy_table2$stat == "N"] <- freqdf$Freq + +happy_table2 <- + select(happy_table2, region, year, stat, value) + +#### Reshape +ht_wd <- + spread(happy_table2, + key = year, + value = value) + +#### Cosmetics + +ht_wd$region[as.integer(row.names(ht_wd)) %% 2 == 0] <- "" + +stargazer(ht_wd, + summary = F, + rownames = F) + diff --git a/Exercise solutions/Lab 4 - Descriptive Analysis.R b/Exercise solutions/Lab 2 - Descriptive Analysis.R similarity index 100% rename from Exercise solutions/Lab 4 - Descriptive Analysis.R rename to Exercise solutions/Lab 2 - Descriptive Analysis.R diff --git a/Exercise solutions/Lab 3 - Data Processing.R b/Exercise solutions/Lab 3 - Data Processing.R deleted file mode 100644 index 36a2497..0000000 --- a/Exercise solutions/Lab 3 - Data Processing.R +++ /dev/null @@ -1,142 +0,0 @@ -# 
------------------------------------------------------------------------------ # -# # -# DIME # -# Data Processing in R # -# # -# ------------------------------------------------------------------------------ # - -# PART 1: Load data -------------------------------------------------------------- - - # Load panel data - panel <- read.csv(file.path(rawData,"lwh_panel.csv")) - - # Load endline data - endline <- read.dta13(file.path(rawData,"endline_data_raw.dta")) - -# PART 1: Explore data ---------------------------------------------------------- - - # See objects' formats - class(endline) - class(panel) - - # How many observations and variables? - dim(endline) - dim(panel) - - # Variables names - names(endline) - - # Data set structure - str(panel) - - # Variable structure - str(panel$treatment_hh) - -# PART 2: ID variables ------------------------------------------------------------ - - # Uniquely identifying? - sum(duplicated(panel[,c("hh_code","year")])) - # Fully identifiying? - sum(is.na(panel$year)) - sum(is.na(panel[,c("hh_code","year")])) - - # Create panel ID - panel$id <- (panel$hh_code * 10000) + panel$year - - # Check properties - sum(duplicated(panel$id)) - sum(is.na(panel$id)) - - # Here's a shortcut similar to the isid variable in Stata - length(unique(panel$id, na.rm = TRUE)) == length(panel$id) - -# PART 3: Drop variables ------------------------------------------------------------ - - # List variables we want in the final data set - id_vars <- c("hh_code", "wave", "year", "treatment_hh", - "treatment_site", "site_code") - demographic_vars <- c("gender_hhh", "age_hhh", "num_dependents", "read_and_write") - yield_vars <- c("w_gross_yield_a", "w_gross_yield_b") - food_vars <- c("expend_food_yearly", "expend_food_lastweek", "wdds_score") - - head(names(panel), 40) - income_vars <- income_vars <- names(panel)[28:37] - - names(panel)[grep("area", names(panel))] - plot_area_vars <- c("w_gross_yield_a", "w_gross_yield_b", "w_gross_yield_c") - - - # Subset 
data set - lwh <- panel[,c(id_vars, - demographic_vars, - yield_vars, - food_vars, - income_vars, - plot_area_vars)] - -# PART 4: Factor variables ------------------------------------------------------------------- - -# Create a factor variable - - # The numeric variable - table(lwh$gender_hhh) - - # Turn numeric variable into factor - lwh$gender_hhh <- factor(lwh$gender_hhh, - levels = c(0, 1), - labels = c("Female", "Male")) - - # The factor variable - table(lwh$gender_hhh) - -# Order an existing factor variable - - # Unordered factor - str(panel$wave) - levels(panel$wave) - table(panel$wave, panel$year) - - # Ordered factor - panel$wave <- factor(panel$wave, - c("Baseline", "FUP1&2", "FUP3", "FUP4", "Endline"), - ordered = T) - str(panel$wave) - table(panel$wave, panel$year) - -# PART 5: Construct indicators ------------------------------------------------------------- - - # Aggregate income - lwh$income_total <- rowSums(lwh[,income_vars], na.rm = TRUE) - lwh[,income_vars] <- NULL - - # A customized function to winsorize observations at the 90th percentile - winsor <- function(x) { - x[x > quantile(x, 0.9, na.rm = T)] <- - quantile(x, 0.9, na.rm = T) - return(x) - } - - # Create winsorized income - lwh$income_total_win <- winsor(lwh$income_total) - - # Create the trim function - trim <- function(x) { - x[x > quantile(x, 0.9, na.rm = T)] <- NA - return(x) - } - - # Trim income - lwh$income_total_trim <- trim(lwh$income_total) - - # Compare variables - summary(lwh$income_total) - summary(lwh$income_total_win) - summary(lwh$income_total_trim) - -# PART 6: Save ------------------------------------------------------------------------------ - - # Save the lwh data set - write.csv(lwh, - file.path(finalData,"lwh_clean.csv"), - row.names = F) - \ No newline at end of file diff --git a/Exercise solutions/Lab 5 - Data Visualization.R b/Exercise solutions/Lab 5 - Data Visualization.R deleted file mode 100644 index 5853fcf..0000000 --- a/Exercise solutions/Lab 5 - Data 
Visualization.R +++ /dev/null @@ -1,165 +0,0 @@ -# ------------------------------------------------------------------------------ # -# # -# DIME # -# Data Visualization # -# # -# ------------------------------------------------------------------------------ # - -# Set up ------------------------------------------------------------------------- - - # Install packages - install.packages(c("ggplot2","plotly"), - dependencies = TRUE) - - # Load packages - library(ggplot2) - library(plotly) - - # Set file paths - projectFolder <- "YOUR/FOLDER/PATH" - finalData <- file.path(projectFolder, - "DataWork","DataSets","Final") - - # Load LWH data - lwh <- read.csv(file.path(finalData,"lwh_clean.csv"), - header = T) - - # Subset data - covariates <- c("age_hhh", - "num_dependents", - "income_total_trim", - "expend_food_yearly", - "w_area_plots_a") - lwh_simp <- lwh[,covariates] - -# EXERCISE 1: Plot a whole data set ----------------------------------------------- - - plot(lwh_simp) - -# EXERCISE 2: Create a histogram -------------------------------------------------- - - hist(lwh_simp$w_area_plots_a, - col = "green") - -# EXERCISE 3: Create a boxplot ---------------------------------------------------- - - boxplot(lwh_simp$expend_food_yearly ~ lwh_simp$num_dependents) - -# EXERCISE 4: ggpplot ------------------------------------------------------------- - - ggplot(data = lwh, - aes(y = w_area_plots_b, - x = w_gross_yield_b)) + - geom_point() - -# EXERCISE 5: Plot an aggregated data set ----------------------------------------- - - # Aggregate anual income by year - anualInc <- - aggregate(x = lwh["income_total_trim"], # data.frame - by = list(year = lwh$year), #list - FUN = mean, na.rm = T) # function - - # Plot income by year - ggplot(data = anualInc, - aes(y = income_total_trim, - x = year)) + - geom_line() - - # Aggregate anual income by year and treatment status - anualInc <- - aggregate(x = lwh["income_total_trim"], - by = list(Year = lwh$year, - Treatment = 
lwh$treatment_hh), - FUN = mean, na.rm = T) - - # Plot income by year and treatment status - ggplot(anualIncGen, - aes(y = income_total_win, - x = year, - color = treatment, - group = treatment)) + - geom_line() + - geom_point() - -# EXERCISE 6: ggplot aesthectics --------------------------------------------------- - - # Subset lwh to 2018 and Rwamangana 35 - lwh_s <- lwh[lwh$year == 2018 & lwh$site_code == "Rwamangana 35", ] - - # Simple scatter plot - ggplot(lwh_s, aes(x = w_area_plots_a, - y = income_total_trim)) + - geom_point() - - # Adding women's dietary score to scatter plot - ggplot(data = lwh_s, - aes(y = w_area_plots_a, - x = income_total_trim, - color = factor(wdds_score))) + - geom_point() - - # Adding food expenditure as well - ggplot(data = lwh_s, - aes(y = w_area_plots_a, - x = income_total_trim, - color = factor(wdds_score), - size = expend_food_lastweek)) + - geom_point() - - # Using a continuous variable for color instead of a categorical one - ggplot(data = lwh_s, - aes(y = w_area_plots_a, - x = income_total_trim, - color = w_gross_yield_a, - size = expend_food_lastweek)) + - geom_point() - -# EXERCISE 7: Labelling in ggplot ---------------------------------------------------- - - # A properly labelled plot - ggplot(lwh_s, aes(y = w_area_plots_a, - x = income_total_trim, - size = expend_food_lastweek, - color = factor(wdds_score))) + - geom_point() + - ggtitle("My pretty plot") + - xlab("Total Income (Wins.)") + - ylab("Plot area in season A (Wins.)") + - scale_color_discrete(name = "Women's dietary diversity score") + - scale_size_continuous(name = "Food exp. 
last week") - -# EXERCISE 8: Saving a plot ---------------------------------------------------------- - - # Open PDF graphics device - pdf(file = file.path(rawOutput, "Example plot.pdf")) - - # Plot - ggplot(data = lwh_s, - aes(y = w_area_plots_a, - x = income_total_trim, - color = factor(wdds_score), - size = expend_food_lastweek)) + - geom_point() + - ggtitle("My pretty plot") + - xlab("Total Income (Wins.)") + - ylab("Plot area in season A (Wins.)") + - scale_color_discrete(name = "Women's dietary diversity score") + - scale_size_continuous(name = "Food exp. last week") - - # Close PDF graphics device - dev.off() - -# BONUS EXERCISE: Interactive graph -------------------------------------------------- - - # Store graph in interactive_plot - interactive_plot <- ggplot(data = lwh_s, - aes(y = w_area_plots_a, - x = income_total_trim, - color = factor(wdds_score))) + - geom_point() - - # Use ggplotly to create an interactive plot - ggplotly(interactive_plot) - - \ No newline at end of file diff --git a/Exercise solutions/Lab 6 - Data Processing.R b/Exercise solutions/Lab 6 - Data Processing.R new file mode 100644 index 0000000..156879d --- /dev/null +++ b/Exercise solutions/Lab 6 - Data Processing.R @@ -0,0 +1,165 @@ + + +# for (year in 2015:2017) { +# download.file(paste0("https://www.kaggle.com/unsdsn/world-happiness/downloads/", year ,".csv/2"), +# destfile = file.path(Data,paste0("Raw/WHR", year, ".csv")), +# method = "curl") # method = "curl" [mac only for https] +# +# } + +dateDownloaded <- date() + +whr15 <- read.csv(file.path(Data,"Raw/WHR2015.csv"), + header = T, + stringsAsFactors = F) +whr16 <- read.csv(file.path(Data,"Raw/WHR2016.csv"), + header = T, + stringsAsFactors = F) +whr17 <- read.csv(file.path(Data,"Raw/WHR2017.csv"), + header = T, + stringsAsFactors = F) +View(whr15) +class(whr15) +dim(whr15) +str(whr15) +head(whr15) +summary(whr15) + + +str(whr15) +str(whr16) +str(whr17) + +n_distinct(whr15$Country, na.rm = TRUE) +n_distinct(whr16$Country, 
na.rm = TRUE) +n_distinct(whr17$Country, na.rm = TRUE) + +setdiff(whr15$Country, whr16$Country) +setdiff(whr16$Country, whr15$Country) +setdiff(whr16$Country, whr17$Country) +setdiff(whr17$Country, whr16$Country) + + +whr15$Country[whr15$Country == "Somaliland region"] <- "Somaliland Region" +whr17$Country[whr17$Country == "Hong Kong S.A.R., China"] <- "Hong Kong" +whr17$Country[whr17$Country == "Taiwan Province of China"] <- "Taiwan" + + +names(whr15) +names(whr16) + + +whr15$year <- 2015 +whr16$year <- 2016 +whr17$year <- 2017 + +whr_panel <- rbind(whr15, whr16, whr17) +whr_panel <- bind_rows(whr15, whr16, whr17) + +setdiff(names(whr15), names(whr16)) +setdiff(names(whr16), names(whr15)) +setdiff(names(whr17), names(whr16)) +setdiff(names(whr16), names(whr17)) + +whr17 <- rename(whr17, + Lower.Confidence.Interval = Whisker.low, + Upper.Confidence.Interval = Whisker.high) + +whr_panel <- rbind(whr15, whr16, whr17) +whr_panel <- bind_rows(whr15, whr16, whr17) + + + +keepVars <- c("Country", + "Region", + "year", + "Happiness.Rank", + "Happiness.Score", + "Economy..GDP.per.Capita.", + "Family", + "Health..Life.Expectancy.", + "Freedom", + "Trust..Government.Corruption.", + "Generosity", + "Dystopia.Residual") + +whr15 <- select(whr15, keepVars) +whr16 <- select(whr16, keepVars) + +regions <- select(whr16, Country, Region) + + +whr17 <- left_join(whr17, regions) +whr17 <- select(whr17, keepVars) + +whr_panel <- rbind(whr15, whr16, whr17) + +names(whr_panel) <- c("country", + "region", + "year", + "happy_rank", + "happy_score", + "gdp_pc", + "family", + "health", + "freedom", + "trust_gov_corr", + "generosity", + "dystopia_res") + + +any(is.na(whr_panel$region)) +sum(is.na(whr_panel$region)) +whr_panel$country[is.na(whr_panel$region)] + +whr_panel$region[whr_panel$country == "Mozambique"] <- "Sub-Saharan Africa" +whr_panel$region[whr_panel$country == "Lesotho"] <- "Sub-Saharan Africa" +whr_panel$region[whr_panel$country == "Central African Republic"] <- "Sub-Saharan 
Africa" + +any(is.na(whr_panel$region)) + +str(whr_panel$region) +whr_panel$region_cat <- as.factor(whr_panel$region) +str(whr_panel$region_cat) +levels(whr_panel$region_cat) +table(whr_panel$region_cat) + +whr_panel$region_ord <- factor(whr_panel$region, + levels = c("Australia and New Zealand", + "Central and Eastern Europe", + "Western Europe", + "Latin America and Caribbean", + "North America", + "Southeastern Asia", + "Southern Asia", + "Eastern Asia", + "Middle East and Northern Africa", + "Sub-Saharan Africa"), + ordered = T) +table(whr_panel$region_ord) + +whr_panel <- mutate(whr_panel, happy_high = happy_score > median(happy_score)) + +whr_panel$happy <- factor(whr_panel$happy_high, + levels = c(FALSE, TRUE), + labels = c("Not so happy", "Happy")) + +happy_panel <- filter(whr_panel, happy == "Happy") + +# Reshaping +whr_happy <- select(whr_panel, country, region, year, happy) + +whr_happy_wide <- spread(whr_happy, + key = year, + value = happy) + +whr_happy_long <- gather(whr_happy_wide, + key = year, + value = happy, + `2015`, `2016`, `2017`) + +whr_happy <- select(whr_panel, country, region, year, happy, happy_score) + +whr_hapy_reshape <- reshape(whr_happy, + v.names = "year", + ) \ No newline at end of file diff --git a/Exercise solutions/MASTER.R b/Exercise solutions/MASTER.R index 6853b0c..f5e1e97 100644 --- a/Exercise solutions/MASTER.R +++ b/Exercise solutions/MASTER.R @@ -2,7 +2,7 @@ # # # DIME # # Introduction to R for Stata users # -# MASTER DO_FILE # +# MASTER # # # #------------------------------------------------------------------------------# @@ -13,7 +13,7 @@ # WRITTEN BY: Luiza Cardoso de Andrade, Robert A. 
Marty, Leonardo Viotti -# Last modified in May 2018 +# Last modified in Nov 2018 # PART 0: Clear boiler plate -------------------------------------------------- diff --git a/Presentations/Lab 1 - Intro to R.Rmd b/Presentations/Lab 1 - Intro to R.Rmd index 4634011..49e1d6a 100644 --- a/Presentations/Lab 1 - Intro to R.Rmd +++ b/Presentations/Lab 1 - Intro to R.Rmd @@ -1,7 +1,7 @@ --- -title: "R basics for Stata users" -subtitle: "Field Coordinator Training - R Track" -date: "June 2018" +title: "R basics" +subtitle: "R for Stata Users" +date: "November-December 2018" author: "Luiza Andrade, Leonardo Viotti & Rob Marty " output: beamer_presentation: @@ -9,7 +9,7 @@ output: theme: "Madrid" colortheme: "whale" fonttheme: "default" - toc: true + toc: false includes: in_header: header.tex --- @@ -25,17 +25,17 @@ library(datasets) # File paths if (Sys.getenv("USERNAME") == "WB501238"){ - projectFolder <- "C:/Users/WB501238/Documents/GitHub/R-Training" + projectFolder <- "C:/Users/WB501238/Documents/GitHub/dime-r-training" } if (Sys.getenv("USERNAME") == "Leonardo"){ - projectFolder <- "C:/Users/Leonardo/Documents/GitHub/R-Training" + projectFolder <- "C:/Users/Leonardo/Documents/GitHub/dime-r-training" } if (Sys.getenv("USERNAME") == "WB519128"){ - projectFolder <- file.path("C:/Users/WB519128/Documents/GitHub/R-Training") + projectFolder <- file.path("C:/Users/WB519128/Documents/GitHub/dime-r-training") } # File paths @@ -44,47 +44,66 @@ if (Sys.getenv("USERNAME") == "WB519128"){ Data <- file.path(dataWorkFolder,"DataSets") finalData <- file.path(Data,"Final") # Load CSV data -lwh <- read.csv(file.path(finalData,"lwh_clean.csv"), +# lwh <- read.csv(file.path(finalData,"lwh_clean.csv"), +# header = T) + +whr <- read.csv(file.path(finalData,"whr_panel.csv"), header = T) ``` - # Introduction ## Introduction These training sessions will offer a quick introduction to R, its amazing features and why it is so much better than Stata. 
-R is a powerful and flexible tool with a big and active community of users and developers that constantly posts in blogs and forums. +## Introduction + +This first session will present the basic concepts you will need to use R. + +The next sessions will include: + + * __Exporting summary statistics and regression tables__ (Nov 20) + * __Creating awesome-looking plots__ (Nov 29) + * __Writing reproducible and readable code__ (Dec 6) + * __Creating awesome-looking maps__ (Dec 13) + * __Cleaning data__ (Dec 20) + + +For the most recent versions of these trainings, visit the R-training GitHub repo at +https://github.com/worldbank/dime-r-training + +(currently under the 'development' branch) ## Introduction -Why choose R over Stata? +Some advantages of R over Stata: - * It is way cooler. * It is less specialized: + More flexibility when programming. - + Many other functionalities. - * Much broader network of users in general: - + Using google is a lot easier (you'll never want to see Statalist again in your life). + + Many more functionalities. + * Much broader network of users: + + More resources online, which makes using Google a lot easier. You'll never want to see Statalist again in your life. + Development of new features and bug fixes happens faster. - * It makes prettier graphs. + * It is way cooler. + ## Introduction -What are the possible disadvantages of R? +Some possible disadvantages of R: - * Steeper learning curve (at least in the beginning). + * Higher cost of entry than Stata. + + That doesn't mean that the learning curve is steeper all the way up! * Stata is more specialized: - + Certain common tasks are simpler in Stata, especially when you're doing them for the first time in R. + + Certain common tasks are simpler in Stata. * Stata has wider adoption among micro-econometricians. + Network externalities in your work environment. + Development of new specialized techniques and tools could happen faster (e.g. *ietoolkit*). 
-## Introduction +## Introduction Here are some other advantages: @@ -100,23 +119,17 @@ Here are some other advantages: * You can create complex Markdown documents. This presentation, for example, is entirely done in RStudio. -* You can create dashboards and online applications with the Shiny package. - +* You can create interactive dashboards and online applications with the Shiny package. ## Introduction -This first session will present the basic concepts you will need to use R. +What about Python? -The next sessions will include: + * Python is even more flexible and has more users than R. So, why should I bother to learn R? + * Despite being super popular for data science, Python has fewer libraries developed for econometrics. + * Python still cannot do everything Stata does without some trouble, R can. + * R and Python are very similar, specially if your background is in Stata. -* __Coding for reproducible research__ - Programming basics and best practices. -* __Data processing__ - Data processing workflow and main functions. -* __Descriptive analysis__ - Exploratory and publication tables. -* __Data visualization__ - Exploratory and publication graphs. -* __Spatial analysis__ - GIS basics in R. - -For the most recent versions of these trainings, visit the R-training GitHub repo at -https://github.com/luizaandrade/R-Training # Getting started @@ -130,19 +143,23 @@ This training requires that you have R installed in your computer: * If you're in the US, you can directly visit the mirror at Berkley university at (https://cran.cnr.berkeley.edu). - * Although, not necessary, we also strongly suggest installing R studio. You can get it in (https://www.rstudio.com/), but you need to install R first. + * we also strongly suggest installing R studio. You can get it in (https://www.rstudio.com/), but you need to install R first. 
+ ## Getting started +Let's start by loading the data set we'll be using: + ### Exercise 1: Import data -Let's start by loading the data set we'll be using: + 1. In RStudio, go to File > Import Dataset > From Text (Base) and open the whr_panel.csv file. + + Depending on your Rstudio version, it might be File > Import Dataset > From CSV + + 2. The file should be in GitHub/dime-r-training/DataWork/DataSets/Final + + 3. Change the name to `whr` on the import window + -\begin{enumerate} - \item In RStudio, go to File > Import Dataset > From Text (Base) and open the \texttt{lwh\_clean.csv} file. Depending on your Rstudio version, it might be From CSV - \item The file should be in Session Materials/R Track/DataWork/DataSets/Final - \item Change the name to `lwh` on the import window -\end{enumerate} ## Getting started @@ -155,23 +172,25 @@ Let's start by loading the data set we'll be using: \begin{figure} \centering - \includegraphics[scale=0.25]{img/import_data3.png} + \includegraphics[scale=0.5]{img/import_data2.png} \end{figure} # RStudio interface ## RStudio interface -```{r, out.width = "100%", echo=FALSE} -knitr::include_graphics("img/interface2.png") -#![](img/interface.png) -``` +\begin{figure} +\centering + \includegraphics[width=12cm,height=7.7cm]{img/interface.png} +\end{figure} + + ## RStudio interface \begin{figure} \centering - \includegraphics[width=12cm,height=7.7cm]{img/enviroment2.png} + \includegraphics[width=12cm,height=7.7cm]{img/enviroment.png} \end{figure} ## RStudio interface @@ -195,6 +214,12 @@ knitr::include_graphics("img/interface2.png") \includegraphics[width=12cm,height=7.7cm]{img/scritpt3.png} \end{figure} +## RStudio interface + +\begin{figure} +\centering + \includegraphics[width=12cm,height=7.7cm]{img/scritpt4.png} +\end{figure} # Object-oriented language @@ -218,51 +243,69 @@ knitr::include_graphics("img/interface2.png") * Whenever you run anything you intend to use in the future, you need to store it as an object. 
+ + ## Object-oriented language -To better understand the idea, we're going to use the data from the Rwanda LWH project. First, let's take a look at the data. +To better understand the idea, we're going to use the data from the United Nations' World Happiness Report. First, let's take a look at the data. Type the following code to explore the data: ```{r, include = T, results = "hide"} # We can use the function View() to browse the whole data -View(lwh) +View(whr) -# Alternatively we can print the first 5 obs. with head() -head(lwh) +# Alternatively we can print the first 6 obs. with head() +head(whr) ``` +## Object-oriented language + +\begin{figure} +\centering + \includegraphics[scale=0.4]{img/View.png} +\end{figure} ## Object-oriented language +\scriptsize -Now, let's try some simple manipulations. First, assume we're only interested in data of the year 2018. +```{r, include = T} + +head(whr) + +``` + +## Object-oriented language + +Now, let's try some simple manipulations. First, assume we're only interested in data of the year 2015. ### Exercise 2: Subset the data -1. Subset the ``lwh`` data set, keeping only observations where variable `year` equals `2018`. + +1. Subset the data set, keeping only observations where variable `year` equals `2015`. ```{r, eval=FALSE} # To do that we'll use the subset() function -subset(lwh, year == 2018) +subset(whr, year == 2015) ``` -2. Then, look again at the first 5 observations +2. Then, look again at the first 6 observations ```{r, include = T, results = "hide"} # Use the head() function again -head(lwh) +head(whr) ``` ## Object-oriented language \scriptsize ```{r, eval = F} -head(lwh) +head(whr) ``` ```{r, echo = F} # Use the head() function again -head(lwh[,1:15]) +head(whr[,1:12]) ``` ## Object-oriented language @@ -272,8 +315,8 @@ We can see that nothing happened to the original data. 
This happens because we d ### To store an object, we use the assignment operator (`<-`): ```{r, include = T, results = "hide"} -# Assign a value of the Answer to the Ultimate Question of -# Life, the Universe, and Everything +# Assign the Answer to the Ultimate Question of Life, +# the Universe, and Everything x <- 42 ``` @@ -283,17 +326,17 @@ From now on, *x* is associated with the stored value (until you replace it delet ### Exercise 3: Create an object -Create a new data set, called `lwh2018`, that is a subset of the ``lwh`` data set containing only data from the year 2018. +Create a new data set, called `whr2015`, that is a subset of the `whr` data set containing only data from the year 2015. ```{r, include = T, results = "hide"} # Using the same function but now assigning it to an object -lwh2018 <- subset(lwh, year == 2018) +whr2015 <- subset(whr, year == 2015) # Display the 5 first obs. of the new data -head(lwh2018) +head(whr2015) # Notice that we still have the original data set intact -head(lwh) +head(whr) ``` @@ -301,48 +344,65 @@ head(lwh) \scriptsize ```{r, eval= F} -head(lwh2018) +head(whr2015) ``` ```{r, echo= F} # Use the head() function again -head(lwh2018[,1:12]) +head(whr2015[,1:12]) ``` +## Object-oriented language +\scriptsize - +```{r, eval= F} +head(whr) +``` + +```{r, echo= F} +# Use the head() function again +head(whr[,1:12]) +``` - - - - ## Object-oriented language -### Two an important concepts to take note: - 1. In R, if you want to change your data, you need to store it in an object. It is possible to simply replace the original data, but often, it's more practical to create a new dataset. +You can also see that your environment pane now has two objects: + +\begin{figure} +\centering + \includegraphics[scale=0.5]{img/enviroment_2vars.png} +\end{figure} - 2. Print (display) is built into R. If you execute an action and don't store it anywhere, R will simply print the results of that action but won't save anything in the memory. 
+## Object-oriented language +### Two important concepts to take note: + 1. In R, if you want to change your data, you need to store it in an object. + 2. It is possible to simply replace the original data. This happens if you assign the new object to the same name as the original. + 3. Print (display) is built into R. If you execute any action without storing it, R will simply print the results of that action but won't save anything in the memory. # R objects ## R objects -Objects can have different structures, that is, different ways to store data. Objects are the building blocks of R programming. You can create and manipulate them to explore your data and construct analytical outputs. +Objects are the building blocks of R programming. T his section will explore some of the most common. + +This will give you the foundation to explore your data and construct analytical outputs. -There are several types of objects. Here are the ones we will cover: +## R objects + +Here are the types of object we will cover today: -* __Vectors:__ an unidimensional object that stores a sequence of values +* __Vectors:__ an uni-dimensional object that stores a sequence of values * __Data frames:__ a combination of different vectors of the same length (the same as your data set in Stata) * __Lists:__ a multidimensional object that can store several objects of different dimension ## R objects \framesubtitle{Vectors} -A vector is an unidimensional object composed by one or more scalars of the same type. +A vector is an uni-dimensional object composed by one or more scalars of the same type. ### Use the following code to create vectors in two different ways ```{r} @@ -363,25 +423,25 @@ v2[4] ## R objects \framesubtitle{Vectors} -To R, each of the columns of `lwh` is a vector. +To R, each of the columns of `whr` is a vector. 
### Calling a vector from a `data.frame` column -We use the `$` to call vector (variables) by their names in a `data.frame` +We use the `$` to call vectors (variables) by their names in a `data.frame` ### Type the following code: ```{r} -# Create a vector with the values of `age_hhh` variable -age_vec <- lwh$age_hhh +# Create a vector with the values of the `year` variable +year_vec <- whr$year -# See the 13th element of the column -lwh$age_hhh[67] +# See the 3 first elements of the year column +whr$year[1:3] ``` ## R objects \framesubtitle{Data Frames} -The `lwh` and `lwh2018` objects are both data frames. You can also construct a new data.frame from scratch by combining vectors. +The `whr` and `whr2015` objects are both data frames. You can also construct a new data frame from scratch by combining vectors with the same number of elements . ### Now, type the following code to create a new data frame ```{r} @@ -397,14 +457,14 @@ Since a data frame has two dimensions, you can use indexing on both: ### Numeric indexing ```{r, eval = F} -# The first column of lwh -lwh[,1] +# The first column of whr +whr[,1] -# The 45th line of lwh -lwh[45,] +# The 45th line of whr +whr[45,] # Or the 45th element of the first line -lwh[45,1] +whr[45,1] ``` @@ -415,8 +475,9 @@ Alternatively, you can use the column names for indexing, which is the same as u ### Names indexing ```{r} -# Or the 45th element of the hh_code column -lwh[45,"hh_code"] +# Or the 22th element of the country column +whr[22,"country"] # The same as whr$country[22] + ``` ## R objects @@ -475,7 +536,7 @@ Create two string vector containing the names of commonly used statistical softw str_vec <- c("R", "Python", "SAS", - "Microsoft Excel", + "Excel", "Stata") ``` Now print them to check them out. @@ -516,7 +577,7 @@ paste(str_vec[1], str_scalar, str_vec[5]) R also has other more complex ways of storing data. 
These are the most used: ### Factors -Factors are numeric categorical values with text label, equivalent to labelled variables in Stata. Turning strings into factors makes it easier to run different analyses on them and also uses less space in the memory, which is why data frames do that by default. +Factors are numeric categorical values with text label, equivalent to labelled variables in Stata. Turning strings into factors makes it easier to run different analyses on them and also uses less space in your memory. ### Booleans Booleans are logical binary variables, accepting either `TRUE` or `FALSE` as values. They are automatically generated when performing logical operations @@ -524,15 +585,12 @@ Booleans are logical binary variables, accepting either `TRUE` or `FALSE` as val ## Advanced types of data \framesubtitle{Factors} -You can see that in `lwh` the `wave`, `treatment_hh`, `treatment_site`, `site_code` and `gender_hhh` are factor variables. You can see in your enviroment panel the type of all your variables, and for factors the number of levels. +In `whr`, we can see that `country` and `region` are factor variables. In your environment panel, there is the information of the type of all variables, and for factors the number of levels. - - - - - - - +\begin{figure} +\centering + \includegraphics[scale=0.46]{img/enviroment_factors.png} +\end{figure} ## Advanced types of data @@ -550,14 +608,20 @@ Unlike Stata, in R ## Advanced types of data \framesubtitle{Booleans} -Boolean data is not normally used directly in data frames, but rather to express the results of a logical condition. +Boolean data is the result of logical conditions + + * Whenever you're using an `if` statement in Stata, you're implicitly using boolean data. + * The difference is that in R, this can be done in 2 steps. 
+ +## Advanced types of data +\framesubtitle{Booleans} ### Exercise 6: Create boolean vector with the condition of annual income below average: ```{r} # Create vector -bool_vec <- (lwh$income_total_win < - mean(lwh$income_total_win)) +bool_vec <- (whr$happy_rank < + mean(whr$happy_rank)) # See the 5 first elements of the vector head(bool_vec) @@ -567,38 +631,40 @@ head(bool_vec) ## Advanced types of data \framesubtitle{Booleans} -Let's use the boolean vector created to add a dummy variable in the `lwh` data set for the same condition. +Let's use the boolean vector created to add a dummy variable in the `whr` data set for the same condition. ### Exercise 6: - 1. Create a column in `lwh` containing zeros and call it `income_low`. You can do this by typing: + 1. Create a column in `whr` containing zeros and call it `rank_low`. You can do this by typing: -```{r, eval = F} -lwh$income_low <- 0 +```{r, eval = T} +whr$rank_low <- 0 ``` 2. Use `bool_vec` to index the lines of the `income_low` column and replace all observations that meet the condition with the value 1. ```{r, eval = F} -lwh$income_low[bool_vec] <- 1 +whr$rank_low[bool_vec] <- 1 ``` ## Advanced types of data \framesubtitle{Booleans} ```{r} -# Create column with zeros -lwh$income_low <- 0 # Replace with 1 those obs that meet the condition -lwh$income_low[bool_vec] <- 1 -``` +whr$rank_low [bool_vec] <- 1 +# is the same as +whr$rank_low[whr$happy_rank < mean(whr$happy_rank)] <- 1 +# This in stata would be +# replace rank_low = 1 if (...) -```{r} -# See the first 5. obs. -head(lwh$income_low) + +# We can use the summary function to get descriptives +summary(whr$rank_low) ``` + # Help, Google and Stackoverflow ## Help, Google and Stackoverflow @@ -623,7 +689,11 @@ help(summary) * For instance, in 2014, Stata had 11 dedicated blogs written by users, while R had 550.[^4] - * The most powerful problem-solving tool in R, however, is Google. 
Searching the issue, you have or the error message displayed usually yields tons of results and will probably lead you to a Stack Overflow page where someone asked the same question and several people gave different answers. + * The most powerful problem-solving tool in R, however, is Google. Searching for the issue you have, or for the error message displayed, usually yields tons of results. + + * Often that means a Stack Overflow page where someone asked the same question and several people gave different answers. Here's a typical example: + https://stackoverflow.com/questions/1660124/how-to-sum-a-variable-by-group + [^4]: Check http://r4stats.com/articles/popularity/ for more. @@ -674,8 +744,7 @@ https://www.r-graph-gallery.com/ # Appendix - -## Syntax +## Appendix - Syntax R's syntax is a bit heavier than Stata's: @@ -692,7 +761,7 @@ Similarly to Stata: * Largely ignores white spaces. -## RStudio interface +## Appendix - RStudio interface ### Script Where you write your code. Just like a do file. @@ -707,9 +776,9 @@ Similarly to Stata: Can display different things, including plots you create, packages loaded and help files. -## Matrices +## Appendix - Matrices -A matrix a bidimensional object composed by one or more vectors of the same type. +A matrix is a bi-dimensional object composed of one or more vectors of the same type. 
### Type the following code to test two different ways of creating matrices ```{r, include = T, results = "hide"} @@ -721,7 +790,7 @@ m2 <- matrix(c(1,1,2,3,5,8), ncol = 2) ``` -## Matrices +## Appendix - Matrices ### Now use the following code to check the elements of these matrices by indexing ```{r, include = T, results = "hide"} @@ -739,7 +808,7 @@ m1[,2] ``` -## Advanced types of data - Factors +## Appendix - Advanced types of data - Factors \framesubtitle{Factors} @@ -757,7 +826,7 @@ levels(fac_vec) = c('One','Two','Three') ``` -## Numbers and integers +## Appendix - Numbers and integers ### Two scalars, one with a round number the other with a fractional part ```{r} @@ -766,7 +835,7 @@ int <- 13 num <- 12.99 ``` -## Numbers and integers +## Appendix - Numbers and integers ### Now we can see the objects classes with the *class()* function and test it with the *is.integer()* and *is.numeric()* functions. ```{r} @@ -781,10 +850,10 @@ is.numeric(int) Did you notice anything strange? That happens because the default way R stores numbers is *numeric*, which is equivalent to *double* in Stata. -## Numbers and integers +## Appendix - Numbers and integers \framesubtitle{Numbers and integers} -We can, however, coerce objects into different classes. We just need to be carefull because the result might not be what we're expecting. +We can, however, coerce objects into different classes. We just need to be careful because the result might not be what we're expecting. 
### Use the *as.integer()* and *round()* functions on the *num* object to see the difference: ```{r} diff --git a/Presentations/Lab 1 - Intro to R.pdf b/Presentations/Lab 1 - Intro to R.pdf deleted file mode 100644 index 3e5ee31..0000000 Binary files a/Presentations/Lab 1 - Intro to R.pdf and /dev/null differ diff --git a/Presentations/Lab 2 - Coding for Reproducible Research.Rmd b/Presentations/Lab 2 - Coding for Reproducible Research.Rmd deleted file mode 100644 index 92c6329..0000000 --- a/Presentations/Lab 2 - Coding for Reproducible Research.Rmd +++ /dev/null @@ -1,574 +0,0 @@ ---- -title: "Coding for Reproducible Research" -subtitle: "Field Coordinator Training - R Track" -date: "June 2018" -author: "Luiza Andrade, Leonardo Viotti & Rob Marty " -output: - beamer_presentation: - #theme: "Pittsburgh" - theme: "Madrid" - colortheme: "whale" - fonttheme: "default" - toc: true - includes: - in_header: header.tex ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -```{r, echo = F, eval=FALSE} -# Load the pacakge than contains ToothGrowth dataset -library(datasets) -``` - -# Introduction - -## Introduction - - \begin{block}{Objective} - Create an R Master Script. 
- \end{block} - - \textbf{Content} - \begin{enumerate} - \item Comments - \item Using R Studio to create a code index - \item Indenting - \item File paths - \item If statements - \item Using functions within functions - \item Packages - \item Loops - \end{enumerate} - -## Introduction - - \begin{itemize} - \item The exercises on this section will help us create a Master R script - \item To do them, go to the \textit{DataWork/Code} folder an open the file called \textit{Lab 2 - Coding for Reproducible Research.R} - \end{itemize} - -## Introduction - - \begin{itemize} - \item Just like in Stata, a Master script must be a map of all data work - \item It should be possible to follow all data work in the data folder, from raw data to analysis output, by following the master script - \item It should also be possible to run all the code for the project by simply running the Master script - \item Finally, in Stata, the Master Script is where you create globals. You can do the same in R by just creating new objects - \end{itemize} - -# Commenting - -## Commenting - - \begin{itemize} - \item Let's take a look at the script we just opened - \item You can see that the first few lines in the script are the header, but they're not commented out - \item In R, errors will not always break your code, so you should still be able to run this script - \item However, not commenting out comments is still bad practice, as it makes the code harder to read - \end{itemize} - -## Commenting - - - * To comment a line, write ``#`` as its first character - * You can also add ``#`` half way through a line to comment whatever comes after it - * In Stata, you can use ``/*`` and ``*/`` to comment part of a line's code. That is not possible in R: whatever comes after ``#`` will be a comment - * To comment a selection of lines, press ``Ctrl + Shift + C`` - -## Commenting - - \begin{block}{Exercise 1} - Use the keyboard shortcut to comment the header of the script. 
- \end{block} - - \begin{block}{Exercise 2} - Use the keyboard shortcut to comment the header of the script again. What happened? - \end{block} - -# Creating a document outline in RStudio - -## Creating a document outline in RStudio - - \begin{itemize} - \item RStudio also allows you to create an interactive index for your scripts - \item To add a section to your code, create a commented line with the title of your section and add at least 4 trailing dashes, pound signs or equal signs after it - \end{itemize} - - \begin{block}{Exercise 3} - Open the script index and make \texttt{PART 0} a section header. Do the same for \texttt{PART 1}. - \end{block} - - \begin{block}{Exercise 4} - Note that once you create a section header, an arrow appears right next to it. Click on the arrows of parts 0 and 1 to see what happens. - \end{block} - - -## Creating a document outline in RStudio - - \begin{itemize} - \item The outline can be accessed by clicking on the button on the top right corner of the script window. 
You can use it to jump from one section to another - \item You can also use the keyboard shortcuts \texttt{Alt + L} (\texttt{Cmd + Option + L} on Mac) and \texttt{Alt + Shift + L} to collapse and expand sections - \end{itemize} - - -# Initial Settings - -## Initial Settings - - \begin{itemize} - \item A Stata do-file typically starts with a few settings: - \end{itemize} - -```{stata, eval = F} - clear - set maxvar 120000 - set more off -``` - -## Initial Settings - - * We don't need to set the memory or the maximum number of variables in R, and ``more`` is automatically selected - * However, if you saved the last RStudio session in ``.Rhistory``, the objects that were in RStudio's memory last time you closed it will still be there whenever you open it again - * Therefore, it's good practice to always clean the memory when starting a new session - * You can see all the objects currently in you memory in the *Environment* pane - -## Initial Settings - - \begin{block}{Exercise 5} - \begin{enumerate} - \item Make sure the \textit{Environment} window is open (it should be empty now) - \item Create an object called \texttt{foo} with any content you pick - \item Type \texttt{ls()} to print the names of the object in memory - \item Type \texttt{rm(foo)} to remove it - \item To remove all objects, use \texttt{rm(list=ls())} - \end{enumerate} - \end{block} - - -# Using packages - -## Using packages - - * Since there is a lot of people developing for R, it can have many different functionalities - * To make it simpler, these functionalities are bundled into packages - * A package is the fundamental unit of shareable code - * It may contain new functions, but also more complex functionalities, such as a Graphic User Interface (GUI) or settings for parallel processing (similar to Stata MP) - * They can be shared through R's official repository - CRAN (10,000+ packages reviewed and tested) and many other online sources - * There are many other online sources such as Github, but 
it's important to be careful, as these probably haven't gone through a review process as rigorous as those in CRAN - -## Using packages - -* To install and use packages you can either do it with the user interface or by the command prompt. -```{r, eval = F} -# Installing a package -install.packages("stargazer", - dependencies = T) -# the dependencies argument also installs all other packages -# that it may depend upon -``` - -* You only have to install a package once, but you have to load it every new session. To load a package type: -```{r, eval = F} -library(stargazer) - -``` - -## Using packages - -Once a package is loaded, you can use its features and functions. Here's a list of some useful and cool packages: - -* Rcmdr - Easy to use GUI -* swirl - An interactive learning environment for R and statistics. -* ggplot2 - beautiful and versatile graphics (the syntax is a pain, though) -* stargazer - awesome latex regression and summary statistics tables -* foreign - reads dtas and other formats from inferior statistical software -* zoo - time series and panel data manipulation useful functions -* data.table - some functions to deal with huge data sets -* sp and rgeos - spatial analysis -* multiwayvcov and sandwich - clustered and robust standard errors -* RODBC, RMySQL, RPostgresSQL, RSQLite - For relational databases and using SQL in R. - -## Using packages - - \begin{block}{Exercise 6} - Install the \texttt{swirl} and \texttt{stargazer} packages, including packages necessary for them to run. - \end{block} - -## Using packages - -```{r, eval = F} - - # Install stargazer and swirl - install.packages(c("stargazer", "swirl"), - dependencies = TRUE) - -``` - -## Using packages - - \begin{block}{Exercise 7} - Call the packages you just installed. Note that the \texttt{library} function only accepts one argument, so you will need to load each of them separately. 
- \end{block} - -## Using packages - \scriptsize - -```{r, eval = T} - - library(stargazer) - library(swirl) - -``` - -# Indenting - -## Indenting - - * Indenting can be pretty different from what it looks like in Stata - * To indent a whole line, you can select that line and press ``Tab`` - * To unindent a whole line, you can select that line and press ``Shift + Tab`` - * However, this will not always work for different parts of a code in the same line - * In R, we typically don't introduce white space manually - * It's rather introduced by RStudio for us - -## Indenting - - \begin{block}{Exercise 8} - To see an example of how indenting works in RStudio, add a line between the two arguments of the \texttt{install.packages} function (the vector of package names and the dependents option). Then add a line between the two package names. Note that RStudio formats the different arguments of the function differently. - \end{block} - -## Indenting - -```{r, eval = F} - # Same code, but this time easier to read - install.packages(c("stargazer", - "swirl"), - dependencies = TRUE) -``` - -# Functions inception - -## Functions inception - - * In R, you can use the output of one function as the input of another, as long as they have the same format - * In fact, that's exactly what we just did when installing the packages - * To see that, select just the first argument of the ``install.packages`` function and press ``Ctrl + Enter`` - * The ``c()`` function, as we know, creates a vector with its arguments - - -```{r, eval = T} - - c("stargazer", "swirl") - -``` - -## Functions inception - - * The resulting vector is used as an input to the ``install.packages`` function - * We could also have stored this vector in the memory as an object and used that object as the input - * In fact, that's exactly what we are going to do next, so the code doesn't get too polluted as we add new packages - -## Functions inception - -```{r, eval = T, echo = F} - packages <- c("stargazer", - 
"swirl") - -``` - -```{r, eval = F} - - # Create packages object - packages <- c("stargazer", - "swirl") - - # Use it as an input to the install.packageS() function - install.packages(packages, - dependencies = TRUE) - -``` - -# Section switches - -## Section switches - - * Now, installing packages can be time-consuming, especially as the number of packages grow, and each package only needs to be installed once - * What can we bring from our Stata Master do-files to avoid installing packages twice? - -## Section switches - - * In Stata, section switches would be saved as locals - * In R, the equivalent to that would be to create a new object - -### Exercise 10 - Create a dummy scalar object called PACKAGES. - - * TIP: Section switches can also be Boolean objects. - -## Section switches - - * Now we need to create an if statement using this switch - * If statements in R look like this: -```{r, eval = F} - - # Turn switch on - PACKAGES <- 1 - - # Install packages - if (PACKAGES == 1) { - install.packages(packages, - dependencies = TRUE) - } - -``` - -## Section switches - - * Possible variations would include -```{r, eval = F} - # Turn switch on - PACKAGES <- TRUE - - # Using a Boolean object - if (PACKAGES == TRUE) { - install.packages(packages, dep = T) - } - - # Which is the same as - if (PACKAGES) { - install.packages(packages, dep = T) - } - -``` - -# File paths - -## File paths - - * The next important part of a Master script are file paths - * We recommend always using **explicit** and **dynamic** file paths - -## File paths - - * Implicit and static file path: - -```{r, eval = F} - # Set working directory - setwd("C:/Users/luiza/Documents/GitHub/R-Training/ - DataWork/DataSets/Final") - - # Load data set - read.csv("lwh_clean.csv", - header = T) -``` - -## File paths - - * Explicit and static file path: - -```{r, eval = F} - # Load data set - read.csv("C:/Users/luiza/Documents/GitHub/R-Training/ - DataWork/DataSets/Final/lwh_clean.csv", - header = T) -``` - 
-## File paths - - * Explicit and dynamic file path: - -```{r, eval = F} - # Define dynamic file path - finalData <- "C:/Users/luiza/Documents/GitHub/R-Training/ - DataWork/DataSets/Final" - - # Load data set - lwh <- read.csv(file.path(finalData,"lwh_clean.csv"), - header = T) - -``` -## File paths - - * File paths in R, as in Stata, are basically just strings - * Note, however, that in R we can only use forward slashes (``/``) to separate folder names - - \begin{block}{Exercise 11} - Let's start by adding the folder path to the training's folder in your computer to the beginning of \texttt{PART 3}. - \end{block} - -## File paths - - * You can set file paths in your master using the ``file.path()`` function - * This function concatenates strings using ``/`` as a separator to create file paths - -## File paths - -```{r, eval = T} - - # Project folder - projectFolder <- - "C:/Users/luiza/Documents/GitHub/R-Training" - - # Data work folder - dataWorkFolder <- file.path(projectFolder,"DataWork") - - # Print data work folder - dataWorkFolder - -``` - -## File paths - - Let's check if that worked, as we will need your Master script to be running smoothly for the other sessions. 
- - \begin{block}{Exercise 12} - \begin{enumerate} - \item Turn off the switch that installs packages in your Master script - \item Run the whole script - \item Type the following code to open a data set using the file paths you just set: - \end{enumerate} - \end{block} - -```{r, eval = F} -# Load data set -lwh <- read.csv(file.path(rawData,"lwh_panel.csv"), - header = T) - - -``` - -# Looping - -## Looping - - * We're almost at the end of this section - * But the DRY rule can still be applied to part of this code - -```{r, eval = F} - # The DRY rule: - DONT REPEAT YOURSELF - -``` - -## Looping - * In Stata, we'd usually use a ``foreach`` loop to go through a list of objects - * The equivalent to that in R would be to write a for loop like this - - -```{r} - # A for loop in R - for (number in c(1.2,2.5)) { - print(round(number)) - } -``` - -## Looping - - * R, however, has a whole function family that allows users to loop through an object in a more efficient way - * They're called ``apply`` and there are many of them, for different use cases. - * For the purpose of this training, we will only use two of them, ``sapply`` and ``apply`` - * If you look for the ``apply`` help file, you can see all of them - -## Looping - - * ``sapply(X, FUN, ...)``: applies a function to all elements of a vector or list and returns the result in a vector. Its arguments are - * **X:** a matrix (or data frame) the function will be applied to - * **FUN:** the function you want to apply - * **...:** possible function options - -```{r} - # A much more elegant for loop in R - sapply(c(1.2,2.5), round) -``` - -## Looping - - \begin{block}{Exercise 13} - Use the \texttt{sapply()} function to apply the \texttt{library()} function to all packages you have selected. - \end{block} - -## Looping - -```{r, eval = F} - # Load all listed packages - sapply(packages, - library, character.only = TRUE) -``` - -## Looping - -A more generic version is the `apply` function. 
- - * ``apply(X, MARGIN, FUN, ...)``: applies a function to all columns or rows of matrix. Its arguments are - * **X:** a matrix (or data frame) the function will be applied to - * **MARGIN:** 1 to apply the function to all rows or 2 to apply the function to all columns - * **FUN:** the function you want to apply - * **...:** possible function options - -## Looping - - -```{r, eval = T} - # Create a matrix - matrix <- matrix(c(1, 24, 9, 6, 9, 4, 2, 74, 2), - nrow = 3) - - # Look at the matrix - matrix -``` - - -## Looping - -```{r, eval = T} - # Row means - apply(matrix, 1, mean) - - # Column means - apply(matrix, 2, mean) -``` - -## That's all, folks - - * Now you have a template master script to use across this training's sessions - * Save the script that you created during this session in the *DataWork* folder. Call it *MASTER.R* - * You can run scripts from the Master script by using the ``source()`` function as you write scripts for future sessions, but we will do that on the next session - -## That's all, folks - - \begin{block}{Homework} - In the next slide here's a list of all the packages we'll need to the next sessions. Installing them might take a while, so paste them to your Master script and run it again before the next session: - \end{block} - -```{r, eval = T} - packages <- c("readstata13","foreign", - "doBy", "broom", - "stargazer", - "ggplot2", "plotly", "ggrepel", - "RColorBrewer", - "sp", "rgdal", "rgeos", "raster", "velox", - "ggmap", "rasterVis", "leaflet") -``` - - ----- - -\huge Thank you! 
- ----- - - \frametitle{Bonus exercise} - - * You can customize your loops in R by defining your own function - * This is done using a function conveniently called ``function()`` - * For example, if instead of just printing a number we want to print it's square, we could create a function that does both: - - -```{r, eval = T} - # A much more elegant for loop in R - sapply(c(1,2), function(x) x^2) -``` - diff --git a/Presentations/Lab 2 - Coding for Reproducible Research.pdf b/Presentations/Lab 2 - Coding for Reproducible Research.pdf deleted file mode 100644 index 8f13464..0000000 Binary files a/Presentations/Lab 2 - Coding for Reproducible Research.pdf and /dev/null differ diff --git a/Presentations/Lab 2 - Descriptive analysis.Rmd b/Presentations/Lab 2 - Descriptive analysis.Rmd new file mode 100644 index 0000000..f1f29db --- /dev/null +++ b/Presentations/Lab 2 - Descriptive analysis.Rmd @@ -0,0 +1,884 @@ +--- +title: "Descriptive analysis" +subtitle: "R for Stata Users" +date: "November-December 2018" +author: "Luiza Andrade, Leonardo Viotti & Rob Marty " +output: + beamer_presentation: + #theme: "Pittsburgh" + theme: "Madrid" + colortheme: "whale" + fonttheme: "default" + #toc: true + includes: + in_header: header.tex +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +```{r, echo = F, eval=T, include=FALSE} +library(stargazer) +library(tidyverse) + +# File paths + +if (Sys.getenv("USERNAME") == "luiza"){ + projectFolder <- "C:/Users/luiza/Documents/GitHub/dime-r-training" + +} + +if (Sys.getenv("USERNAME") == "WB501238"){ + projectFolder <- "C:/Users/WB501238/Documents/GitHub/dime-r-training" + +} + +if (Sys.getenv("USERNAME") == "Leonardo"){ + projectFolder <- "C:/Users/Leonardo/Documents/GitHub/dime-r-training" + +} + +if (Sys.getenv("USERNAME") == "WB519128"){ + projectFolder <- file.path("C:/Users/WB519128/Documents/GitHub/dime-r-training") +} + + # File paths + dataWorkFolder <- file.path(projectFolder,"DataWork") + + Data <- 
file.path(dataWorkFolder,"DataSets") + finalData <- file.path(Data,"Final") + rawOutput <- file.path(dataWorkFolder,"Output","Raw") + + # Load CSV data + whr <- read.csv(file.path(finalData,"whr_panel.csv"), + header = T, + stringsAsFactors = F) + + #whr <- read_csv(file.path(finalData,"whr_panel.csv")) + +``` + +# Introduction + +## Introduction + + * Descriptive statistics are used to represent the basic features of data. When we talk about descriptive analysis, it usually means that we're not making any assumptions, and we're not using probability theory to infer anything beyond the immediate data. + + * This session is mostly focused on how to implement descriptive analysis in R. + + * We will not go in depth into these concepts, but you can find some useful references at the end of this presentation. + +## Introduction + +This session will cover two topics: + + \begin{enumerate} + \item Quick ways to extract summary information from your data. + \item How to use this information to create and export tables. + \end{enumerate} + + +# Getting started + +## Getting started +First, let's load the data that is going to be used in the training. + +### Load the data +```{r, include = T, results = "hide", eval = F} + # Replace with where your data is + FolderPath <- file.path("YOUR FOLDER PATH HERE") + + whr <- read.csv(file.path(FolderPath,"whr_panel.csv"), + header = T, + stringsAsFactors = F) + +``` + +## Getting started + +Before starting, lets install the packages we'll use in this session since it might take a while. + +We'll get into what packages are in a moment. + +### Install today's packages + +```{r, include = T, results = "hide", eval = F} + install.packages(c("stargazer", + "tidyverse", + "openxlsx"), + dependencies = T) + +``` + +# Using packages + +## Using packages + + * Since there is a lot of people developing for R, it can have many different functionalities. + * To make it simpler, these functionalities are bundled into packages. 
+ * A package is just a unit of shareable code. + + +## Using packages + + * It may contain new functions, but also more complex functionalities, such as a Graphic User Interface (GUI) or settings for parallel processing (similar to Stata MP). + * They can be shared through R's official repository - CRAN (13,000+ packages reviewed and tested). + * There are many other online sources such as GitHub, but it's important to be careful, as these probably haven't gone through a review process as rigorous as those in CRAN. + +## Using packages + + +* To install and use packages you can either do it with the user interface or by the command prompt. +```{r, eval = F} +# Installing a package +install.packages("stargazer", + dependencies = T) +# the dependencies argument also installs all other packages +# that it may depend upon to run +``` + + +* You only have to install a package once, but you have to load it every new session. + +## Using packages + +### Exercise 1 + 1. Now load the packages we just installed. Use the `library()` function to do it. + + Note that the `library` function only accepts one argument, so you will need to load each of them separately. + +```{r, eval = T} + + library(stargazer) + library(tidyverse) + library(openxlsx) + +``` + +## Warnings vs Errors + +What if this happens? + +\begin{figure} + \includegraphics[scale=0.5]{img/Warning.png} +\end{figure} + + +## Warnings vs Errors + + +R has two types of error messages, `warnings` and actual `errors`: + + * `Errors` - break your code, i.e., prevent it from running. + * `Warnings` - usually mean that nothing went wrong yet, but you should be careful. + +RStudio's default is to print warning messages, but not stop the code at the lines where they occur. You can configure R to stop at warnings if you want. + + +# Quick summary statistics + + +## Quick summary statistics + + +### `summary(x, digits)` - equivalent to Stata's *summarize*, displays summary statistics. 
Its arguments are: + + * **x:** the object you want to summarize, usually a vector or data frame + * **digits:** the number of decimal digits to be displayed + +\begin{block}{Exercise 1} + Use the \texttt{summary()} function to display summary statistics for the \textit{whr} data frame. +\end{block} + +## Quick summary statistics + +\scriptsize + +```{r, eval = F} + # Summary statistics + summary(whr) +``` + +```{r, eval = T, echo = F} + # Summary statistics + summary(whr[,1:7]) +``` + +## Quick summary statistics + + +### ``table()`` + +Equivalent to `tabulate` in Stata, creates a frequency table. Its main arguments are vectors to be tabulated. + +### Exercise 2 +Use the `table()` function to display frequency tables for: + +1. The variable `year` in the `whr` data frame +2. The variables `region` and `year` in the `whr` data frame, simultaneously + + +## Quick summary statistics +```{r, eval = T} + # Year of data collection + table(whr$year) +``` + +## Quick summary statistics +\scriptsize + + +```{r, eval = T} + # Number of countries per region per year + table(whr$region, whr$year) +``` + + +## Quick summary statistics + +### Bonus Exercise: + Use the `table()` function to display a frequency table for the number of countries above the average happiness per region in 2017. + + 1. Create another data.frame called `whr17` with only 2017 observations + 2. Use the `table()` function to tabulate the `region` variable and a boolean vector. + + - TIP: Using the condition directly in the function or creating a separate vector will yield the exact same results. 
+ +## Quick summary statistics +\scriptsize + + +```{r, eval = T} + # Restrict to only 2017 obs + whr17 <- whr[whr$year == 2017, ] + + # table with a bolean vector + table(whr17$region, + whr17$happy_rank > mean(whr17$happy_rank)) # Who is above average +``` + +# Descriptives tables + +## Descriptives tables + + + We can also use the \texttt{stargazer()} function to quickly display a nice-looking descriptives table. + + \textit{Stargazer} was originally developed to export beautiful regression tables to \LaTeX\ or html, but it also allows you to generate summary statistics. + + +## Descriptives tables + +\begin{block}{Exercise 3 - \texttt{stargazer()} summary statistics table} + Use the \texttt{stargazer()} function to display summary statistics for the variables in the \textit{whr} data frame. +\end{block} + + The \texttt{stargazer()} function accepts \textbf{a lot} of arguments, most of which are beyond the scope of this session. Here are the arguments you'll need for this specific table: + \begin{itemize} + \item \textbf{x:} the object you want to summarize -- in this case a vector or data frame + \item \textbf{type:} the output format -- "text" to just display, "latex" (the default) to save as a \LaTeX\ table, and "html" for, yes, html + \item \textbf{digits:} the number of decimal digits to be displayed + \end{itemize} + +## Descriptives tables +\tiny + +```{r, eval = T} +# A descriptive table with stargazer +stargazer(whr, + digits = 1, + type = "text") + +``` + + + +# Export tables to \LaTeX + + +## Export tables to \LaTeX\ + + +To export the table to \LaTeX\, we will use a couple of additional arguments of the `stargazer()` function: + + \begin{itemize} + \item \textbf{out:} where to save the table, i.e., the file path, including the file name + \item \textbf{covariate.labels:} a vector of variable labels + \end{itemize} + +## Export tables to \LaTeX\ + +But first, let's pick a few variables of interest in the `whr` data set so the table fits in these 
slides. + +### Exercise 3 + + 1. Create a vector called ``covariates`` containing the string names of the variables you want to keep: `happy_score`, `gdp_pc`, `freedom`, and `trust_gov_corr`. + 2. Use this vector to subset the whr data-set to contain only these variables. Call the new data frame `whr_simp`. + + +## Export tables to \LaTeX\ +\scriptsize + + +```{r} +# Vector with covariates to be kept +covariates <- c("happy_score", + "gdp_pc", + "freedom", + "trust_gov_corr") + +# subset whr +whr_simp <- whr[, covariates] + +``` + + +## Export tables to \LaTeX\ + +### Exercise 4 +Now use the stargazer function to export the `whr_simp`: + + 1. Create a vector `cov_labels` containing labels for the `happy_score`, `gdp_pc`, `freedom` and `trust_gov_corr` variables. + 2. Set ``whr_simp`` as the `x` argument this time + 3. Set the `covariate.labels` argument as the vector you just created + +## Export tables to \LaTeX\ + +\tiny + +``` {r, eval = T, results = "hide"} + + # Set labels + cov_labels <- c("Happy score", "GDP per capita", + "Freedom", "Trust in gornment and currption") + + # Save table to latex + stargazer(whr_simp, + # Formatting + covariate.labels = cov_labels, + summary.stat = c("n", "mean", "sd", "min", "max"), # Choose which stats to compute + digits = 2, + # You can directly export with the out argument + out = file.path(rawOutput,"desc_table.tex")) + +``` + +\input{"../DataWork/Output/Raw/desc_table.tex"} + +# Descriptives tables - Create tables from scratch + +## Descriptives tables - Create tables from scratch + +In R, it is relatively straightforward to construct any table you can think of by manipulating objects. + +There are multiple ways to do this, but we will construct a simple table using two functions: + + * `aggregate()` - Similar to collapse in Stata, it can compute statistics of a variable based on the values of other variables. + * `spread()` - Reshapes data sets from long to wide. 
[^1] + +[^1]: `gather()` reshapes data sets from wide to long; both are from `tidyverse`. There are other ways to reshape data, but these are becoming the standard in R. + +## Descriptives tables - Create tables from scratch + +### `aggregate(formula, data, FUN, ...)`: + + * **formula:** a formula, such as y ~ x or cbind(y1, y2) ~ x1 + x2, where the y variables are numeric data to be split into groups according to the grouping x variables + * **data:** a data frame (or list) from which the variables in formula should be taken + * **FUN:** a function to compute statistics + * **...:** further arguments passed to or used by FUN + +## Descriptives tables - Create tables from scratch + +### Exercise 5 + Use the `aggregate()` function to create a data frame called `happy_table` with the mean of `happy_score` per year and region. + +```{r, eval = T, results = "hide"} +# Aggregate happy_score by year and region +happy_table <- + aggregate(happy_score ~ year + region, + data = whr, + FUN = mean) +``` + + +For comparison, here's how you'd do it in Stata: +`collapse (mean) happy_score, by(year region)` + +## Descriptives tables - Create tables from scratch +\scriptsize + +```{r, eval = F} +print(happy_table) + +``` + + +```{r, echo= F} +print(happy_table[1:19,]) + +``` + +## Descriptives tables - Create tables from scratch + +`spread(data, key, value)`: + + * __data__: a data frame + * __key__: the variables that identify the group in the wide data set + * __value__: the variable in long format that has multiple records from the same group or individual + +## Descriptives tables - Create tables from scratch + +### Exercise 6 + Use the `spread` function to make the `happy_table` data frame wide in the `year` variable. 
+ + +```{r, eval = T, results = "hide"} +# Reshape into wide on year +happy_table <- + spread(happy_table, + key = year, + value = happy_score) +``` + +For comparison, here's how you'd do it in Stata: +`reshape wide happy_score, i(region) j(year)` + + +## Descriptives tables - Create tables from scratch +\scriptsize + +```{r} +print(happy_table) + +``` + +## Descriptives tables - Create tables from scratch + +With a data frame as input, `stargazer` by default tries to summarize it. So, to export this table we must specify one additional argument: `summary = F`. + +### Exercise 7 +Print the ``happy_table`` table you created in exercise 6 using ``stargazer``. If you want, you can also save it using the ``out`` option. + + +## Descriptives tables - Create tables from scratch +\scriptsize + +```{r, eval = F} + +# Create table +stargazer(happy_table, + summary = F, + # Exporting + out = file.path(...), + # Formatting: + title = "Happy table", + digits = 1, + rownames = F) +``` + + +## Descriptives tables - Create tables from scratch +\scriptsize + +```{r, echo = F, results='asis'} + +# Create table +stargazer(happy_table, + summary = F, + header = F, + title = "Happy table", + digits = 1, + rownames = F) +``` + + +## Descriptives tables - Create tables from scratch +\framesubtitle{Challenge exercise} + +Ok, but what if we want to create something very specific, different from the output of those two functions? 
Something like this: + +## Descriptives tables - Create tables from scratch +\framesubtitle{Challenge exercise} +\scriptsize + + +```{r, echo = F} +#### Creacte duplicated dfs with a stat identifier +x1 <- whr +x1$stat <- "Mean" + +x2 <- whr +x2$stat <- "N" + +# Combine them +x12 <- rbind(x1,x2) + +#### Collapse also by that stat id to have duplicated rows +happy_table2 <- + aggregate(happy_score ~ stat + region + year, + data = x12, + FUN = mean) + +happy_table2 <- + rename(happy_table2, value = happy_score) + +#### Create a frequencies df + +freqdf <- + as.data.frame(table(whr$region, whr$year)) + +#### Replace values for ne N rows +happy_table2$value[happy_table2$stat == "N"] <- freqdf$Freq + +#### Reshape + +# Re order columns so the reshape will give us the righ order +happy_table2 <- + select(happy_table2, region, year, stat, value) + +# Reshape +ht_wd <- + spread(happy_table2, + key = year, + value = value) + + +#### Cosmetics + +# Remove duplicated region names +ht_wd$region[as.integer(row.names(ht_wd)) %% 2 == 0] <- "" + +# Rename columns +names(ht_wd) <- c("Region", + "", + "2015", + "2016", + "2017") + +``` + + + +```{r, echo = F, results='asis'} + +stargazer(ht_wd, + font.size = "tiny", + title = "Happiness score by world region", + column.labels = c("foo", "bar", "bar", "Good", "Better"), + summary = F, + rownames = F, + header = F) + +``` + + +## Descriptives tables - Create tables from scratch +\framesubtitle{Challenge exercise} + +### Exercise 8: Try to replicate the table in the previous slide + +There are multiple ways to do this. Here are two painful but straightforward approaches that you get extra points if you avoid: + + 1. Write string objects with latex code and combine them. + 2. Appending vectors with the desired stats for each region. + +Here are a few tips if you choose to use `aggregate()` and `spread()`: + + 1. When using `aggregate`, the order of the right-hand-side variables affects the order of the columns. + 2. 
The order of the columns affects the order of observations after you reshape. + +# Export tables to Excel + +## Export tables to Excel + + +There are several ways to export R objects to Excel. Here, we will use the `write.xlsx()` function of the `openxlsx` package. + + It takes a matrix or data frame object as input and saves it as a `.xlsx` file. + +`write.xlsx()` is one of the most common functions, but there are many other functions that allow you to export formatted tables to Microsoft Excel, Word or PowerPoint. Here are some examples: + + * ReporteRs + * Flextable + * r2excel (only available on GitHub). + +## Export tables to Excel + + `write.xlsx(x, file, row.names = TRUE, col.names ...)` + + * __x__: the object to be written + * __file__: where to save the table, i.e., the file path including the file name + * __row.names__: a logical value indicating whether the row names of x are to be written along with x + +## Export tables to Excel + +### Exercise 9 + +Use the `write.xlsx()` function to save the `happy_table` table you created in Exercise 6 into an xlsx file. + +1. Set `x` argument as *happy_table*. +2. Set `file` as the folder path to your output folder including a name for the file plus ".xlsx" + + TIP: + + * Use the help function to check syntax if needed + +## Export tables to Excel +\scriptsize + +```{r, include = T, eval = F} +write.xlsx(happy_table, + file = file.path(rawOutput, "happy_table.xlsx")) + +``` + + +\begin{figure}[H] + \centering + \includegraphics[scale=.7]{img/happy_table_xlsx.png} +\end{figure} + + +# Export regression table + +## Export regression table + +\begin{block}{Warning:} +This is a session on \textbf{descriptive} analysis, so regressions are beyond its scope. 
+\end{block} + +But since you'll probably ask, here's how you run a regression and how you export a very simple regression table to \LaTeX\ using `stargazer` and the `iris`[^2] data-set: + +```{r} +# Run a Regression +reg1 <- lm(Sepal.Length ~ Sepal.Width, + data = iris) + +``` + +[^2]: `iris` is a built in data set in R with some morphology information for the flowers of 3 iris species. + +## Export regression table +\scriptsize + + +```{r, eval = F} +# Export a regression table + +depvar_label <- "Sepal Length" +covar_labels <- "Sepal Width" + +stargazer(reg1, + title = "Regression table", + dep.var.labels = depvar_label, + covariate.labels = covar_labels, + digits = 2, + # out = "..." + header = F) + +``` + + + +## Export regression table +\tiny + +```{r, results ='asis', echo = F} +# Export a regression table +depvar_label <- "Sepal Length" +covar_labels <- "Sepal Width" + +stargazer(reg1, + title = "Regression table", + dep.var.labels = depvar_label, + covariate.labels = covar_labels, + digits = 2, + header = F) + +``` + + + +## Export regression table +\scriptsize + +```{r} + +# Regression 1 +reg1 <- lm(Sepal.Length ~ Sepal.Width, + data = iris) + +# Reg with two indep vars +reg2 <- lm(Sepal.Length ~ Sepal.Width + Petal.Length, + data = iris) + +# Reg with two indep vars and species FE +reg3 <- lm(Sepal.Length ~ Sepal.Width + Petal.Length + factor(Species), + data = iris) + +``` + +## Export regression table +\scriptsize + +```{r, eval = F} +depvar_label <- "Sepal Length" +covar_labels <- c("Sepal Width", + "Petal Length") +#Table +stargazer(reg1, + reg2, + reg3, + font.size = "tiny", + title = "Regression table", + keep = c("Sepal.Width", "Petal.Length"), + dep.var.labels = depvar_label, + covariate.labels = covar_labels, + add.lines = list(c("Species FE", "No", "No", "Yes")), + omit.stat = c("ser"), + digits = 2, + header = F) +``` + +## Export regression table +\scriptsize + +```{r, results ='asis', echo = F} +depvar_label <- "Sepal Length" +covar_labels 
<- c("Sepal Width", + "Petal Length") +#Table +stargazer(reg1, + reg2, + reg3, + font.size = "tiny", + title = "Regression table", + keep = c("Sepal.Width", "Petal.Length"), + dep.var.labels = depvar_label, + covariate.labels = covar_labels, + add.lines = list(c("Species FE", "No", "No", "Yes")), + omit.stat = c("ser"), + digits = 2, + header = F) +``` + +# References and recommendations + +## References and recommendations + + + * Johns Hopkins Exploratory Data Analysis at Coursera: + https://www.coursera.org/learn/exploratory-data-analysis + + * Udacity's Data Analysis with R: + https://www.udacity.com/course/data-analysis-with-r--ud651 + + * Jake Russ stargazer cheat sheet: + https://www.jakeruss.com/cheatsheets/stargazer/ + +## References and recommendations + + Since we talked about \LaTeX\ so much... + + * DIME \LaTeX\ templates and trainings: + https://github.com/worldbank/DIME-LaTeX-Templates + + * All you need to know about \LaTeX\: + https://en.wikibooks.org/wiki/LaTeX + + +# Appendix + +## Appendix - Packages + +Once a package is loaded, you can use its features and functions. Here's a list of some useful and cool packages: + +* swirl - An interactive learning environment for R and statistics. +* tidyverse - a collection of R packages designed for data science. + + ggplot2 - beautiful and versatile graphics. The syntax is a bit of a pain, but it can fulfill all of your graphics dreams. +* Rcmdr - Easy to use GUI. +* stargazer - awesome latex regression and summary statistics tables. +* foreign - reads dtas and other formats from inferior statistical software. +* sp, sf, rgeos, raster, rgdal - spatial analysis. +* RODBC, RMySQL, RPostgreSQL, RSQLite - For relational databases and using SQL in R. + +## Appendix - Comments and shortcuts + + * There is only one character to comment out code in R, `#`. 
+ * It can be used either by itself or after a command: +```{r} +# This is a comment +foo <- 42 # This is also a comment + +``` + + * `Ctrl + Shift + c` is a shortcut to comment out a larger chunk of code. Similar to `/*` and `*/` in Stata. + + * `Ctrl + Enter` is a shortcut to run the currently selected code. + +## Appendix - Save your data to .csv + +To export our data in .csv format we can use the `write.csv()` function. There are other ways, but this is often the most straightforward. + +Here's the basic syntax: + +`write.csv(x, file = "", sep = ",", row.names = TRUE)` + + * __x__: the object to be written + * __file__: where to save the table, i.e., the file path including the file name + * __sep__: the field separator of the csv; `write.csv()` always uses a comma (Excel's default) and ignores attempts to change it, so use `write.table()` if you need a different separator + * __row.names__: either a logical value indicating whether the row names of x are to be written along with x, or a character vector of row names to be written + + +## Appendix - Save your data to .csv + +You can write the following code: + +```{r, eval = F} + write.csv(whr, + file = file.path(...,"whr.csv"), + row.names = F) + +``` + +It is important to specify the `row.names` as `FALSE` since the function default is `TRUE`. There are situations when saving row names might make sense, but normally that's not the case for data.frames. + +## Appendix - Formulas + +Formulas are a way of describing a relationship between variables or objects. They work as inputs for several functions, notably regression functions. + + +### We can create formulas by using the formula function +```{r} +# or Formula function yield same results +formula1 <- formula(y ~ x1 + x2) +formula1 +``` + + + +## Appendix - Formulas + +The most basic structure of a formula is actually just the tilde symbol ~ and at least one right-hand variable. 
+ +### You can also convert strings to create formulas +```{r} +# or Formula function yield same results +formula2 <- as.formula("~ x1") +formula2 +``` + +## Appendix - Formulas + +Note that values assigned to the symbols in the formula are not accessed when the formula is created. + +Alternatively, if you write an expression containing a tilde, R already understands it as a formula. + +### Just using the tilde +```{r} + +formula3 <- y ~ x1 + z1 +formula3 +``` + + diff --git a/Presentations/Lab 3 - Data Processing.Rmd b/Presentations/Lab 3 - Data Processing.Rmd deleted file mode 100644 index ee72e73..0000000 --- a/Presentations/Lab 3 - Data Processing.Rmd +++ /dev/null @@ -1,723 +0,0 @@ ---- -title: "Data Processing" -subtitle: "Field Coordinator Training" -date: "June 2018" -author: "Luiza Andrade, Leonardo Viotti & Rob Marty " -output: - beamer_presentation: - #theme: "Pittsburgh" - theme: "Madrid" - colortheme: "whale" - fonttheme: "default" - toc: true - includes: - in_header: header.tex ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` -```{r, echo = F, eval=T} -# Load the pacakge than contains ToothGrowth dataset -library(datasets) - -# File paths - -if (Sys.getenv("USERNAME") == "luiza"){ - projectFolder <- "C:/Users/luiza/Documents/GitHub/R-Training" - -} - -if (Sys.getenv("USERNAME") == "Leonardo"){ - projectFolder <- "C:/Users/Leonardo/Documents/GitHub/R-Training" - -} - -if (Sys.getenv("USERNAME") == "WB519128"){ - projectFolder <- file.path("C:/Users/WB519128/Documents/GitHub/R-Training") -} - -library(readstata13) -# File paths -dataWorkFolder <- file.path(projectFolder,"DataWork") - -Data <- file.path(dataWorkFolder,"DataSets") -finalData <- file.path(Data,"Final") -rawData <- file.path(Data,"Raw") - -``` - -# Introduction - -## Introduction - -* In this session, you'll be introduced to some basic concepts of data cleaning in R. 
The contents covered are: - * Importing data - * Exploring a data set - * Uniquely and fully indentifiable ID variables - * Creating new variables - * Dropping variables - * Subsetting a data set - * Dealing with factor variables - * Treating outliers -* There are many other tasks that we usually perform as part of data cleaning that are beyond the scope of this session - -## Introduction - -Before we start, let's make sure we're all set: - - \begin{enumerate} -\item Make sure the packages \texttt{readstata13} and \texttt{dplyr} are listed in the \texttt{packages} vector of your Master script. If they are not installed (they should be!), install them. -\item If you haven't yet in this session, run the Master script to load all the necessary packages and set file paths. -\item Remeber to disable the \texttt{PACKAGES} switch in the Master if you already installed them. This will save you a lot of time. -\item Open the script called \texttt{Lab 3} in DataWork > Code. We've provided you with some code to build upon that will save you some time during this session. 
-\end{enumerate} - -## Introduction - -Here's a shortcut if you missed the last session: - -\scriptsize - -```{r, eval= F} - # Install packages - install.packages(c("readstata13","dplyr"), - dependencies = TRUE) - - # Load packages - library(dplyr) - library(readstata13) - - # Set folder paths - projectFolder <- "YOUR/FOLDER/PATH" - finalData <- file.path(projectFolder, "DataWork", "DataSets", "Final") - rawlData <- file.path(projectFolder, "DataWork", "DataSets", "Raw") - -``` - -# Loading a data set - -## Loading a data set from CSV - -* In R, we usually save data in CSV format -* CSV files can be a lot lighter than binary files -* You can do version control of CSV files in .git -* On the other hand, the data they store is in a much simpler format, and we'll see some shortcomings of that soon -* To load a data set, we use the ``read.csv()`` function - -## Loading a data set from CSV - -### ``read.csv(file)`` - - * **file**: is the path to the file you want to open, including it's name and format (``.csv``) - -### Exercise 1 -Load the ``lwh_panel.csv`` data set from DataWork > DataSets > Raw. Create an object called ``panel`` with this data set. - - * TIP: use the ``file.path()`` function and the ``rawData`` object created in the master to simplify the folder path. - - -## Loading a data set from CSV - -```{r} - # Load a CSV data set - panel <- read.csv(file.path(rawData, "lwh_panel.csv")) -``` - -If you look at the ``Environment`` pane, you'll be able to see this data set. Click on the blue arrow to see the list of variables. 
- - -\begin{figure} -\centering - \includegraphics[scale=0.4]{img/panel.png} -\end{figure} - - -## Loading a data set from CSV - -* Note that R reads string variables as factors as default -* This format saves memory, but can be tricky if you actually want to use the variables as strings (which is rarely the case) -* You can specify the option ``stringsAsFactors = FALSE`` to prevent R from turning strings into factors -* You can also simply recast any variables you want to be strings (as they'll probably be just a few) using the ``as.character()`` function - -## Loading a Stata data set - -* You can also load a ``.dta`` file, i.e., a Stata data set, using the ``read.dta13()`` function. -* This function takes exactly the same argument as the ``read.csv`` function - -### Exercise 2 -Use the function ``read.dta13()`` to load the ``endline_data_raw.dta`` data set from DataWork > DataSets > Raw. Create an object called ``endline`` with this data set. - - -## Loading a Stata data set -\scriptsize - -```{r} - # Load the raw data from LWH endline - endline <- read.dta13(file.path(rawData, - "endline_data_raw.dta")) -``` - -## Loading a Stata data set - -* In Stata, you can have different levels of a labelled variable with the same value label. 
That's not true for R factors, which is why all those warnings were created -* If you go to the ``Environment`` pane and click on the blue arrow, you'll see that string variables were imported as strings, and only labelled values were imported as factors -* If you scroll down on the variables list, you can also find some metadata such as variable labels, value labels and Stata formats saved as variables' attributes - -# Exploring a data set - -## Exploring a data set - -Some useful functions: - -* **``class()``:** reports object type or type of data stored -* **``dim()``:** reports the size of each one of an object's dimension -* **``names()``:** returns the variable names of a data set -* **``str()``:** general information on a R object, similar to ``codebook`` in Stata - -## Exploring a data set - -### Exercise 3 -Use some of the functions listed in the previous slides to explor the ``endline`` and ``panel`` objects. - -* TIP: all functions take a single argument, which is the object to be described. - -## Exploring a data set -\scriptsize - -```{r} - # See objects' formats - class(endline) - class(panel) - - # How many observations and variables? 
- dim(endline) - dim(panel) - -``` - -## Exploring a data set -\scriptsize -```{r, eval = F} - # Variable names - names(endline) -``` - -```{r, eval = T, echo = F} - # Variable names - names(endline)[1:12] -``` - -```{r, eval = F} - str(panel) -``` -```{r, eval = T, echo = F} - # Data set structure - str(panel[, 1:5]) - - # Variable structure - str(panel$treatment_hh) -``` - -# ID variables - -## ID variables - -Desired properties of an ID variable: *uniquely and fully identifying* - -* An ID variable cannot have duplicates -* An ID variable may never be missing -* The ID variable must be constant across a project -* The ID variable must be anonymous - -## ID variables - -Some useful functions to identify an ID variable in R - -* **unique():** displays unique occurrences in an object -* **duplicated():** returns a boolean vector showing which observations of the object are duplicated -* **length():** length of an object -* **is.na():** returns a boolean vector showing which observations of the object have missing values - -## ID variables - -### Exercise 4 -Use one of the functions in the previous slide to tell if ``hh_code`` is a uniquely and fully identifying variables for the ``panel`` data set. - -* TIP: All these functions take a single argument, which is the object we're testing. - -## ID variables - - * Uniquely identifiying? - - \scriptsize -```{r, eval = F} - # See all unique values of hh_code - duplicated(panel$hh_code) -``` - -```{r, eval = T, echo = F} - # See all unique values of hh_code - duplicated(panel$hh_code)[1:20] -``` - -```{r} - # Count duplicates - sum(duplicated(panel$hh_code)) -``` - -## ID variables - - * Fully identifiying? 
- - \scriptsize -```{r, eval = F} - # See all unique values of hh_code - is.na(panel$hh_code) -``` - -```{r, eval = T, echo = F} - # See all unique values of hh_code - is.na(panel$hh_code)[1:20] -``` - -```{r} - # Count NAs - sum(is.na(panel$hh_code)) -``` - - -## ID variables - -Ok, that was a tricky question, as we already knew this is a panel data set. So let's try with what is a more credible ID variable - -### Exercise 5 -Count the number of missing observations and duplicates values in the variables ``hh_code`` and ``year`` in the ``panel`` data set. - -* TIP: you can use ``panel[,c("hh_code","year")]`` as the function's argument to test both variables at the same time. Note that this is still a single argument, but this time it's a data frame with two columns. - -## ID variables - -\scriptsize - -```{r} -# Uniquely identifying? -sum(duplicated(panel[,c("hh_code","year")])) -# Fully identifiying? -sum(is.na(panel$year)) -sum(is.na(panel[,c("hh_code","year")])) -``` - -## ID variables - -### Exercise 6 -Create new variable called ``id`` in the ``panel`` data set containing a uniquely and fully identifiable ID. - - * TIP: To create a new variable in a data set, we simply assign a value to a column that doesn't year exist. You can make the ID unique by concatenating the year and the household ID. 
- -## ID variables - -\scriptsize -```{r} - # Create panel ID - panel$id <- (panel$hh_code * 10000) + panel$year - - # Check properties - sum(duplicated(panel$id)) - sum(is.na(panel$id)) - - # Here's a shortcut similar to the isid variable in Stata - length(unique(panel$id, na.rm = TRUE)) == length(panel$id) -``` - -# Subsetting - -## Subsetting - -* The ``panel`` has a lot more variables than we'll actually use for analysis -* Let's construct a new data set with only the variables we'll you -* For starters, we'll subset the ``panel`` data set and keep only some variables of interest -* These variables are already listed in the script we gave you -* But first let's take a look at how to find variables to select in a data set - -## Subsetting - -The ``grep`` function is similar to the wildcard (``*``) in Stata: it looks for all elements of an object containing a certain pattern of strings. - -``grep(pattern, x)``: finds the elements of ``x`` that contain the pattern in ``pattern`` and returns a vector of indexes with their position - -* **`pattern`:** the string expression you want to look for -* **`x`:** a vector of strings - -## Subsetting - -### Exercise 7 -Create a vector called ``plot_area_vars`` containing the names of the variables with winsorized plot areas in seasons A, B and C: - -1. Use the ``grep()`` function to display the position index of all the variables in the ``panel`` data set whose name contains the expression ``area``. - -2. Use the resulting vector from the ``grep()`` function to subset the vector of all variable names in the ``panel`` data frame. - -3. Create the ``plot_area_vars`` vector containing only the names of the winsorized plot area variables - -* TIP: these variables' names start with "w_". 
- -## Subsetting -```{r, eval =F} -# Get plot area variables indexes -grep("area", names(panel)) - -# Get plot area variables names -names(panel)[grep("area", names(panel))] - -# These are the ones we want: -plot_area_vars <- c("w_area_plots_a", - "w_area_plots_b", - "w_area_plots_c") - -``` - - -## Subsetting - -\scriptsize - -* In the previous sessions, we talked about indexing and how we can use it to select specific rows and columns of objects -* Here's a quick recall: - -```{r, eval = F} - ## Select a column (variable) in a data frame: - # First column - panel[, 1] - - # Select by name - panel[, "hh_code"] - panel$hh_code - - ## Select elements in a vector - - # Select one element in a vector - panel$hh_code[2] - - # Select multiple elements in a vector - panel$hh_code[c(3,6,89)] - - # Which is the same as - panel[c(3,6,89), "hh_code"] -``` - -## Subsetting - -* Now, let's use this information to subset our ``panel`` data frame and keep only the variables we have already listed - -### Exercise 8 -Create a new object, called ``lwh``, containing only the variables we selected from the ``panel`` data set. 
- - -## Subsetting -\tiny -```{r} - # Select variables - id_vars <- c("hh_code", "wave", "year", "treatment_hh", - "treatment_site", "site_code") - demographic_vars <- c("gender_hhh", "age_hhh", "num_dependents", "read_and_write") - yield_vars <- c("w_gross_yield_a", "w_gross_yield_b") - food_vars <- c("expend_food_yearly", "expend_food_lastweek", "wdds_score") - income_vars <- names(panel)[28:37] - plot_area_vars <- c("w_area_plots_a", "w_area_plots_b", "w_area_plots_c") - - # Subset data - lwh <- panel[, c(id_vars, - demographic_vars, - yield_vars, - food_vars, - income_vars, - plot_area_vars)] - - # Check result - names(lwh) -``` - -# Factor variables - -## Creating a factor - -Let's take a look at the variable ``gender_hhh`` in the ``lwh`` data set: - -```{r} - # Tabulate gender of household head - table(panel$gender_hhh) -``` - -This is supposed to be categorical variable, but for now it's just numeric. - -## Creating a factor - -To create a factor variable, we use the ``factor`` function: - -``factor(x, levels, labels)`` : turns numeric or string vector ``x`` into a factor vector -* **``x``:** the vector you want to turn into a factor -* **``levels``:** a vector containing the possible values of ``x`` -* **``labels``:** a vector of strings containing the labels you want to apply to your factor variable - -### Exercise 9 -Replace the variable ``gender_hhh`` in the ``lwh`` data set by a factor. Label the factor so that 0 is "Female" and 1 is "Male". - -* TIP: the first string in the ``labels`` argument will be applied to the first value in the ``levels`` argument, the second to the second and so on. 
- - -## Creating a factor -\scriptsize -```{r} - # The numeric variable - table(lwh$gender_hhh) - - # Turn numeric variable into factor - lwh$gender_hhh <- factor(lwh$gender_hhh, - levels = c(0, 1), - labels = c("Female", "Male")) - - # The factor variable - table(lwh$gender_hhh) -``` - -## Exploring factors - -Here are a few useful functions to explore factors: - - * **``str()``:** we've already seen this function. When using it with factors is that is shows the number and some variable labels - * **``levels()``:** displays the different values of a factor variable - * **``table()``:** displays a frequency table per level of a factor variable - -### Exercise 10 - -1. Use the functions listed above to explore the ``wave`` variable in the ``lwh`` data set. -2. (challenge) Use the ``table()`` function to see the year of each wave. Use the help if necessary. - -## Exploring factors -\tiny -```{r} - # First look at the variable - str(lwh$save) - - # List all levels - levels(lwh$wave) - - # Observations per level - table(lwh$wave) - - # Year of each wave - table(lwh$wave, lwh$year) -``` - -## Ordering factors - -* The ``wave`` variable's levels are not ordered correctly -* That's because R creates the number underlying a factor in alphabetical order -* However, that can be misleading, specially if we're using this variable for tables and graphs -* If we want the levels of a factor to be in an specific order, we need to set the argument ``ordered`` of the ``factor()`` function to ``TRUE`` -* This will tell R that the order of the levels is exactly the same as the one specified in the ``levels`` argument - - -## Ordering factors - -### Exercise 11 -Make the ``wave`` variable in the ``lwh`` dataset an ordered factor, ordering them by the year in which they happened - -## Ordering factors - -\scriptsize - -```{r} - # Order variable - panel$wave <- factor(panel$wave, - levels = c("Baseline", "FUP1&2", "FUP3", "FUP4", "Endline"), - ordered = T) - - # Check new order - 
levels(panel$wave) - table(panel$wave, panel$year) -``` - - -# Aggregating variables - -## Aggregate variables - -* For our analysis in the next sessions, we will not use the disaggregated income variables -* We'll rather be interested in the total income of a household -* To calculate the total income, we'll use the ``rowSums()`` function, equivalent to ``rowtotal`` in Stata -* ``rowSums()`` is part of a function family that allows you to aggregate row and columns of a data set -* They all have the same syntax - -## Aggregate variables - -``rowSums(x, na.rm = FALSE)``: sums selected variables in all rows of a data set - -* **``x``:** the two-dimensional object whose columns you want to add -* **``na.rm``:** by default, when any of the values you're trying to add are missing, the result will be a missing value. Make this argument ``TRUE`` for R to treat missing values as zero in the sum - - -## Aggregate variables - -### Exercise 12 -Create a variable called ``income_total`` in the ``lwh`` data set containing the sum of all the income variables you listed in the ``income_vars`` vector. - -## Aggregate variables - -\scriptsize - -### Exercise 13 -Create a variable called ``income_total`` in the ``lwh`` data set containing the sum of all the income variables you listed in the ``income_vars`` vector. - -```{r} -# List variables -income_vars - -# Add columns -lwh$income_total <- - rowSums(lwh[, income_vars], - na.rm = TRUE) -``` - -## Dropping variables - -Now, let's remove the disaggregated income variables from data set, since we're using them anymore. - -The easiest way to remove variables from a data set is to make those variables ``NULL``. Like this: - -```{r, eval = F} -dataset$variable <- NULL -``` - -To do this for a series of variables at the same time, you can select them by their names. 
Like this: -```{r, eval = F} -dataset[, c("var1", "var2", "var3")] <- NULL -``` - -## Dropping variables - -### Exercise 14 -Remove all of the raw income variables listed in the ``income_vars`` vector from the ``lwh`` data data set. - -## Dropping variables - -### Exercise 15 -Remove all of the raw income variables listed in the ``income_vars`` vector from the ``lwh`` data data set. - -```{r} - # Remove income variables - lwh[, income_vars] <- NULL -``` - -# Treating outliers - -## Treating outliers - -* We all know outliers can bias analysis results -* But most importantly, they make our plots ugly and out of scale -* There are different ways to treat outliers and there's not a single best method for doing it -* In the end, this is a research question and you should always discuss it with your PI before deciding what to do -* The one thing you should never do is drop a whole observation from your data set because of an outlier in a single variable -* There are a lot of - -## Winsorizing - -To winsorize a variable means to replace all observations with value above (or below) a given percentile by the value of that percentile, therefore capping the distribution of the resulting variable. - -As we've seen in session 2, you can create functions in R using a function called ``function()``. In the next slide, we'll give you some code to create a function that - -1. Calculates the value of the 90th percentile of its argument -2. Replaces all values above this percentile in its argument by the 90th percentile -3. Returns the resulting variable - - -## Winsorizing - -```{r} - # Create a function to winsorize at the 90th percentile - winsor <- function(x) { - x[x > quantile(x, 0.9, na.rm = T)] <- - quantile(x, 0.9, na.rm = T) - return(x) - } - - # Create winsorized income - lwh$income_total_win <- winsor(lwh$income_total) -``` - -## Trimming - -To trim a variable means to replace all observation above a certain value (usually a given percentile) with missing values. 
It's particularly useful to create non-distorted graphs. - -### Exercise 16 -Edit the ``winsor()`` function created earlier to create a new function that trims observations: - -1. Instead of replacing the values with the 90th quantile, replace it with "NA" -2. Call this function ``trim`` -3. Use this function to trim the ``income_total`` variable in the ``lwh`` data set and create a new variable called ``income_total_trim`` - -## Trimming - -```{r} - - # Create the trim function - trim <- function(x) { - x[x > quantile(x, 0.9, na.rm = T)] <- NA - return(x) - } - - # Trim income - lwh$income_total_trim <- trim(lwh$income_total) - -``` - -## Treating outliers - -```{r} - # Compare variables - summary(lwh$income_total) - summary(lwh$income_total_win) - summary(lwh$income_total_trim) -``` - -# Saving a data set - -## Saving a data set - -* As mentioned before, R data sets are usually save as CSVs -* To save a data set, we use the ``write.csv()`` function: - -### ``write.csv(x, file, row.names = TRUE)`` -* **``x``:** the object (usually a data frame) you want to export to CSV -* **``file``:** the file path to where you want to save it, including the file name and the format (".csv") -* **``row.names``**: by default, R adds a column to the CSV file with the names (or numbers) of the rows in the data frame. Set it to ``FALSE`` if you don't want that column to be exported - -## Saving a data set - -### Exercise 17 -Save the ``lwh`` data set to DataWork > DataSets > Final. - -* TIP: Use the ``file.path()`` function and the object ``finalData`` created in the master to simplify the folder path. - -## Saving a data set - -```{r, eval = F} -# Save the lwh data set -write.csv(lwh, - file.path(finalData,"lwh_clean.csv"), - row.names = F) -``` - -# Running a script from the master - -## Running a script from the master - -### Exercise 18 - -1. Save this script -2. Add a switch for Lab3 in your master -3. Create an if statement that runs this script if the switch is on. 
To do this, use the ``source()`` function, which is equivalent to the ``do`` function in Stata. - -* TIP: just like the ``do`` function in Stata, the only argumento of the ``source()`` function is the complete file path to your script. Don't forget to you explicit and dynamic file paths! - diff --git a/Presentations/Lab 3 - Data Processing.pdf b/Presentations/Lab 3 - Data Processing.pdf deleted file mode 100644 index 41260be..0000000 Binary files a/Presentations/Lab 3 - Data Processing.pdf and /dev/null differ diff --git a/Presentations/Lab 5 - Data visualization.Rmd b/Presentations/Lab 3 - Data visualization.Rmd similarity index 55% rename from Presentations/Lab 5 - Data visualization.Rmd rename to Presentations/Lab 3 - Data visualization.Rmd index b359906..b1d10a0 100644 --- a/Presentations/Lab 5 - Data visualization.Rmd +++ b/Presentations/Lab 3 - Data visualization.Rmd @@ -1,7 +1,7 @@ --- title: "Data Visualization" -subtitle: "Field Coordinator Training - R Track" -date: "June 2018" +subtitle: "R for Stata Users" +date: "November-December 2018" author: "Luiza Andrade, Leonardo Viotti & Rob Marty " output: beamer_presentation: @@ -25,17 +25,22 @@ library(ggplot2) # File paths if (Sys.getenv("USERNAME") == "luiza"){ - projectFolder <- "C:/Users/luiza/Documents/GitHub/R-Training" + projectFolder <- "C:/Users/luiza/Documents/GitHub/dime-r-training" + +} + +if (Sys.getenv("USERNAME") == "WB501238"){ + projectFolder <- "C:/Users/WB501238/Documents/GitHub/dime-r-training" } if (Sys.getenv("USERNAME") == "Leonardo"){ - projectFolder <- "C:/Users/Leonardo/Documents/GitHub/R-Training" + projectFolder <- "C:/Users/Leonardo/Documents/GitHub/dime-r-training" } if (Sys.getenv("USERNAME") == "WB519128"){ - projectFolder <- file.path("C:/Users/WB519128/Documents/GitHub/R-Training") + projectFolder <- file.path("C:/Users/WB519128/Documents/GitHub/dime-r-training") } # File paths @@ -43,10 +48,14 @@ if (Sys.getenv("USERNAME") == "WB519128"){ Data <- 
file.path(dataWorkFolder,"DataSets") finalData <- file.path(Data,"Final") + rawOutput <- file.path(dataWorkFolder,"Output","Raw") + # Load CSV data -lwh <- read.csv(file.path(finalData,"lwh_clean.csv"), + +whr <- read.csv(file.path(finalData,"whr_panel.csv"), header = T) + ``` @@ -56,7 +65,7 @@ lwh <- read.csv(file.path(finalData,"lwh_clean.csv"), In this session, you'll learn how to use R to produce insightful, meaningful and (hopefully) nice-looking graphs. In particular, you'll use a package called ``ggplot2``. -Similarly, to the previous session, you can find some references at the end of this presentation that include a more comprehensive discussions on data visualization. +Similarly to the previous session, you can find some references at the end of this presentation that include a more comprehensive discussion on data visualization. ## Introduction @@ -64,18 +73,10 @@ Before we start, let's make sure we're all set: \begin{enumerate} - \item Make sure you the packages \texttt{ggplot2} and \texttt{plotly} are installed and loaded. - - \begin{itemize} - \item Preferably, do this by adding them to \texttt{packages} vector of your Master script. - \end{itemize} - - \item Load the \texttt{lwh\_clean.csv} data set that were created in the last session. - \begin{itemize} - \item Run the Master script to load all the necessary packages and set file paths. - \item Remeber to disable the \texttt{PACKAGES} switch in the Master if you already installed them. This will save you a lot of time. - \end{itemize} - + \item Make sure the packages \texttt{ggplot2} and \texttt{plotly} are installed and loaded. + + \item Load the \texttt{whr\_panel.csv} data set.
+ \end{enumerate} ## Introduction @@ -83,21 +84,21 @@ Before we start, let's make sure we're all set: ```{r, eval= F} # Install packages - install.packages(c("ggplot2","plotly"), + install.packages("plotly", # We already installed ggplot2 dependencies = TRUE) # Load packages library(ggplot2) library(plotly) - - # Load CSV data - projectFolder <- "YOUR/FOLDER/PATH" - finalData <- file.path(projectFolder, - "DataWork","DataSets","Final") - lwh <- read.csv(file.path(finalData,"lwh_clean.csv"), - header = TRUE) - lwh_simp <- read.csv(file.path(finalData,"lwh_simp.csv"), - header = TRUE) + + #### Load CSV data + + # Replace with where your data is + FolderPath <- file.path("YOUR/FOLDER/PATH") + + whr <- read.csv(file.path(FolderPath,"whr_panel.csv"), + header = T, + stringsAsFactors = F) ``` @@ -127,19 +128,20 @@ In our workflow there are usually two distinct uses for plots: ## Exploratory graphs - Base plot ### Exercise 1: -Plot the ``lwh_simp`` data set you constructed in the previous session using the ``plot()`` function. +Plot the ``whr_simp`` data set you constructed in the previous session using the ``plot()`` function. 1. If you don't have the code from the previous session. Here it is: ```{r, eval = T} +# Vector with covariates to be kept +covariates <- c("happy_score", + "gdp_pc", + "freedom", + "trust_gov_corr") -covariates <- c("age_hhh", - "num_dependents", - "income_total_trim", - "expend_food_yearly", - "w_area_plots_a") +# subset whr +whr_simp <- whr[, covariates] -lwh_simp <- lwh[,covariates] ``` 2. Now, pass the name of the data set as the only argument for the plot function @@ -148,14 +150,14 @@ lwh_simp <- lwh[,covariates] \scriptsize ```{r} -plot(lwh_simp) +plot(whr_simp) ``` ## Exploratory graphs - Base plot \begin{block}{Exercise 2} \begin{enumerate} - \item Create a histogram of variable \texttt{w\_area\_plots\_a} in data frame \texttt{lwh\_simp}. Use the \texttt{hist()} function for that. 
+ \item Create a histogram of variable \texttt{happy\_score} in data frame \texttt{whr\_simp}. Use the \texttt{hist()} function for that. \item Try to set the color as \texttt{"green"}. Use the help if you are not sure how to do it. \end{enumerate} \end{block} @@ -164,22 +166,34 @@ plot(lwh_simp) ## Exploratory graphs - Base plot \scriptsize ```{r, eval = T, echo = T} -hist(lwh_simp$w_area_plots_a, col = "green") +hist(whr_simp$happy_score, col = "green") ``` ## Exploratory graphs - Base plot ### Exercise 3 -Use the ``boxplot()`` function to see a description of the yearly food expenditure by number of dependents. +Use the ``boxplot()`` function to see a description of the happiness score by region. - TIP: The syntax for this function is ``variable_to_be_plotted`` ~ ``variable_to_plot_over``. + ## Exploratory graphs - Base plot -\scriptsize -```{r} -boxplot(lwh_simp$expend_food_yearly ~ lwh_simp$num_dependents) +Since these region labels are too long, I'm using an extra argument so we can actually read the x axis. + +```{r, eval = F} +boxplot(whr$happy_score ~ whr$region, las = 2) +``` + + + +## Exploratory graphs - Base plot +\tiny + +```{r, echo = F} +par(mar=c(14,2,2,2)) +boxplot(whr$happy_score ~ whr$region, las = 2) ``` # ``ggplot`` - Introduction @@ -196,7 +210,7 @@ It is a powerful and easy to use tool (once you understand its logic) that produ There are a few reasons why we use ggplot: * It is generally easier to customize and tailor plots to your needs. - * It works great with almost any kind of plot, including maps, with little variation in syntax. The notable exception is 3D plotting, which can be done, but it's not as straightforward. + * It works great with almost any kind of plot, including maps, with little variation in syntax. * It looks good. 
## ``ggplot`` - Grammar @@ -221,49 +235,24 @@ Here is a list of ``ggplot2`` most commonly used "grammar" elements: -## ``ggplot`` - Data structure - -As almost everything in R, ``ggplot`` is very flexible and can be used in different ways. It is easier to work, however, if your data is in a specific format. - - * Data has to be a data frame. - - * Long format, especially categorical values. - + It's better to have one variable with all the categories than a set of dummies. - * Labelled factors. - -This is very similar to Stata, with the advantage of not needing to -preserve and restore anything. - -##``ggplot`` - Data structure - -Here's an example of how an aggregated (collapsed) data set should look like: - - -```{r, include = T, echo = F, warning=FALSE} -anualIncGen <- aggregate(x= lwh["income_total_trim"], - by = list(year = lwh$year, - gender = lwh$gender_hhh), - FUN = mean, na.rm = T) -anualIncGen - -``` - ## ``ggplot`` - Introduction ### Exercise 4 -First, let's create a very simple plot with the yearly expenditure on food on the Y axis and age of the household head on the X axis. +First, let's create a very simple plot with the happiness score on the Y axis and freedom on the X axis. 1. Use the ``ggplot`` function to store your plot in an object called ``plot1`` - Since it's the first one, Here's the code: -```{r, eval=T} -plot1 <- ggplot(data = lwh, - aes(y = w_area_plots_b, - x = w_gross_yield_b)) +```{r, eval=F} +p1_happyfree <- + ggplot(whr, + aes(y = happy_score, + x = freedom)) + ``` 2. Now, try to print your plot. What happens? @@ -274,10 +263,11 @@ plot1 <- ggplot(data = lwh, We mapped our variables, but without a geometry ``ggplot`` doesn't know how to represent the data.
### Let's add a geometry -```{r, eval=FALSE} -ggplot(data = lwh, - aes(y = w_area_plots_b, - x = w_gross_yield_b)) + +```{r, eval=TRUE} +p1_happyfree <- + ggplot(whr, + aes(y = happy_score, + x = freedom)) + geom_point() ``` @@ -285,94 +275,116 @@ ggplot(data = lwh, ## ``ggplot`` - Introduction \scriptsize -```{r, echo = F, warning = F} -ggplot(lwh, aes(y = w_area_plots_b, - x = w_gross_yield_b)) + - geom_point() +```{r, echo = T, warning = F} +print(p1_happyfree) ``` # ``ggplot`` - Data structure -## ``ggplot`` and ``aggregate()`` +## ``ggplot`` - Data structure + +As almost everything in R, ``ggplot`` is very flexible and can be used in different ways. It is easier to work, however, if your data is in a specific format. + + * Data has to be a data frame. + + * Long format, especially categorical values. + + It's better to have one variable with all the categories than a set of dummies. + * Labelled factors. + +This is very similar to Stata, with the advantage of not needing to +preserve and restore anything. + +##``ggplot`` - Data structure + +Here's an example of how an aggregated (collapsed) data set should look like: + + +```{r, include = T, echo = F, warning=FALSE} +happy_table <- + aggregate(happy_score ~ year + region, + data = whr, + FUN = mean) +happy_table[1:11,] + +``` + +## ``ggplot`` - Data structure * Since ``ggplot`` usually works better with data in a specific format, there are a few functions particularly useful to manipulate data * Here, we'll use the ``aggregate()`` function * ``aggregate()`` works similarly to ``collapse`` in Stata -## ``ggplot`` and ``aggregate()`` +## ``ggplot`` - Data structure -To create a line plot with ``ggplot``, we will create a new data set containing the average of the total house hold annual income by year. +To create a line plot with ``ggplot``, we will create a new data set containing the average of happiness score by year. 
```{r, eval= T} -# Aggregate anual income by year -anualInc <- - aggregate(x = lwh["income_total_trim"], # data.frame - by = list(year = lwh$year), #list - FUN = mean, na.rm = T) # function +# Average happiness score by year +annualHappy<- + aggregate(happy_score ~ year, + data = whr, + FUN = mean) ``` ### Exercise 5 - 1. Use the ``ggplot()`` function with the ``anualInc`` as the ``data`` argument, ``income_total_trim`` as ``y`` and ``year`` as ``x`` in ``aes()`` - 2. This time add ``geom_line()`` + 1. Use the ``ggplot()`` function with the ``annualHappy`` as the ``data`` argument, ``happy_score`` as ``y`` and ``year`` as ``x`` in ``aes()`` + 2. This time, add ``geom_line()`` -## ``ggplot`` and ``aggregate()`` -\scriptsize +## ``ggplot`` - Data structure -```{r, eval= F} +```{r, eval= T} # Plot code with arguments in geom -ggplot() + - geom_line(data = anualInc, - aes(y = income_total_trim, - x = year)) - +p2_happyyear <- + ggplot(data = annualHappy, + aes(y = happy_score, + x = year)) + + geom_line() ``` -## ``ggplot`` and ``aggregate()`` +## ``ggplot`` - Data structure \scriptsize -```{r, echo = F , eval= T} +```{r, echo = T , eval= T} -ggplot() + - geom_line(data = anualInc, - aes(y = income_total_trim, - x = year)) +print(p2_happyyear) ``` -## ``ggplot`` and ``aggregate()`` -Ok, that worked, but was a really boring plot. Let's do a cooler one. +## ``ggplot`` - Data structure -### Exercise 6: Part 1 +Ok, it worked, but that was a really boring plot. Let's do a better one. -Use the ``aggregate()`` function to average total house hold annual income, now, by two variables: `year` and `treatment_hh`. +### Exercise 6: Part 1 -- TIP: add an argument to the `list()` function in the by argument. Like this: +Use the ``aggregate()`` function to average happiness score, now, by two variables: `year` and `region`.
```{r, eval = F} - by = list(Year = lwh$year, - Treatment = lwh$treatment_hh) +annualHappy_reg <- + aggregate(happy_score ~ year + region, + data = whr, + FUN = mean) ``` -## ``ggplot`` and ``aggregate()`` +## ``ggplot`` - Data structure \scriptsize ```{r} # Aggregate data set -anualIncGen <- aggregate(x= lwh["income_total_win"], - by = list(year = lwh$year, - treatment = lwh$treatment_hh), - FUN = mean, na.rm = T) +annualHappy_reg <- + aggregate(happy_score ~ year + region, + data = whr, + FUN = mean) -anualIncGen +annualHappy_reg[1:10,] ``` -## ``ggplot`` and ``aggregate()`` +## ``ggplot`` - Data structure ### Exercise 6: Part 2 Now, use `ggplot` as in the previous exercise to plot the aggregated data frame. @@ -380,44 +392,109 @@ anualIncGen 1. This time set the ``color`` and `group` arguments in the ``aes()`` function as the treatment variable. 2. Finally, add both line and point ``geoms``. -## ``ggplot`` and ``aggregate()`` +## ``ggplot`` - Data structure \scriptsize ```{r, eval= F} # Aggregate data set -anualIncGen <- aggregate(x= lwh["income_total_win"], - by = list(year = lwh$year, - treatment = lwh$treatment_hh), - FUN = mean, na.rm = T) - -# Plot aggregated data set -ggplot(anualIncGen, - aes(y = income_total_win, - x = year, - color = treatment, - group = treatment)) + - geom_line() + - geom_point() +annualHappy_reg <- + aggregate(happy_score ~ year + region, + data = whr, + FUN = mean) + + +# Plot aggregated data +p3_happyreg <- + ggplot(data = annualHappy_reg, + aes(y = happy_score, + x = year, + color = region, + group = region)) + + geom_line() + + geom_point() + +print(p3_happyreg) ``` -## ``ggplot`` and ``aggregate()`` +## ``ggplot`` - Data structure \scriptsize ```{r, echo = F , eval= T, warning=FALSE} -anualIncGen <- aggregate(x= lwh["income_total_win"], - by = list(year = lwh$year, - treatment = lwh$treatment_hh), - FUN = mean, na.rm = T) +annualHappy_reg <- + aggregate(happy_score ~ year + region, + data = whr, + FUN = mean) 
-ggplot(anualIncGen, aes(y = income_total_win, x = year, color = treatment, group = treatment)) + +ggplot(annualHappy_reg, + aes(y = happy_score, + x = year, + color = region, + group = region)) + geom_line() + geom_point() ``` + +## ``ggplot`` - Data structure + +You can also use `ggplot` to combine different data frames in the same plot. + + * Everything inside the `ggplot()` function will be shared by all geometries added (e.g. the line and point geometries added to the previous exercise) + + * But you can also pass different arguments to different geometries. + +In the following example, to showcase just 3 regions, instead of restricting the data to only the regions of interest, we are adding 3 separate line geometries. + + +## ``ggplot`` - Data structure +\tiny + +```{r, echo = T , eval= T} +# Separate datasets +sea <- annualHappy_reg[annualHappy_reg$region == "Southeastern Asia", ] +mena <- annualHappy_reg[annualHappy_reg$region == "Middle East and Northern Africa", ] +cee <- annualHappy_reg[annualHappy_reg$region == "Central and Eastern Europe", ] + +# plot with 3 line geometries +p4_happy3reg <- + ggplot() + + geom_line(data = sea, # Southeastern Asia + aes(y = happy_score, + x = year, + color = region, + group = region)) + + geom_line(data = mena, # Middle East and Northern Africa + aes(y = happy_score, + x = year, + color = region, + group = region)) + + geom_line(data = cee, # Central and Eastern Europe + aes(y = happy_score, + x = year, + color = region, + group = region)) + + + + +``` + + +## ``ggplot`` and ``aggregate()`` + +\scriptsize + +```{r, eval= T, warning=FALSE} +print(p4_happy3reg) + + +``` + + # ``ggplot`` - Aesthetics ## ``ggplot`` - Aesthetics @@ -426,69 +503,64 @@ In ``ggplot`` , the aesthetic element defines how data is represented in the aes ### Exercise 7: Part 1 - 1. Create a data frame containing only 2018 observations of ``lwh`` in the Rwamangana 35 site. + 1. Create a data frame containing only 2017 observations of `whr`.
- TIP: This time we want to keep only observations (lines) that meet a certain condition. Use the left argument of the brackets operators. Like this: ```{r, eval = F} - lwh[CONDITION,] + whr[CONDITION,] ``` - 2. Use ``ggplot()`` to scatter plot area in season A and total annual income + 2. Use ``ggplot()`` to create a scatter plot of happiness score and freedom. ## ``ggplot`` - Aesthetics \scriptsize -```{r, eval= F} -# Subset lwh to 2018 and Rwamangana 35 -lwh_s <- lwh[lwh$year == 2018 & lwh$site_code == "Rwamangana 35", ] +```{r, eval= T} +# Subset whr to 2017 +whr17 <- whr[whr$year == 2017, ] # Plot -ggplot(lwh_s, aes(x = w_area_plots_a, - y = income_total_trim)) + - geom_point() +p5_happyfree17 <- + ggplot(whr17, aes(x = freedom, + y = happy_score)) + + geom_point() ``` ## ``ggplot`` - Aesthetics \scriptsize -```{r, echo = F , eval= T , warning=FALSE} -# Subset lwh to 2018 and Rwamangana 35 -lwh_s <- lwh[lwh$year == 2018 & (lwh$site_code == "Rwamangana 35"), ] +```{r, echo = T , eval= T , warning=FALSE} +print(p5_happyfree17) -# Plot -ggplot(lwh_s, aes(x = w_area_plots_a, y = income_total_trim)) + - geom_point() ``` -## ``ggplot`` - Aesthetics - -Now add the women's dietary diversity score[^1] to ``aes()``. +## `ggplot` - Aesthetics +Now add the region to `aes()`. ### Exercise 7: Part 2 - 1. Use the ``wdds_score`` varaible as color in ``aes()``. - - TIP: this is is a categorical variable, but it is stored in numeric format. Use the ``factor()`` function. Like this: + 1. Use the `region` variable as color in ``aes()``. + - TIP: this is a categorical variable, but it is stored in string format. Use the ``factor()`` function. Like this: ```{r, echo = T , eval= F} - color = factor(wdds_score) + color = factor(region) ``` -[^1]: -Women's dietary diversity score is based on the questions asked specifically to an adult female respondent about her food consumption. We ask if she ate 16 food items categorized into 9 groups.
Each group get 1 if any of the food belonging to the group is consumed. The final score is a simply sum of the categories. ## ``ggplot`` - Aesthetics \scriptsize -```{r, eval= F} +```{r, eval= T} # Plot code -ggplot(data = lwh_s, - aes(y = w_area_plots_a, - x = income_total_trim, - color = factor(wdds_score))) + - geom_point() +p6_happyfree17reg <- + ggplot(whr17, + aes(x = freedom, + y = happy_score)) + + geom_point(aes(color = factor(region))) + ``` @@ -497,9 +569,7 @@ ggplot(data = lwh_s, ```{r, echo = F , eval= T, warning=FALSE} -ggplot(data = lwh_s, - aes(y = w_area_plots_a, x = income_total_trim, color = factor(wdds_score))) + - geom_point() +print(p6_happyfree17reg) ``` @@ -509,28 +579,28 @@ Now we will combine different arguments of ``aes()`` ### Exercise 7: Part 3 - 1. Add expenditure on food last week variable, ``expend_food_lastweek`` to the ``size`` argument of ``aes()`` in your last graph. + 1. Add `gdp_pc` to the ``size`` argument of ``aes()`` in your last graph. ## ``ggplot`` - Aesthetics \scriptsize -```{r, eval= F} +```{r, eval= T} # Plot code -ggplot(data = lwh_s, - aes(y = w_area_plots_a, - x = income_total_trim, - color = factor(wdds_score), - size = expend_food_lastweek)) + - geom_point() +p6_happyfree17reg <- + ggplot(whr17, + aes(x = freedom, + y = happy_score)) + + geom_point(aes(color = factor(region), + size = gdp_pc)) + ``` ## ``ggplot`` - Aesthetics \scriptsize -```{r, echo = F , eval= T, warning=FALSE} +```{r, echo = T , eval= T, warning=FALSE} -ggplot(lwh_s, aes(y = w_area_plots_a, x = income_total_trim, color = factor(wdds_score), size = expend_food_lastweek)) + - geom_point() +print(p6_happyfree17reg) ``` @@ -542,12 +612,12 @@ ggplot(lwh_s, aes(y = w_area_plots_a, x = income_total_trim, color = factor(wdds ```{r, eval= F} # Plot code -ggplot(data = lwh_s, - aes(y = w_area_plots_a, - x = income_total_trim, - color = w_gross_yield_a, - size = expend_food_lastweek)) + +ggplot(whr17, aes(x = freedom, + y = happy_score, + color = 
dystopia_res, + size = gdp_pc)) + geom_point() + ``` ## ``ggplot`` - Aesthetics @@ -555,7 +625,10 @@ ggplot(data = lwh_s, ```{r, echo = F , eval= T, warning=FALSE} -ggplot(lwh_s, aes(y = w_area_plots_a, x = income_total_trim, color = w_gross_yield_a, size = expend_food_lastweek)) + +ggplot(whr17, aes(x = freedom, + y = happy_score, + color = dystopia_res, + size = gdp_pc)) + geom_point() ``` @@ -591,35 +664,22 @@ To add legend titles, we will use two functions that control the formatting of t ## ``ggplot`` - Titles and labels \scriptsize -```{r, eval= F} +```{r, eval= T} # A properly labelled plot -ggplot(lwh_s, aes(y = w_area_plots_a, - x = income_total_trim, - size = expend_food_lastweek, - color = factor(wdds_score))) + - geom_point() + - ggtitle("My pretty plot") + - xlab("Total Income (Wins.)") + - ylab("Plot area in season A (Wins.)") + - scale_color_discrete(name = "Women's dietary diversity score") + - scale_size_continuous(name = "Food exp. last week") +p6_happyfree17reg <- + p6_happyfree17reg + + ggtitle("My pretty plot") + + xlab("Freedom") + + ylab("Happiness") + + scale_color_discrete(name = "World Region") + + scale_size_continuous(name = "Score of GDP per capita") ``` ## ``ggplot`` - Titles and labels \scriptsize -```{r, echo = F , eval= T, warning=FALSE} -ggplot(data = lwh_s, - aes(y = w_area_plots_a, - x = income_total_trim, - color = factor(wdds_score), - size = expend_food_lastweek)) + - geom_point() + - ggtitle("My pretty plot") + - xlab("Total Income (Wins.)") + - ylab("Plot area in season A (Wins.)") + - scale_color_discrete(name = "Women's dietary diversity score") + - scale_size_continuous(name = "Food exp. 
last week") +```{r, echo = T , eval= T, warning=FALSE} +print(p6_happyfree17reg) ``` # Saving a plot @@ -638,62 +698,105 @@ Let's save the previous plot using the PDF graphics device \scriptsize ```{r, eval= F} - # Open PDF graphics device - pdf(file = file.path(rawOutput, "plot1.pdf")) - # Plot - ggplot(data = lwh_s, - aes(y = w_area_plots_a, - x = income_total_trim, - color = factor(wdds_score), - size = expend_food_lastweek)) + - geom_point() + - ggtitle("My pretty plot") + - xlab("Total Income (Wins.)") + - ylab("Plot area in season A (Wins.)") + - scale_color_discrete(name = "Women's dietary diversity score") + - scale_size_continuous(name = "Food exp. last week") + Output <- "YOUR/PATH/HERE" + +# Open PDF graphics device + pdf(file = file.path(Output, "plot1.pdf")) - # Close PDF graphics device - dev.off() + # Plot + print(p6_happyfree17reg) + +# Close PDF graphics device +dev.off() ``` -# Interactive graph +# Interactive graphs + + +## Interactive graphs -## Interactive graph +There are several packages to create interactive or dynamic data visualizations with R. Here are a few: -Now we'll use ggplotly() to create an interactive graph! + * `leaflet` - R integration to one of the most popular open-source libraries for interactive maps. + * `highcharter` - cool interactive graphs. + * `plotly` - interactive graphs with integration to ggplot. + * `gganimate` - ggplot GIFs. + * `DT` - Interactive table -### Bonus exercise - ggplotly +These are generally HTML widgets that can be incorporated into HTML documents and websites. + +## Interactive graphs + +Now we'll use the `ggplotly()` function from the `plotly` package to create an interactive graph! + +### Exercise 9 1. Load the ``plotly`` package - 2. Store one the plots produced in the previous ``ggplot`` exercises in an object. - 3. Pass that object as argument for the ``ggplotly()`` function + 2.
Pass that object with the last plot you created to the ``ggplotly()`` function -## Interactive graph +## Interactive graphs \scriptsize ```{r, eval= F} # Load package library(plotly) -# Store graph in plot1 -plot1 <- ggplot(data = lwh_s, - aes(y = w_area_plots_a, - x = income_total_trim, - color = factor(wdds_score))) + - geom_point() - -# Use ggplotly to create an interactive plot -ggplotly(plot1) + # Use ggplotly to create an interactive plot + ggplotly(p6_happyfree17reg) ``` +## Interactive graphs + +(Sorry, this is a .pdf presentation so this is just a screenshot.) + +\begin{figure} +\centering + \includegraphics[width=12cm,height=7.7cm]{img/ggplotly1.png} +\end{figure} + + + +## Homework - Interactive graphs + +We can add pop-up information by adding another argument to the `aes()` function. + +### Customize your ggplotly plot + 1. Add a `text` argument to the`geom_point` `aes()` function and assign it to the `country` variable. This argument won't be recognized by `ggplot()`, but will be by `ggplotly()`. + 2. Use themes to change the background color. You can find a list of default themes in https://ggplot2.tidyverse.org/reference/ggtheme.html + + - TIP: type `?ggthemes::` on the console for more options + + 3. Use the `tooltip` argument of the `ggplotly` function to customize what is shown when hovering + +## Interactive graphs + +Here's what we've done: + +\begin{figure} +\centering + \includegraphics[width=12cm,height=7.7cm]{img/ggplotly2.png} +\end{figure} + + +## Interactive graphs + +(Sorry, this is a .pdf presentation so this is just a screenshot.) 
+ +\begin{figure} +\centering + \includegraphics[width=12cm,height=7.7cm]{img/ggplotly2.png} +\end{figure} + # References and recommendations ## References and recommendations Websites: + * Interactive stuff : + http://www.htmlwidgets.org/ + * The R Graph Gallery: https://www.r-graph-gallery.com/ @@ -718,9 +821,6 @@ Books: # Appendix -## Appendix - -\large Slides that didn't make the cut ## R common graphic devices diff --git a/Presentations/Lab 4 - Coding for Reproducible Research.Rmd b/Presentations/Lab 4 - Coding for Reproducible Research.Rmd new file mode 100644 index 0000000..0992533 --- /dev/null +++ b/Presentations/Lab 4 - Coding for Reproducible Research.Rmd @@ -0,0 +1,817 @@ +--- +title: "Coding for Reproducible Research" +subtitle: "R for Stata Users" +date: "November-December 2018" +author: "Luiza Andrade, Leonardo Viotti & Rob Marty " +output: + beamer_presentation: + #theme: "Pittsburgh" + theme: "Madrid" + colortheme: "whale" + fonttheme: "default" + toc: true + includes: + in_header: header.tex +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +```{r, echo = F, eval= T, include=FALSE} +# Packages +library(ggplot2) + +# File paths + +if (Sys.getenv("USERNAME") == "luiza"){ + projectFolder <- "C:/Users/luiza/Documents/GitHub/dime-r-training" + +} + +if (Sys.getenv("USERNAME") == "WB501238"){ + projectFolder <- "C:/Users/WB501238/Documents/GitHub/dime-r-training" + +} + +if (Sys.getenv("USERNAME") == "Leonardo"){ + projectFolder <- "C:/Users/Leonardo/Documents/GitHub/dime-r-training" + +} + +if (Sys.getenv("USERNAME") == "WB519128"){ + projectFolder <- file.path("C:/Users/WB519128/Documents/GitHub/dime-r-training") +} + +# File paths + dataWorkFolder <- file.path(projectFolder,"DataWork") + + Data <- file.path(dataWorkFolder,"DataSets") + finalData <- file.path(Data,"Final") + rawOutput <- file.path(dataWorkFolder,"Output","Raw") + +# Load CSV data + +whr <- read.csv(file.path(finalData,"whr_panel.csv"), + header = T) + + 
+``` + +# Introduction + +## Why are we here today? + +### Code is a research output +We often think of code as a means to an end, but code is an end in itself. + +## Reproducibility + + \begin{block}{Can other people...} + \begin{itemize} + \item run your code? + \item recreate your results? + \item understand what you are doing and why? + \item make changes to your code if necessary? + \end{itemize} + \end{block} + +## Introduction + + \begin{block}{Objective} + Create an R Master Script. + \end{block} + + \textbf{Content} + \begin{enumerate} + \item Comments + \item Using R Studio to create a code index + \item Indenting + \item File paths + \item If statements + \item Using functions within functions + \item Packages + \item Loops + \end{enumerate} + +## Introduction + + * For the purpose of this training, we will assume that you are dealing with a specific folder structure + * Folder organization, though an important part of data work, is outside the scope of this course + * You can find resources about it in the appendix, and we have sent you a folder that is organized as we want it to be + * To follow today's session, go to the ``DataWork/Code`` folder and open the file called ``Lab 4 - Coding for Reproducible Research.R`` + +## Master scripts + + * Just like in Stata, a Master script must be a map of all data work + * It should be possible to follow all data work in the data folder, from raw data to analysis output, by following the master script + * It should also be possible to run all the code for the project by simply running the Master script + * Finally, in Stata, the Master Script is where you create globals.
You can do the same in R by just creating new objects + +# Initial Settings + +## Initial Settings + +A Stata do-file typically starts with a few settings: + +```{stata, eval = F} + clear + set maxvar 120000 + set more off +``` + +## Initial Settings + + * We don't need to set the memory or the maximum number of variables in R, and the equivalent of ``more`` is the default in R + * However, if you saved the last RStudio session in ``.RData``, the objects that were in RStudio's memory last time you closed it will still be there whenever you open it again + * You can see all the objects currently in your memory in the *Environment* pane + +## Initial Settings + +### Exercise 1: Clear workspace + * Make sure the `Environment` window is open. Is anything there? + * Create an object called `foo` with any content you pick + * Type `ls()` to print the names of the objects in memory + * Type `rm(foo)` to remove the `foo` object from your memory + * To remove all objects, use `rm(list=ls())` + +## Initial Settings + +\begin{figure} +\centering + \includegraphics[width=12cm,height=7.7cm]{img/rm.png} +\end{figure} + +## Initial Settings + +### Exercise 2: No one will burn your computer +Here's how you change these settings + +## Initial Settings + +\begin{figure} +\centering + \includegraphics[width=12cm,height=7.7cm]{img/Options1.png} +\end{figure} + +## Initial Settings + +\begin{figure} +\centering + \includegraphics[width=12cm,height=7.7cm]{img/Options2.png} +\end{figure} + +## Initial Settings + +\begin{figure} +\centering + \includegraphics[width=12cm,height=7.7cm]{img/Options3.png} +\end{figure} + +# Commenting + +## Why comment? + + Comments have two purposes: + \begin{enumerate} + \item Tell the reader what you are doing + \item Tell the reader why you do what you do + \end{enumerate} + + If you have ever read code with no comments, you probably understand why they are necessary.
+ +## Commenting + + \begin{itemize} + \item Let's take a look at the script we just opened + \item You can see that the first few lines in the script are the header, but they're not commented out + \item In R, errors will not always break your code, so you should still be able to run this script + \item However, not commenting out comments is still bad practice, as it makes the code harder to read + \end{itemize} + +## Commenting + + + * To comment a line, write ``#`` as its first character + * You can also add ``#`` half way through a line to comment whatever comes after it + * In Stata, you can use ``/*`` and ``*/`` to comment part of a line's code. That is not possible in R: whatever comes after ``#`` will be a comment + * To comment a selection of lines, press ``Ctrl + Shift + C`` + +## Commenting + +### Exercise 3: Commenting + +1. Use the keyboard shortcut to comment the header of the script. + +2. Use the keyboard shortcut to comment the header of the script again. What happened? + + +# Creating a document outline in RStudio + +## Creating a document outline in RStudio + + * RStudio also allows you to create an interactive index for your scripts + * To add a section to your code, create a commented line with the title of your section and add at least 4 trailing dashes, pound signs or equal signs after it + +### Exercise 4: headers + +1. Open the script index and make `PART 1` a section header. Do the same for parts 2 and 3. + +2. Note that once you create a section header, an arrow appears right next to it. Click on the arrows of parts 2 and 3 to see what happens. + + +## Creating a document outline in RStudio + +* The outline can be accessed by clicking on the button on the top right corner of the script window. 
You can use it to jump from one section to another +* You can also use the keyboard shortcuts `Alt + L` (`Cmd + Option + L` on Mac) and `Alt + Shift + L` to collapse and expand sections + +# Functions inception + +## Functions inception + + * In R, you can use the output of one function as the input of another, as long as they have the same format + * In fact, that's exactly what we just did when installing the packages + * To see that, select just the first argument of the ``install.packages`` function and press ``Ctrl + Enter`` + * The ``c()`` function, as we know, creates a vector with its arguments + + +```{r, eval = T} + + c("stargazer", "swirl") + +``` + +## Functions inception + + * The resulting vector is used as an input to the ``install.packages`` function + * We could also have stored this vector in the memory as an object and used that object as the input + * In fact, that's exactly what we are going to do next, so the code doesn't get too polluted as we add new packages + +## Functions inception + +```{r, eval = T, echo = F} + packages <- c("stargazer", + "swirl") + +``` + +```{r, eval = F} + + # Create packages object + packages <- c("stargazer", + "swirl") + + # Use it as an input to the install.packageS() function + install.packages(packages, + dependencies = TRUE) + +``` + +## Using packages + +### Exercise 6: Load packages +Load the packages you just installed, as well as the other packages used in previous sessions. + +- Note that the `library` function only accepts one argument. This means that you cannot use function inception with the `packages` object here, as its output format does not match the function input. In fact, you will need to load each of them separately. 
+ +## Using packages + +```{r, eval = F} + + library(swirl) + library(stargazer) + library(tidyverse) + library(openxlsx) + library(ggplot2) + library(plotly) + +``` + +# Looping + +## Looping + + * If you're wondering what happens as I add more packages here, you're going in the right direction + * Repetition makes the code harder to read and to maintain + +```{r, eval = F} + # The DRY rule: + DONT REPEAT YOURSELF + +``` + +## Looping + * In Stata, we'd usually use a ``foreach`` loop to go through a list of objects + * If you are a Stata coder, you are probably thinking of writing something like + +```{stata, eval = F} + * Looping through packages + foreach package in swirl stargazer tidyverse /// + openxlsx ggplot2 plotly { + library `package' + } +``` + +## Looping + + * The equivalent to that in R would be to write a for loop like this (this code won't run, so don't try it yet) + + +```{r, eval = F} + # A for loop in R + for (package in packages) { + library(package) + } +``` + +## Looping + + * R, however, has a whole function family that allows users to loop through an object in a more efficient way + * They're called ``apply`` and there are many of them, with different use cases + * If you look for the ``apply`` help file, you can see all of them + * For the purpose of this training, we will only use two of them, ``sapply`` and ``apply`` + +## Looping + + * ``sapply(X, FUN, ...)``: applies a function to all elements of a vector or list and returns the result in a vector. Its arguments are + * **X:** a vector or list whose elements the function will be applied to + * **FUN:** the function you want to apply + * **...:** possible function options + +## Looping + +```{r} + # A for loop in R + for (number in c(1.2,2.5)) { + print(round(number)) + } + + # A much more elegant loop in R + sapply(c(1.2,2.5), round) +``` + +## Looping + +### Exercise 7: Looping in R +Use the `sapply()` function to apply the `library()` function to all packages you have selected. 
+ +- TIP: to do this without creating errors, you will need to use that `...` argument and make the `character.only` argument of the `library()` function equal to `TRUE` + +## Looping + +```{r, eval = F} + # Load all listed packages + sapply(packages, library, character.only = TRUE) +``` + +## Looping + +A more general version is the `apply` function. + + * ``apply(X, MARGIN, FUN, ...)``: applies a function to all columns or rows of matrix. Its arguments are + * **X:** a matrix (or data frame) the function will be applied to + * **MARGIN:** 1 to apply the function to all rows or 2 to apply the function to all columns + * **FUN:** the function you want to apply + * **...:** possible function options + +## Looping + + +```{r, eval = T} + # Create a matrix + matrix <- matrix(c(1, 24, 9, 6, 9, 4, 2, 74, 2), + nrow = 3) + + # Look at the matrix + matrix +``` + + +## Looping + +```{r, eval = T} + # Row means + apply(matrix, 1, mean) + + # Column means + apply(matrix, 2, mean) +``` + +## Looping + + * The `apply()` function is a very powerful tool in R programming + * To explore it to its full potential, we combine it with customized functions + * This is beyond today's session, and we will come back to it in our last day, but you can give it a try on this session's assignment + +# Indentation + +## Why indent? +\scriptsize +```{r, eval = F} +# Here's some code +annualHappy_reg <- aggregate(happy_score ~ year + region, data = whr, FUN = mean) +plot <- ggplot(annualHappy_reg,aes(y = happy_score,x = year, color = region, +group = region)) + geom_line() + geom_point() +print(plot) +``` + +## Why indent? + +```{r, eval = T, echo = F} +annualHappy_reg <- aggregate(happy_score ~ year + region, data = whr, FUN = mean) +plot <- ggplot(annualHappy_reg,aes(y = happy_score,x = year, color = region, group = region)) + +geom_line() + geom_point() +print(plot) +``` + +## Why indent? 
+ +```{r, eval = F} +# Here's the same code +annualHappy_reg <- + aggregate(happy_score ~ year + region, + data = whr, + FUN = mean) + +plot <- + ggplot(annualHappy_reg, + aes(y = happy_score, + x = year, + color = region, + group = region)) + + geom_line() + + geom_point() + +print(plot) +``` + +## Why indent? + +```{r, echo = F} +annualHappy_reg <- + aggregate(happy_score ~ year + region, + data = whr, + FUN = mean) + +plot <- + ggplot(annualHappy_reg, + aes(y = happy_score, + x = year, + color = region, + group = region)) + + geom_line() + + geom_point() + +print(plot) +``` + +## Why indent? + * Even though R understands what unindented code says, it can be quite difficult for a human being to read it + * On the other hand, white space does not have a special meaning for R, so it will understand code that is more readable for a human being + +## Indentation + + \begin{itemize} + \item Indentation in R looks different than in Stata: + \begin{itemize} + \item To indent a whole line, you can select that line and press \texttt{Tab} + \item To unindent a whole line, you can select that line and press \texttt{Shift + Tab} + \item However, this will not always work for different parts of a code in the same line + \end{itemize} + \item In R, we typically don't introduce white space manually + \item It's rather introduced by RStudio for us + \end{itemize} + +## Indentation + +### Exercise 8: Indentation in R +To see an example of how indenting works in RStudio, go back to our first example with `sapply`: + +```{r, eval = F} +# A much more elegant loop in R +sapply(c(1.2,2.5), round) +``` + +1. Add a line between the two arguments of the function (the vector of numbers and the round function) + +2. Now add a line between the numbers in the vector. 
+ + +## Indentation +Note that RStudio formats the different arguments of the function differently: + +```{r, eval = F} +# A much more elegant loop in R +sapply(c(1.2, + 2.5), + round) +``` + +# If statements + +## If statements + + * Now, installing packages can be time-consuming, especially as the number of packages you're using grows, and each package only needs to be installed once + * What can we bring from our Stata Master do-files to avoid installing packages twice? + +## If statements + + * In Stata, section switches would be saved as locals + * In R, the equivalent to that would be to create a new object + +### Exercise 9: Creating an if statement + Create a dummy scalar object called PACKAGES. + + - TIP: Section switches can also be Boolean objects. + +## If statements + + * Now we need to create an if statement using this switch + * If statements in R look like this: +```{r, eval = F} + + # Turn switch on + PACKAGES <- 1 + + # Install packages + if (PACKAGES == 1) { + install.packages(packages, + dependencies = TRUE) + } + +``` + +## If statements + + * Possible variations would include +```{r, eval = F} + # Turn switch on + PACKAGES <- TRUE + + # Using a Boolean object + if (PACKAGES == TRUE) { + install.packages(packages, dep = T) + } + + # Which is the same as + if (PACKAGES) { + install.packages(packages, dep = T) + } + +``` + +# File paths + +## File paths + + * The next important part of a Master script are file paths + * We recommend always using **explicit** and **dynamic** file paths + +## File paths + + * Implicit and static file path: + +```{r, eval = F} + # Set working directory + setwd("C:/Users/luiza/Documents/GitHub/dime-r-training/ + DataWork/DataSets/Final") + + # Load data set + read.csv("whr_panel.csv", + header = T) +``` + +## File paths + + * Explicit and static file path: + +```{r, eval = F} + # Load data set + read.csv("C:/Users/luiza/Documents/GitHub/dime-r-training/ + DataWork/DataSets/Final/whr_panel.csv", + header = T) +``` + 
+## File paths + + * Explicit and dynamic file path: + +```{r, eval = F} + # Define dynamic file path + finalData <- "C:/Users/luiza/Documents/GitHub/ + dime-r-training/ + DataWork/DataSets/Final" + + # Load data set + whr <- read.csv(file.path(finalData,"whr_panel.csv"), + header = T) + +``` +## File paths + + * File paths in R, as in Stata, are basically just strings + * Note, however, that in R we can only use forward slashes (``/``) to separate folder names + +### Exercise 10: File path to your folder +Let's start by adding the folder path to the training's folder in your computer to the beginning of `PART 3` + +## File paths + + * You can set file paths in your master using the ``file.path()`` function + * This function concatenates strings using ``/`` as a separator to create file paths + +## File paths + +```{r, eval = T} + + # Project folder + projectFolder <- + "C:/Users/luiza/Documents/GitHub/dime-r-training" + + # Data work folder + dataWorkFolder <- file.path(projectFolder,"DataWork") + + # Print data work folder + dataWorkFolder + +``` + +## File paths + +Let's test if that worked: + +### Exercise 11: Test file paths + +1. Save your code. + +2. Start a new R session: go to `Session > New Session`. This session should be completely blank. + +3. Open your Master code and turn off the switch that installs packages in your Master script + +4. Add a line opening the data set in `PART 5` of your Master script +```{r, eval = F} +# Load data set +whr <- read.csv(file.path(finalData,"whr_panel.csv"), + header = T) + +``` + +5. Run the whole script. If it worked, your environment should include only the `whr` data set. + + +## Sourcing scripts + +As we discussed earlier, one of the main objectives of a Master script is to be able to run all codes from a project from it. + +To run an R script from another, we use a function called `source`: + +* `source(file, echo, verbose)`: reads and evaluates the expressions in `file`. 
Its arguments are: + + * **file:** a pathname or URL to read from + + * **echo:** if TRUE, each expression is printed after parsing, before evaluation + + * **verbose:** if TRUE, more diagnostics (than just echo = TRUE) are printed during parsing and evaluation of input, including extra info for each expression + +## Sourcing scripts + +### Exercise 12: Run scripts from the master +1. Add your scripts from sessions 2 and 3 to the `Code` folder inside your `DataWork` folder. + +2. Add two lines to your Master script, each one sourcing one of your session scripts + +3. Run your master script. + +## Sourcing scripts + +```{r, eval = F} +# PART 5: Run selected sections ========================== +source(file.path(Code, "Lab 2"), + verbose = T, + echo = T) + +source(file.path(Code, "Lab 3"), + verbose = T, + echo = T) +``` + +## That's all, folks + + * Now you have a template master script to use across this training's sessions + * Save the script that you created during this session in the *DataWork* folder. Call it *MASTER.R* + * You can also create switches for each of the sessions, and only run selected pieces of code from your Master script + + +## Assignment + + * You can customize your loops in R by defining your own function + * This is done using a function conveniently called ``function()`` + * For example, if instead of just printing a number we want to print its square, we could create a function that does that: + + +```{r, eval = T} + # A much more elegant for loop in R + sapply(c(1,2), function(x) x^2) +``` + +## Assignment + +Create a function that + + 1. Loops through the packages listed in the `packages` vector + + 2. Tests if a package is already installed + + 3. 
Only installs packages that are not yet installed + + - TIP: to test if a package is already installed, use the following code: +```{r, eval = F} + # Test if object x is contained in + # the vector of installed packages + x %in% installed.packages() +``` + +# Appendix + +## Using packages + + * Since there is a lot of people developing for R, it can have many different functionalities + * To make it simpler, these functionalities are bundled into packages + * A package is the fundamental unit of shareable code + * It may contain new functions, but also more complex functionalities, such as a Graphic User Interface (GUI) or settings for parallel processing (similar to Stata MP) + * They can be shared through R's official repository - CRAN (10,000+ packages reviewed and tested) and many other online sources + * There are many other online sources such as Github, but it's important to be careful, as these probably haven't gone through a review process as rigorous as those in CRAN + +## Using packages + +* To install and use packages you can either do it with the user interface or by the command prompt. +```{r, eval = F} +# Installing a package +install.packages("stargazer", + dependencies = T) +# the dependencies argument also installs all other packages +# that it may depend upon +``` + +* You only have to install a package once, but you have to load it every new session. To load a package type: +```{r, eval = F} +library(stargazer) + +``` + +## Using packages + +Once a package is loaded, you can use its features and functions. Here's a list of some useful and cool packages: + +* Rcmdr - Easy to use GUI +* swirl - An interactive learning environment for R and statistics. 
+* ggplot2 - beautiful and versatile graphics (the syntax is a pain, though) +* stargazer - awesome latex regression and summary statistics tables +* foreign - reads dtas and other formats from inferior statistical software +* zoo - useful functions for time series and panel data manipulation +* data.table - some functions to deal with huge data sets +* sp and rgeos - spatial analysis +* multiwayvcov and sandwich - clustered and robust standard errors +* RODBC, RMySQL, RPostgreSQL, RSQLite - For relational databases and using SQL in R. + +## Using packages + +### Exercise 5: Installing packages +Install the `swirl` and `stargazer` packages, including packages necessary for them to run. + +## Using packages +\scriptsize +```{r, eval = F} + + # Install stargazer and swirl + install.packages(c("stargazer", "swirl"), dependencies = TRUE) + +``` + +## Resources + + * A discussion of folder structure and data management can be found here: + https://dimewiki.worldbank.org/wiki/DataWork_Folder + * For a broader discussion of data management, go to + https://dimewiki.worldbank.org/wiki/Data_Management + +## Git + + Git is a version-control system for tracking changes in code and other text files. It is a great resource to include in your work flow. + + We didn't cover it here because of time constraints, but below are some useful links, and DIME Analytics provides trainings on Git and GitHub, so keep an eye out for them. + + * **DIME Analytics git page:** https://worldbank.github.io/dimeanalytics/git/ + * **A Quick Introduction to Version Control with Git and GitHub:** https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1004668 + +## R projects + + If you have used R before, you may have heard of RStudio Projects. It's RStudio's suggested tool for workflow management. DIME Analytics has found that it is not the best fit for our needs, because + + 1. 
In DIME, we mainly use Stata, and we prefer to keep a similar structure in R (Stata 15 also has a projects feature, but it is not yet widely adopted) + + 2. We need to keep our code and data in separate folders, as we store code in GitHub and data in DropBox + + However, if you want to learn more about it, we recommend starting here: + https://r4ds.had.co.nz/workflow-projects.html + diff --git a/Presentations/Lab 4 - Descriptive analysis.Rmd b/Presentations/Lab 4 - Descriptive analysis.Rmd deleted file mode 100644 index ccbda6d..0000000 --- a/Presentations/Lab 4 - Descriptive analysis.Rmd +++ /dev/null @@ -1,599 +0,0 @@ ---- -title: "Descriptive analysis" -subtitle: "Field Coordinator Training - R Track" -date: "June 2018" -author: "Luiza Andrade, Leonardo Viotti & Rob Marty " -output: - beamer_presentation: - #theme: "Pittsburgh" - theme: "Madrid" - colortheme: "whale" - fonttheme: "default" - toc: true - includes: - in_header: header.tex ---- - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -```{r, echo = F, eval=T, include=FALSE} -library(stargazer) - -# File paths - -if (Sys.getenv("USERNAME") == "luiza"){ - projectFolder <- "C:/Users/luiza/Documents/GitHub/R-Training" - -} - -if (Sys.getenv("USERNAME") == "WB501238"){ - projectFolder <- "C:/Users/WB501238/Documents/GitHub/R-Training" - -} - -if (Sys.getenv("USERNAME") == "Leonardo"){ - projectFolder <- "C:/Users/Leonardo/Documents/GitHub/R-Training" - -} - -if (Sys.getenv("USERNAME") == "WB519128"){ - projectFolder <- file.path("C:/Users/WB519128/Documents/GitHub/R-Training") -} - - # File paths - dataWorkFolder <- file.path(projectFolder,"DataWork") - - Data <- file.path(dataWorkFolder,"DataSets") - finalData <- file.path(Data,"Final") - rawOutput <- file.path(dataWorkFolder,"Output","Raw") - - # Load CSV data - lwh <- read.csv(file.path(finalData,"lwh_clean.csv"), - header = T) -``` - -# Introduction - -## Introduction - -Descriptive statistics are used to represent the basic 
features of data. When we talk about descriptive analysis, it usually means that we're not making any assumptions, and we're not using probability theory to infer anything beyond the immediate data. - -This session is mostly focused on how to implement descriptive analysis in R. We will not go in depth into these concepts, but you can find some useful references at the end of this presentation. - -## Introduction - -This session will cover two topics: - - \begin{enumerate} - \item Quick ways to extract summary information from your data - \item How to use this information to create and export tables - \end{enumerate} - - -## Introduction - -First, let's load the data that is going to be used in the training. Paths should be set in your master file! - -#### Load the data -```{r, include = T, results = "hide", eval = F} - # Load CSV data - lwh <- read.csv(file.path(finalData,"lwh_clean.csv"), - header = T) - -``` - -# Quick summary statistics - - -## Quick summary statistics - - -`summary(x, digits)` - equivalent to Stata's *summarize*, displays summary statistics. Its arguments are: - - * **x:** the object you want to summarize, usually a vector or data frame - * **digits:** the number of decimal digits to be displayed - -\begin{block}{Exercise 1} - Use the \texttt{summary()} function to display summary statistics for the \textit{lwh} data frame. -\end{block} - -## Quick summary statistics - -\scriptsize - -```{r, eval = F} - # Summary statistics - summary(lwh) -``` - -```{r, eval = T, echo = F} - # Summary statistics - summary(lwh[,1:8]) -``` - -## Quick summary statistics - - -``table()`` - equivalent to `tabulate` in Stata, creates a frequency table. Its main arguments are the objects to be tabulated. 
- -\begin{block}{Exercise 2} - Use the \texttt{table()} function to display frequency tables for: - \begin{enumerate} - \item The variable \textit{year} in the \textit{lwh} data frame - \item The variables \textit{gender\_hhh} and \textit{year} in the \textit{lwh} data frame, simultaneously - \end{enumerate} -\end{block} - - -## Quick summary statistics -```{r, eval = T} - # Year of data collection - table(lwh$year) -``` - -## Quick summary statistics -```{r, eval = T} - # Gender of household head per year - table(lwh$gender_hhh, lwh$year) -``` - - -# Descriptives tables - -## Descriptives tables - - - We can also use the \texttt{stargazer()} function to quickly display a nice-looking descriptives table. - - \textit{Stargazer} was originally developed to export beautiful regression tables to \LaTeX\ or html, but it also allows you to generate summary statistics. - - It can also be used to export any data frames you create to \LaTeX\ as a formatted table. To do that, you fist need to construct a data frame object combining vectors (of the same length) with the desired information. - -## Descriptives tables - - -### If you haven't yet done this in your master, install and load the ``stargazer`` package now! - -```{r, eval = F} - - # Install stargazer - install.packages("stargazer", - dependencies = TRUE) - - # Load stargazer - library(stargazer) -``` - - -## Descriptives tables - -\begin{block}{Exercise 3 - \texttt{stargazer()} summary statistics table} - Use the \texttt{stargazer()} function to display summary statistics for the variables in the \textit{lwh} data frame. -\end{block} - - The \texttt{stargazer()} function accepts \textbf{a lot} of arguments, most of which are beyond the scope of this session. 
Here are the arguments you'll need for this specific table: - \begin{itemize} - \item \textbf{x:} the object you want to summarize -- in this case a vector or data frame - \item \textbf{type:} the output format -- "text" to just display, "latex" (the default) to save as a \LaTeX\ table, and "html" for, yes, html - \item \textbf{digits:} the number of decimal digits to be displayed - \end{itemize} - -## Descriptives tables -\tiny - -```{r, eval = F} -# A descriptive table with stargazer -stargazer(lwh, - digits = 1, - type = "text") - -``` - - - -```{r, include = T, echo = F} -stargazer(lwh[1:15], - digits = 1, - type = "text") - -``` - - - -# Export tables to \LaTeX - -## Export tables to \LaTeX\ - - -To export the table to \LaTeX\, we will use a couple of additional arguments of the `stargazer()` function: - - \begin{itemize} - \item \textbf{out:} where to save the table, i.e., the file path, including the file name - \item \textbf{covariate.labels:} a vector of variable labels - \end{itemize} - -## Export tables to \LaTeX\ - -But first, let's pick a few variables of interest in the `lwh` data set so the table fits in these slides. - -### Exercise 4 - 1. Create a vector called ``covariates`` containing the string names of the variables you want to keep: `age_hhh`, `num_dependents`, `income_total_win`, and `expend_food_yearly`. - 2. Use this vector to subset the lwh dataset to contain only these variables. Call the new data frame `lwh_simp`. - - -## Export tables to \LaTeX\ -\scriptsize - -```{r} -# Vector with covariates to be kept -covariates <- c("age_hhh", - "num_dependents", - "income_total_win", - "expend_food_yearly") - -# subset lwh -lwh_simp <- lwh[, covariates] - -``` - - -## Export tables to \LaTeX\ - -### Exercise 4 - 1. Create a vector called ``cov_labels`` containing the labels to the covariates, in the same order as in the ``covariates`` vector. - - 2. 
Now use the stargazer function as in the previous exercise: - + Set ``lwh_simp`` as the `x` argument this time - + Set the `covariate.labels` argument as the vector you just created - -## Export tables to \LaTeX\ - -\tiny - -``` {r, eval = T, results = "hide"} - - # Set labels - cov_labels <- c("Age of household head", "Number of dependents", - "Anual income (winsorized)", "Yearly food expediture") - - # Save table to latex - stargazer(lwh_simp, - covariate.labels = cov_labels, - #summary.stat = c("n", "mean", "sd", "min", "max"), - digits = 1, - out = file.path(rawOutput,"desc_table.tex")) - -``` - -\input{"../DataWork/Output/Raw/desc_table.tex"} - -# Descriptives tables - Create tables from scratch - -## Descriptives tables - Create tables from scratch - - -In R, it is relatively easy to construct any table you can think of by manipulating objects. To construct a table from scratch, we will use two functions: - - * `aggregate()` - Similar to collapse in Stata, it can compute statistics of a variable based on the values of other variable - * `reshape()` - Reshapes data sets from long to wide and vice-versa - -## Descriptives tables - Create tables from scratch - -`aggregate(X, by, FUN)`: - - * __x__: a data frame or column - * __by__: a list of grouping variables - * __FUN__: a function to compute statistics - -## Descriptives tables - Create tables from scratch - -\begin{block}{Exercise 5} - Use the \texttt{aggregate} function to create a data frame called \texttt{year\_inc\_tab} with the mean of the total income per year and treatment status. The syntax of the \texttt{aggregate} function is very similar to that of the \texttt{collapse} function in Stata. 
-\end{block} - -```{r, eval = T, results = "hide"} -# Aggregate income by year and treatment status -year_inc_tab <- - aggregate(x = lwh$income_total_win, # data.frame - by = list(year = lwh$year, # list - treatment = lwh$treatment_hh), - FUN = mean) # function -``` - -Note that the ``income_total_win`` variable is now named ``x`` in the ``year_inc_tab`` data frame - -## Descriptives tables - Create tables from scratch -\scriptsize - -```{r} -print(year_inc_tab) - -``` - -## Descriptives tables - Create tables from scratch - -`reshape(data, varying, idvar, timevar, direction)`: - - * __data__: a data frame - * __idvar__: the variables that identify the group in the wide data set - * __timevar__: the variable in long format that differentiates multiple records from the same group or individual - -## Descriptives tables - Create tables from scratch - -\begin{block}{Exercise 6} - Use the \texttt{reshape} function to make the \texttt{year\_inc\_tab} data frame wide per treatment status. -\end{block} - - -```{r, eval = T, results = "hide"} -# Aggregate income by year and treatment status -year_inc_tab <- reshape(year_inc_tab, - idvar = "treatment", - timevar = "year", - direction = "wide") -``` - -For comparison, here's how you'd do it in Stata: - -``reshape wide x, i(treatment) j(year)`` - - -## Descriptives tables - Create tables from scratch -\scriptsize - -```{r} -print(year_inc_tab) - -``` - -## Descriptives tables - Create tables from scratch - - -With a data frame as input, `stargazer` by default tries to summarize it. So, to export this table we must specify one additional argument: `summary = F`. - -### Exercise 7 -Print the ``year_inc_tab`` table you created in exercise 6 using ``stargazer``. If you want, you can also save it using the ``out`` option. 
- - -## Descriptives tables - Create tables from scratch - -\scriptsize - -```{r, results='asis'} -# Label variables -column_lab <- c("Treatment status", "2012", "2013", "2014", "2016", "2018") - -# Create table -stargazer(year_inc_tab, - summary = F, - # Some extra formatting: - covariate.labels = column_lab, - title = "Total income by treatment status and year", - header = F, - digits = 1, - rownames = F) -``` - - -# Export tables to Excel - -## Export tables to Excel - - -To export a table to Excel we'll use the `write.table()` function. It takes a data frame object as input and saves it as a ``.csv`` file - -`write.table()` is the most basic function, but there are many other functions that allow you to export formatted tables to Microsoft Excel, Word or PowerPoint. Here are some examples: - - * ReporteRs - * Flextable - * r2excel (only available on GitHub). - -## Export tables to Excel - -`write.table(x, file = "", sep = " ", row.names = TRUE)` - - * __x__: the object to be written - * __file__: where to save the table, i.e., the file path including the file name - * __sep__: the field separator of the csv, Excel's default is comma - * __row.names__: either a logical value indicating whether the row names of x are to be written along with x, or a character vector of row names to be written - * __col.names__: same as `row.names`, but for the column names - -## Export tables to Excel - -### Exercise 8 - -Use the `write.table()` function to save the `year_inc_tab` table you created in Exercise 6 into a csv file. - -1. Set `x` argument as *year_inc_tab*. -2. Set `row.names` as `FALSE` -3. Set `col.names` as a vector of labels -4. Set `file` as the folder path to your output folder plus a name for a file plus ".csv" -5. Set `sep` as `","`. - - Tips: - - * Make sure to save it in the *Raw Output* folder. 
You can you the function ``file.path`` to do it - * Use the help function to check syntax if needed - -## Export tables to Excel -\scriptsize - -```{r, include = T, eval = T} -write.table(year_inc_tab, - sep = ",", - row.names = F, - col.names = c("Treatment status", - "2012", "2013", "2014", "2016", "2018"), - file = file.path(rawOutput, "year_inc_tab.csv")) - -``` - -\begin{figure}[H] - \centering - \includegraphics[scale=.8]{img/year_inc_tab_csv.png} -\end{figure} - - -# Export regression table - -## Export regression table - -This is a session on *descriptive* analysis, so regressions are beyond its scope. But since you'll probably ask, here's how you run a regression and how you export a very simple regression table to \LaTeX\ using ``stargazer``: - -```{r} -# Run a Regression -reg1 <- lm(expend_food_yearly ~ - income_total_win + num_dependents, - data = lwh) - -``` - -## Export regression table -\scriptsize - - -```{r, eval = F} -# Export a regression table - -depvar_label <- "Yearly food expenditure (winsorized)" -covar_labels <- c("Total income (winsorized)", - "Number of dependents") - -stargazer(reg1, - title = "Regression table", - dep.var.labels = depvar_label, - covar_labels = covar_labels, - digits = 2, - header = F) - -``` - - - -## Export regression table -\scriptsize - -```{r, results ='asis', echo = F} -# Export a regression table -depvar_label <- "Yearly food expenditure (winsorized)" -covar_labels <- c("Total income (winsorized)", "Number of dependents") - -stargazer(reg1, - title = "Regression table", - dep.var.labels = depvar_label, - covariate.labels = covar_labels, - digits = 2, - header = F) -``` - -## Export regression table -\scriptsize - -```{r, eval = F} - - -# Regression 1 -reg1 <- lm(expend_food_yearly ~ - income_total_win + num_dependents, - data = lwh) - -# Reg with year FE -reg2 <- lm(expend_food_yearly ~ - income_total_win + num_dependents + factor(year), - data = lwh) - -# Reg with year and site FE -reg3 <- lm(expend_food_yearly 
~ - income_total_win + num_dependents + factor(year) + factor(site_code), - data = lwh) - -``` - -## Export regression table -\scriptsize - -```{r, eval = F} -# Labels -depvar_label <- "Yearly food expenditure (winsorized)" -covar_labels <- c("Total income (winsorized)", "Number of dependents") - -# Table -stargazer(reg1, - reg2, - reg3, - font.size = "tiny", - title = "Regression table", - keep = c("ncome_total_win","num_dependents"), - dep.var.labels = depvar_label, - covariate.labels = covar_labels, - add.lines = list(c("Year FE", "No", "Yes", "Yes"), - c("Site FE", "No", "No", "Yes")), - omit.stat = c("ser"), - digits = 2, - header = F) -``` - -## Export regression table -\scriptsize - -```{r, results ='asis', echo = F} - - -# Run a Regression -reg1 <- lm(expend_food_yearly ~ - income_total_win + num_dependents, - data = lwh) - -# Run a Regression -reg2 <- lm(expend_food_yearly ~ - income_total_win + num_dependents + factor(year), - data = lwh) - -# Run a Regression -reg3 <- lm(expend_food_yearly ~ - income_total_win + num_dependents + factor(year) + factor(site_code), - data = lwh) - -depvar_label <- "Yearly food expenditure (winsorized)" -covar_labels <- c("Total income (winsorized)", "Number of dependents") - -stargazer(reg1, - reg2, - reg3, - font.size = "tiny", - title = "Regression table", - keep = c("ncome_total_win","num_dependents"), - dep.var.labels = depvar_label, - covariate.labels = covar_labels, - add.lines = list(c("Year FE", "No", "Yes", "Yes"), - c("Site FE", "No", "No", "Yes")), - digits = 2, - omit.stat = c("ser"), - header = F) -``` - -# References and recommendations - -## References and recommendations - - - * Johns Hopkins Exploratory Data Analysis at Coursera: - https://www.coursera.org/learn/exploratory-data-analysis - - * Udacity's Data Analysis with R: - https://www.udacity.com/course/data-analysis-with-r--ud651 - - * Jake Russ stargazer cheat sheet: - https://www.jakeruss.com/cheatsheets/stargazer/ - -## References and 
recommendations - - Since we talked about \LaTeX\ so much... - - * DIME \LaTeX\ templates and trainings: - https://github.com/worldbank/DIME-LaTeX-Templates - - * All you need to know about \LaTeX\: - https://en.wikibooks.org/wiki/LaTeX diff --git a/Presentations/Lab 4 - Descriptive analysis.pdf b/Presentations/Lab 4 - Descriptive analysis.pdf deleted file mode 100644 index 6e2727a..0000000 Binary files a/Presentations/Lab 4 - Descriptive analysis.pdf and /dev/null differ diff --git a/Presentations/Lab 5 - Data visualization.pdf b/Presentations/Lab 5 - Data visualization.pdf deleted file mode 100644 index 5eab74e..0000000 Binary files a/Presentations/Lab 5 - Data visualization.pdf and /dev/null differ diff --git a/Presentations/Lab 5 - Spatial Data.Rmd b/Presentations/Lab 5 - Spatial Data.Rmd new file mode 100644 index 0000000..d800b32 --- /dev/null +++ b/Presentations/Lab 5 - Spatial Data.Rmd @@ -0,0 +1,1207 @@ +--- +title: "Spatial Data in R" +subtitle: "R for Stata Users" +date: "November-December 2018" +author: "Luiza Andrade, Leonardo Viotti & Rob Marty " +output: + beamer_presentation: + #theme: "Pittsburgh" + theme: "Madrid" + colortheme: "whale" + fonttheme: "default" + slide_level: 2 + toc: true + includes: + in_header: header.tex +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +```{r, echo = F, eval=T} +# File paths + +if (Sys.getenv("USERNAME") == "luiza"){ + projectFolder <- "C:/Users/luiza/Documents/GitHub/dime-r-training" + +} + +if (Sys.getenv("USERNAME") == "WB501238"){ + projectFolder <- "C:/Users/WB501238/Documents/GitHub/dime-r-training" + +} + +if (Sys.getenv("USERNAME") == "Leonardo"){ + projectFolder <- "C:/Users/Leonardo/Documents/GitHub/dime-r-training" + +} + +if (Sys.getenv("USERNAME") == "WB519128"){ + projectFolder <- file.path("C:/Users/WB519128/Documents/GitHub/dime-r-training") +} + +library(tidyverse, verbose = F) +library(readstata13) +library(data.table) + +# File paths +dataWorkFolder <- 
file.path(projectFolder,"DataWork") + +Data <- file.path(dataWorkFolder,"DataSets") +finalData <- file.path(Data,"Final") +rawData <- file.path(Data,"Raw") + +``` + +## Introduction + +Before we start, let's make sure we're all set: + + 1. Start a fresh session. + 2. Make sure the `tidyverse` package is listed in the `packages` vector of your Master script. If it's not installed, install it. + 3. Run the Master script to load all the necessary packages and set file paths. + 4. Remember to disable the `PACKAGES` switch in the Master script if you already installed the necessary packages. This will save you a lot of time. + + +## Introduction + +Here's a shortcut if you missed the last session: + +\scriptsize + +```{r, eval= F} + # Install packages + install.packages(c("rgdal", "broom", "ggplot2", + "ggmap", "raster", "leaflet", + "rgeos")) + + # Load packages + library(tidyverse) + library(rworldmap) + library(rgdal) + library(ggmap) + library(ggplot2) + library(broom) + library(ggrepel) + + # Set folder paths + projectFolder <- "YOUR/FOLDER/PATH" + finalData <- file.path(projectFolder, + "DataWork", "DataSets", "Final") + +``` + +# GIS Overview + +## GIS Overview +One of the main GIS data types is vector data. Vectors (also called shapefiles) consist of points, lines and polygons. These shapes are attached to a dataframe, where each row corresponds to a different spatial element. + +```{r fig.align="center", echo=F} +knitr::include_graphics("img/Attribute_Table.png", dpi = 400) +``` + +## GIS Data: Raster Data +The second main GIS data type is raster data. Rasters are spatially-referenced grids, where each grid has a value associated with it. + +```{r fig.align="center", echo=F} +knitr::include_graphics("img/raster_r_example.png", dpi = 400) +``` + +## Coordinate Reference Systems + +__Coordinate reference systems__ map pairs of numbers to a location. 
+ +* __Geographic Coordinate Systems__ live on a sphere; here, the units are in decimal degrees + + + Using the WGS84 coordinate system the World Bank MC building is located at 38.89 degrees latitude and -77.04 degrees longitude. + +* __Projected Coordinate Systems__ project the earth onto a flat surface. This [distorts](http://geoawesomeness.com/5-tools-will-let-master-map-projections/) the earth in some way (shape, area, distance or direction) + + + Using to the World Mercator projection, the World Bank is located 4680364.64 north and -8576320.73 east. + +```{r fig.align="center", echo=F} +knitr::include_graphics("img/coordinate_systems.jpg", dpi = 220) +``` + +## Coordinate Reference Systems + +To see why projections matter, watch this video from the West Wing: click [here](https://www.youtube.com/watch?v=vVX-PrBRtTY). + + +```{r, eval=T, echo=F} +setwd(finalData) +``` + +```{r, eval=F, echo=F} +# webshot::install_phantomjs() +library(rworldmap) +library(rgdal) + +finalData <- "C:/Users/wb521633/Documents/GitHub/dime-r-training/DataWork/DataSets/Final" +whr <- read.csv(file.path(finalData,"whr_panel.csv"), header = T) +whr <- subset(whr, year == 2015) +worldmap <- getMap(resolution="low") +worldmap <- subset(worldmap, select=c(ADMIN, REGION, POP_EST, GDP_MD_EST)) + +whr$country[!whr$country %in% worldmap$ADMIN] + +worldmap$ADMIN <- as.character(worldmap$ADMIN) +worldmap$ADMIN[worldmap$ADMIN %in% "United States of America"] <- "United States" +worldmap$ADMIN[worldmap$ADMIN %in% "Northern Cyprus"] <- "North Cyprus" +worldmap$ADMIN[worldmap$ADMIN %in% "Hong Kong S.A.R."] <- "Hong Kong" +worldmap$ADMIN[worldmap$ADMIN %in% "Republic of Serbia"] <- "Serbia" +worldmap$ADMIN[worldmap$ADMIN %in% "Somaliland"] <- "Somaliland Region" +worldmap$ADMIN[worldmap$ADMIN %in% "West Bank"] <- "Palestinian Territories" +worldmap$ADMIN[worldmap$ADMIN %in% "Democratic Republic of the Congo"] <- "Congo (Kinshasa)" +worldmap$ADMIN[worldmap$ADMIN %in% "Republic of the Congo"] <- "Congo 
(Brazzaville)" +worldmap$ADMIN[worldmap$ADMIN %in% "United Republic of Tanzania"] <- "Tanzania" + +# Merge in Happiness Data +whr <- read.csv(file.path(finalData,"whr_panel.csv"), + header = T) +whr <- subset(whr, year == 2015) + +worldmap <- merge(worldmap, whr, by.x="ADMIN", by.y="country") + +writeOGR(obj=worldmap, dsn=finalData, layer="worldmap", driver="ESRI Shapefile",overwrite_layer=T) +``` + +# Load and Plot Spatial Object + +## Load a Shapefile +We'll load the map data using the readOGR function from the rgdal package. The argument for dsn is the folder where the shapefile is and layer is the name of the shapefile. + +The code also shows alternative ways of loading the shapefile, in case the first way doesn't work. + +\scriptsize +```{r, eval=F, warning=FALSE, message=F, results='hide'} +library(rgdal) +worldmap <- readOGR(dsn=finalData, layer="worldmap") + +# POTENTIAL ISSUE 1: Can't find file. +# In this case, try setting the working directory +# to the folder where the shapefile is +# setwd(finalData) +# worldmap <- readOGR(dsn=".", layer="worldmap") + +# POTENTIAL ISSUE 2: Can't find gdal +# In this case, use the following code +# library(maptools) +# worldmap <- readShapeSpatial(file.path(finalData,"worldmap.shp")) +# prj <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0" +# crs(worldmap) <- CRS(prj) +``` + +```{r, echo=F, warning=FALSE, message=F, results='hide'} +library(rgdal) + +setwd(finalData) +worldmap <- readOGR(dsn=".", layer="worldmap") +``` + +## Plot Shapefile +```{r, eval=T} +plot(worldmap) +``` + +## Plot Shapefile +We can subset the spatialdataframe just like any other dataframe. +```{r, eval=T} +plot(worldmap[1:42,]) +``` + +## Spatial Data Frame +Here, we've loaded a SpatialPolygonsDataFrame. (There are equivalent object types for points and lines). A spatial dataframe is comprised of a list that includes different parts of the shapefile. We access these parts using the @ symbol.
+ +\scriptsize + +```{r, eval=T} +# Coordinate Reference System +worldmap@proj4string +``` + +```{r, eval=T} +# Dataframe +head(worldmap@data, 3) +``` + +# Mapping with ggplot: Polygons + +## Making a Map with ggplot +\scriptsize + +Basics of mapping with ggplot +\begin{enumerate} +\item ggplot has methods for mapping spatial objects. We use geom\_polygon for polygons and geom\_path for lines. +\item In aes() for spatial objects, we always put x=long, y=lat and group=group, even though our spatial object may not have these variables. We do this because ggplot converts the spatial object into a dataframe that has these variables. ggplot can't directly use shapefiles, but converts them into a dataframe that it can interpret. +\end{enumerate} + +```{r, eval=F} +library(ggplot2) +ggplot() + + geom_polygon(data=worldmap, aes(x=long, y=lat, group=group)) +``` + +## Making a Map with ggplot +```{r, echo=F, warning=FALSE, message=F, results='hide'} +library(ggplot2) +ggplot() + + geom_polygon(data=worldmap, aes(x=long, + y=lat, + group=group)) +``` + +## Making a Map with ggplot +Now, let's color the map by a variable in the dataset. +\begin{itemize} +\item We may be tempted to use the below code, but it won't work. +\item When ggplot converts the spatial object into a dataframe, it doesn't keep any of our variables -- it only keeps spatial information. +\end{itemize} + +```{r, eval=F} +# THIS CODE WILL NOT WORK :'( +ggplot() + + geom_polygon(data=worldmap, aes(x = long, y = lat, + group = group, + fill = hppy_sc)) +``` + +## Converting Spatial Object into Dataframe for ggplot +To use our variables when mapping a spatial object with ggplot, we need to convert the spatial object into a dataframe and add our variables ourselves. + +\begin{itemize} +\item We use the tidy function from the broom package to convert our shapefile into a dataframe. +\item The \texttt{tidy()} function will create an "id" variable where values are the row.names from the shapefile.
So we can later merge data from the shapefile to the new dataframe, we create an id variable in the shapefile that are the row.names. +\end{itemize} +\scriptsize +```{r, eval=TRUE, warning=FALSE, message=F, results='hide'} +library(broom) +worldmap$id <- row.names(worldmap) +worldmap_tidy <- tidy(worldmap) +``` + +```{r} +head(worldmap_tidy, 3) +``` + +## Merge data back to object +Now, merge the "tidy" dataframe with the data from the shapefile by the id variable. Now we have a dataframe that ggplot can understand with our data! + +\scriptsize + +```{r} +worldmap_tidy <- merge(worldmap_tidy, worldmap@data, by="id") +head(worldmap_tidy, 3) +``` + +## Making a Map with ggplot +Now, using this tidy dataframe, we can make a map. +```{r, eval=F, warning=FALSE, message=F, results='hide'} +library(ggplot2) + +ggplot() + + geom_polygon(data=worldmap_tidy, aes(x=long, y=lat, + group=group, + fill = hppy_sc)) + + theme_void() +``` + +## Making a Map with ggplot +```{r, echo=F, warning=FALSE, message=F, results='hide'} +library(ggplot2) + +ggplot() + + geom_polygon(data=worldmap_tidy, aes(x=long, y=lat, + group=group, + fill = hppy_sc)) + + theme_void() +``` + +## Making a Prettier Map with ggplot +```{r, eval=F, warning=FALSE, message=F, results='hide'} +library(ggplot2) + +ggplot() + + geom_polygon(data=worldmap_tidy, aes(x=long, y=lat, + group=group, + fill = hppy_sc)) + + theme_void() + + coord_quickmap() + # Removes distortions + labs(fill="Happiness\nScore") + + scale_fill_gradient(low = "firebrick4", + high = "chartreuse2") +``` + +## Making a Prettier Map with ggplot +```{r, echo=F, warning=FALSE, message=F, results='hide'} +library(ggplot2) + +ggplot() + + geom_polygon(data=worldmap_tidy, aes(x=long, y=lat, + group=group, + fill = hppy_sc)) + + theme_void() + + coord_quickmap() + # Removes distortions + labs(fill="Happiness\nScore") + + scale_fill_gradient(low = "firebrick4", + high = "chartreuse2") +``` + +## Map of Africa + +\begin{block}{Exercise 1} +Make a map 
just of Africa. +Hint: Subset worldmap\_tidy using the \textit{REGION} variable. You can use the \texttt{subset()} function to subset a dataframe. Forget how the function works? Type \texttt{help(subset)} or \texttt{?subset} for the help file. +\end{block} + +## Map of Africa +```{r, eval=F, warning=FALSE, message=F, results='hide'} +africa_tidy <- subset(worldmap_tidy, REGION == "Africa") + +ggplot() + + geom_polygon(data=africa_tidy, aes(x=long, y=lat, + group=group, + fill = hppy_sc)) + + theme_void() + + coord_quickmap() + # Removes distortions + labs(fill="Happiness\nScore") + + scale_fill_gradient(low = "firebrick4", + high = "chartreuse2") +``` + +## Map of Africa +```{r, echo=F, warning=FALSE, message=F, results='hide'} +africa_tidy <- worldmap_tidy[worldmap_tidy$REGION == "Africa",] + +ggplot() + + geom_polygon(data=africa_tidy, aes(x=long, y=lat, + group=group, + fill = hppy_sc)) + + theme_void() + + coord_quickmap() + # Removes distortions + labs(fill="Happiness\nScore") + + scale_fill_gradient(low = "firebrick4", + high = "chartreuse2") +``` + +# Mapping with ggplot: Points + +## Add Points to Map: Does the WB allocate projects to happy or sad countries? +We have data on the locations of [World Bank projects from AidData](https://www.aiddata.org/data/world-bank-geocoded-research-release-level-1-v1-4-2). Let's plot projects on our map to see if the World Bank tends to allocate projects to happy or sad countries. +\scriptsize +```{r} +wb_projects <- read.csv(file.path(finalData, "wb_projects.csv")) +names(wb_projects) +``` + +## Add Points to Map +For ggplot, we don't need to convert the dataframe to a spatial object. ggplot wants a dataframe, and that's what we have. + +\begin{block}{Exercise 2} +Add World Bank project locations to the previous map.
+ +Hints: +\begin{enumerate} +\item Use geom\_point() to add points data +\item Set data=wb\_projects +\item In aes(), set x=longitude and y=latitude +\end{enumerate} +\end{block} + +## Add Points to Map +```{r, eval=F} +ggplot() + + geom_polygon(data=africa_tidy, aes(x=long, y=lat, + group=group, + fill = hppy_sc)) + + geom_point(data=wb_projects, aes(x=longitude, + y=latitude), + size=.1) + + theme_void() + + coord_quickmap() + # Removes distortions + labs(fill="Happiness\nScore") + + scale_fill_gradient(low = "firebrick4", + high = "chartreuse2") +``` + +## Add Points to Map +```{r, echo=F, warning=FALSE, message=F, results='hide'} +ggplot() + + geom_polygon(data=africa_tidy, aes(x=long, y=lat, + group=group, + fill = hppy_sc)) + + geom_point(data=wb_projects, aes(x=longitude, + y=latitude), + size=.1) + + theme_void() + + coord_quickmap() + # Removes distortions + labs(fill="Happiness\nScore") + + scale_fill_gradient(low = "firebrick4", + high = "chartreuse2") +``` + +# Mapping with ggplot: Lines + +## Add trunk roads to map + +\begin{block}{Exercise 3} +Let's add roads to our map. In the final data folder, there's a file called troads.shp, which is a shapefile of trunk roads in Africa. + +Hints: +\begin{enumerate} +\item Load the data using readOGR. readOGR has two arguments: dsn (the file path to the shapefile) and layer (the name of the shapefile) +\item To add roads to the map, do we need to use the tidy function to convert the spatial object to a dataframe? +\item In ggplot, geom\_path is used to handle spatial lines. Remember, in aesthetics (aes) we need to add x=long, y=lat, and group=group. +\end{enumerate} +\end{block} + +## Add trunk roads to map +\scriptsize +Let's load the data and add it to our map.
+```{r, eval=F} +trunk_roads <- readOGR(dsn=finalData, layer="troads") + +ggplot() + + geom_polygon(data=africa_tidy, aes(x=long, y=lat, + group=group, + fill = hppy_sc)) + + geom_path(data=trunk_roads, aes(x=long, y=lat, + group=group)) + + geom_point(data=wb_projects, aes(x=longitude, + y=latitude), + size=.1) + + theme_void() + + coord_quickmap() + # Removes distortions + labs(fill="Happiness\nScore") + + scale_fill_gradient(low = "firebrick4", + high = "chartreuse2") +``` + +## Add trunk roads to map +\scriptsize +We see the roads, but where did the rest of the map go? +```{r, echo=F, warning=FALSE, message=F, results='hide'} +setwd(finalData) +trunk_roads <- readOGR(dsn=".", layer="troads") + +ggplot() + + geom_polygon(data=africa_tidy, aes(x=long, y=lat, + group=group, + fill = hppy_sc)) + + geom_path(data=trunk_roads, aes(x=long, y=lat, + group=group)) + + geom_point(data=wb_projects, aes(x=longitude, + y=latitude), + size=.1) + + theme_void() + + coord_quickmap() + # Removes distortions + labs(fill="Happiness\nScore") + + scale_fill_gradient(low = "firebrick4", + high = "chartreuse2") +``` + +## Check Projections: They Don't Match +Let's check the projections of the worldmap data and our trunk roads. +\scriptsize +```{r} +# World Map Projection +worldmap@proj4string +``` + +```{r} +# Trunk Roads Projection +trunk_roads@proj4string +``` + +## Reproject Data +We can use the \texttt{spTransform()} function to reproject the data. + +The below code shows two different ways of specifying the same projection. +\begin{enumerate} +\item The first way writes out the full projection, explicitly defining parameters such as proj, datum, etc. +\item The second way uses an epsg code. EPSG codes reference commonly used projections and provide a shorthand to referencing a projection. 
+\end{enumerate} + +\scriptsize +```{r, eval=F} +# Way 1 +prj <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0" +trunk_roads_reprj <- spTransform(trunk_roads, CRS(prj)) + +# Way 2 +trunk_roads_reprj <- spTransform(trunk_roads, CRS("+init=epsg:4326")) +``` + +```{r, echo=F, warning=FALSE, message=F, results='hide'} +trunk_roads_reprj <- spTransform(trunk_roads, CRS("+init=epsg:4326")) +``` + +## Add trunk roads to map +Now, plot the map using the reprojected version of the trunk roads. +```{r, eval=F, warning=FALSE, message=F, results='hide'} +ggplot() + + geom_polygon(data=africa_tidy, aes(x=long, y=lat, + group=group, + fill = hppy_sc)) + + geom_path(data=trunk_roads_reprj, aes(x=long, y=lat, + group=group)) + + geom_point(data=wb_projects, aes(x=longitude, + y=latitude), + size=.1) + + theme_void() + + coord_quickmap() + # Removes distortions + labs(fill="Happiness\nScore") + + scale_fill_gradient(low = "firebrick4", + high = "chartreuse2") +``` + +## Add trunk roads to map +```{r, echo=F, warning=FALSE, message=F, results='hide'} +ggplot() + + geom_polygon(data=africa_tidy, aes(x=long, y=lat, + group=group, + fill = hppy_sc)) + + geom_path(data=trunk_roads_reprj, aes(x=long, y=lat, + group=group)) + + geom_point(data=wb_projects, aes(x=longitude, + y=latitude), + size=.1) + + theme_void() + + coord_quickmap() + # Removes distortions + labs(fill="Happiness\nScore") + + scale_fill_gradient(low = "firebrick4", + high = "chartreuse2") +``` + +## Add Roads to Legend +\scriptsize +ggplot only puts objects in the legend where some element is in the aesthetics. To add "trunk roads" in the legend, add a color element in the aesthetics then manually define the color. 
+```{r, eval=F} +ggplot() + + geom_polygon(data=africa_tidy, aes(x=long, y=lat, + group=group, + fill = hppy_sc)) + + geom_point(data=wb_projects, aes(x=longitude, + y=latitude), + size=.1) + + geom_path(data=trunk_roads_reprj, aes(x=long, y=lat, + group=group, + color="Trunk Roads")) + + theme_void() + + coord_quickmap() + # Removes distortions + labs(fill="Happiness\nScore", + color="") + # REMOVE TITLE ABOVE TRUNK ROAD IN LEGEND + scale_fill_gradient(low = "firebrick4", + high = "chartreuse2") + + scale_color_manual(values=c("blue")) # MANUALLY DEFINE COLOR HERE +``` + +## Add Roads to Legend +```{r, echo=F, warning=FALSE, message=F, results='hide'} +ggplot() + + geom_polygon(data=africa_tidy, aes(x=long, y=lat, + group=group, + fill = hppy_sc)) + + geom_point(data=wb_projects, aes(x=longitude, + y=latitude), + size=.1) + + geom_path(data=trunk_roads_reprj, aes(x=long, y=lat, + group=group, + color="Trunk Roads")) + + theme_void() + + coord_quickmap() + # Removes distortions + labs(fill="Happiness\nScore", + color="") + + scale_fill_gradient(low = "firebrick4", + high = "chartreuse2") + + scale_color_manual(values=c("blue")) +``` + +# Basemap + +## Basemap +We can also plot our spatial data on basemaps. One of the packages that enables us to do this is the \texttt{ggmap} package. In the function, \textit{maptype} changes the type of basemap we use. Check the help file to see what other basemaps can be used: help(get\_stamenmap).
+ +```{r, eval=F, warning=FALSE, message=F, results='hide'} +library(ggmap) + +nairobi <- c(left = 36.66, + bottom = -1.44, + right = 37.10, + top = -1.15) +nairobi_map <- get_stamenmap(nairobi, + zoom = 12, + maptype = "toner-lite") + +ggmap(nairobi_map) +``` + +## Basemap +```{r, echo=F, warning=FALSE, message=F, results='hide'} +library(ggmap) + +nairobi <- c(left = 36.66, bottom = -1.44, right = 37.10, top = -1.15) +nairobi_map <- get_stamenmap(nairobi, zoom = 12, maptype = "toner-lite") + +ggmap(nairobi_map) +``` + +## Basemap + +We can add layers to a basemap. The code is the same as ggplot, except here we start the code with ggmap() instead of ggplot(). + +\begin{block}{Exercise 4} +Plot projects in Nairobi on top of a basemap. As a challenge, make the size AND the color of the points correspond to aid commitments. + +Hint: Set size and color to "commitments" in the aesthetics. +\end{block} + +```{r, eval=F} +# Code where size/color does not correspond to commitments +kenya_projects <- subset(wb_projects, recipients == "Kenya") + +ggmap(nairobi_map) + + geom_point(data=kenya_projects, aes(x=longitude, + y=latitude), + color="orange") + + theme_void() +``` + +## Basemap + +Here's the code to make point size and color correspond with commitments. To combine size and color to one legend item, add "guide=legend" to both scale_color_continuous and scale_size_continuous.
+ +```{r, eval=F} +ggmap(nairobi_map) + + geom_point(data=kenya_projects, aes(x=longitude, + y=latitude, + size=commitments, + color=commitments)) + + scale_color_continuous(guide = "legend", + low="orange", high="green2") + + scale_size_continuous(guide = "legend") + + labs(size = "Aid\nCommitment", + color = "Aid\nCommitment") + + theme_void() +``` + +## Basemap +```{r, echo=F, warning=FALSE, message=F, results='hide'} +kenya_projects <- subset(wb_projects, recipients == "Kenya") + +ggmap(nairobi_map) + + geom_point(data=kenya_projects, aes(x=longitude, + y=latitude, + size=commitments, + color=commitments)) + + scale_color_continuous(guide = "legend", + low="orange", high="green2") + + scale_size_continuous(guide = "legend") + + labs(size = "Aid\nCommitment", + color = "Aid\nCommitment") + + theme_void() +``` + +# Interactive Map + +## Convert dataframe to spatialdataframe + +So far we've been using points as a dataframe. However, we can also define points as a spatial points dataframe. Here, we explicitly define which variables are the latitude and longitude and can define a projection. + +We don't necessarily need to do this for interactive maps, but it'll make our code simpler later on. + +\begin{enumerate} +\item We first define which variables are the latitude and longitude variables using the \texttt{coordinates()} function. +\item Then, we define our coordinate reference system using the crs function. +\end{enumerate} + +\scriptsize + +```{r, warning=FALSE, message=F, results='hide'} +library(raster) + +# Create Spatial Points Dataframe +# ~longitude+latitude is a formula, where ~ tells R to reference those +# variables in the associated dataset; in this case, kenya_projects +coordinates(kenya_projects) <- ~longitude+latitude + +# Define Projection +crs(kenya_projects) <- CRS("+init=epsg:4326") +``` + +## Convert dataframe to spatialdataframe +We can directly plot spatialdataframes.
( [pch](http://www.sthda.com/english/wiki/r-plot-pch-symbols-the-different-point-shapes-available-in-r) is symbol and cex is size). +```{r} +plot(kenya_projects, pch=16, cex=.5) +``` + +## Interactive Map +We'll use leaflet to make an interactive map. Leaflet is a javascript library for interactive maps. The leaflet R package allows one to interact with this library in R. + +Leaflet works similarly to ggplot where you sequentially define different elements. However, instead of using + between elements, we use a pipe: %>% + +```{r, eval=F} +library(leaflet) + +leaflet() %>% + addCircles(data=kenya_projects) %>% + addTiles() +``` + +## Interactive Map +```{r, echo=F, warning=FALSE, message=F, results='hide'} +library(leaflet) + +leaflet() %>% + addCircles(data=kenya_projects) %>% + addProviderTiles(providers$OpenStreetMap) + +# https://leaflet-extras.github.io/leaflet-providers/preview/ +``` + +## Better Interactive Map +Now, let's make text appear when we click a point, change the radius, change the basemap ([click here for options](https://leaflet-extras.github.io/leaflet-providers/preview/)) and change the color. +```{r, eval=F} +leaflet() %>% + addCircles(data=kenya_projects, + popup = ~project_title, + radius = ~sqrt(commitments)*5, + weight = 1, + color = "orange") %>% + addProviderTiles(providers$Stamen.Terrain) +``` + +## Better Interactive Map +```{r, echo=F, warning=FALSE, message=F, results='hide'} +leaflet() %>% + addCircles(data=kenya_projects, + popup = ~project_title, + radius = ~sqrt(commitments)*5, + weight = 1, + color = "orange") %>% + addProviderTiles(providers$Stamen.Terrain) +``` + +## Two layers on map +\begin{block}{Exercise 5} +Add two layers to the interactive map, differentiating between two types of projects: "Transport and storage" and "Water supply and sanitation." Make sure to color the layers differently. + +Hints: + +\begin{enumerate} +\item Use the "ad\_sector\_names" variable and the subset function to subset kenya\_projects. 
+\item To add multiple layers, just add another addCircles() layer. +\end{enumerate} +\end{block} + +## Two layers on map +\scriptsize +```{r, eval=F, warning=FALSE, message=F, results='hide'} +leaflet() %>% + addCircles(data=subset(kenya_projects, + ad_sector_names == "Transport and storage"), + popup = ~project_title, + radius = ~sqrt(commitments)*5, + weight = 2.5, + color = "orange") %>% + addCircles(data=subset(kenya_projects, + ad_sector_names == "Water supply and sanitation"), + popup = ~project_title, + radius = ~sqrt(commitments)*5, + weight = 2.5, + color = "blue") %>% + addProviderTiles(providers$Stamen.Terrain) +``` + +## Two layers on map + +```{r, echo=F, warning=FALSE, message=F, results='hide'} +leaflet() %>% + addCircles(data=subset(kenya_projects, + ad_sector_names == "Transport and storage"), + popup = ~project_title, + radius = ~sqrt(commitments)*5, + weight = 2.5, + color = "orange") %>% + addCircles(data=subset(kenya_projects, + ad_sector_names == "Water supply and sanitation"), + popup = ~project_title, + radius = ~sqrt(commitments)*5, + weight = 2.5, + color = "blue") %>% + addProviderTiles(providers$Stamen.Terrain) +``` + +## Add Layers and Buttons to Interactive Map +\scriptsize +We can add buttons that cause layers to appear or disappear in two steps: + +\begin{enumerate} +\item When defining the layer (e.g., in addCircles), add "group=" and give the group a name. +\item Add "addLayersControl", and as a parameter add "overlayGroups" with a list of the above groups.
+\end{enumerate} + +```{r, eval=F, warning=FALSE, message=F, results='hide'} +leaflet() %>% + addCircles(data=subset(kenya_projects, + ad_sector_names == "Transport and storage"), + popup = ~project_title, + radius = ~sqrt(commitments)*5, + weight = 2.5, + color = "orange", + group = "Transport") %>% + addCircles(data=subset(kenya_projects, + ad_sector_names == "Water supply and sanitation"), + popup = ~project_title, + radius = ~sqrt(commitments)*5, + weight = 2.5, + color = "blue", + group = "Water") %>% + addProviderTiles(providers$Stamen.Terrain) %>% + addLayersControl(overlayGroups = c("Transport", "Water"), + options = layersControlOptions(collapsed = FALSE)) +``` + +## Add Layers and Buttons to Interactive Map +```{r, echo=F, warning=FALSE, message=F, results='hide'} +leaflet() %>% + addCircles(data=subset(kenya_projects, + ad_sector_names == "Transport and storage"), + popup = ~project_title, + radius = ~sqrt(commitments)*5, + weight = 2.5, + color = "orange", + group = "Transport") %>% + addCircles(data=subset(kenya_projects, + ad_sector_names == "Water supply and sanitation"), + popup = ~project_title, + radius = ~sqrt(commitments)*5, + weight = 2.5, + color = "blue", + group = "Water") %>% + addProviderTiles(providers$Stamen.Terrain) %>% + addLayersControl( + overlayGroups = c("Transport", "Water"), + options = layersControlOptions(collapsed = FALSE) +) +``` + +# Spatial Operations + +## Grabbing Administrative Data +We can grab administrative layers from [GADM (Database of Global Administrative Areas)](https://gadm.org/) using the getData function from the raster package. 
+```{r, eval=F} +library(raster) + +ken_adm1 <- getData('GADM', country='KEN', level=1) +plot(ken_adm1) +``` + +## Grabbing Administrative Data +```{r, echo=F, warning=FALSE, message=F, results='hide'} +library(raster) + +setwd(finalData) +ken_adm1 <- getData('GADM', country='KEN', level=1) +plot(ken_adm1) +``` + +## Cropping +Let's trim an object based on the spatial extent of another object. Let's say we only wanted trunk roads in Kenya. We can use gIntersection() from the \textit{rgeos} package. The function converts the "SpatialLinesDataFrame" to a "SpatialLines." We'll plot this later, and ggplot only works with "SpatialLinesDataFrame" -- so to convert the object back, we just add a variable -- here, id. +```{r, eval=F, warning=FALSE, message=F, results='hide'} +library(rgeos) +trunk_roads_kenya <- gIntersection(trunk_roads_reprj, + ken_adm1) +trunk_roads_kenya$id <- 1:length(trunk_roads_kenya) + +plot(ken_adm1) +plot(trunk_roads_kenya, add=T, col="red") +``` + +## Cropping +```{r, echo=F, warning=FALSE, message=F, results='hide'} +library(rgeos) +trunk_roads_kenya <- gIntersection(trunk_roads_reprj, ken_adm1) +trunk_roads_kenya$id <- 1:length(trunk_roads_kenya) + +plot(ken_adm1) +plot(trunk_roads_kenya, add=T, col="red") +``` + +## Distance Calculation +Let's calculate the distance of projects to trunk roads.
gDistance calculates the distance in the units of the projection (here, decimal degrees) +\scriptsize +```{r, warning=FALSE, message=F, results='hide', eval=T} +library(rgeos) +# rows = roads; columns = projects +distance_matrix <- gDistance(kenya_projects, trunk_roads_kenya, byid=T) + +# Min values of columns (projects) +kenya_projects$dist_road <- apply(distance_matrix, 2, min) +``` + +## Homework: Map of World Bank Projects Near Roads +\scriptsize +\begin{block}{Exercise 6} +Now that we've calculated the distance, make a map that includes the following characteristics: + +\begin{enumerate} +\item Includes World Bank projects in Kenya that are within 10km of a trunk road (about 0.1 decimal degrees) +\item Includes trunk roads +\item Includes a legend item for both the projects and the roads +\item The size of the WB project points should correspond to aid commitments. +\item Includes the border of Kenya around the map (hint: use the getData() function from the raster package to get GADM data for ADM 0). +\end{enumerate} +\end{block} + +A tip: + +ggplot() can't handle spatial points dataframes, so we have to convert back to a dataframe. But tidy doesn't work with spatial points dataframes either. + +So we: (1) grab the data part of the spatial object, and (2) add coordinates back in (when converting a dataframe to a spatial dataframe, we lose the coordinates as variables) + +```{r} +kenya_projects_df <- kenya_projects@data +kenya_projects_df$long <- kenya_projects@coords[,1] +kenya_projects_df$lat <- kenya_projects@coords[,2] +``` + +## Homework : Map of World Bank Projects Near Roads +\scriptsize +Code in Appendix!
+```{r, echo=F, warning=FALSE, message=F, results='hide'} +kenya_adm0 <- getData('GADM', country='KEN', level=0) + +ggplot() + + geom_polygon(data=kenya_adm0, aes(x=long, y=lat, group=group), + fill="ivory", color="black") + + geom_point(data=subset(kenya_projects_df, dist_road < 0.1), + aes(x=long, y=lat, + size=commitments/1000000), + fill="orange", + color="black", + pch=21) + # Adds border around points + geom_path(data=trunk_roads_kenya, aes(x=long, y=lat, + group=group, + color="Trunk Road")) + + scale_color_manual(values="brown") + + labs(color="", size="World Bank\nCommitments\n(Millions)", + title = "World Bank Projects Near Trunk Roads") + + coord_quickmap() + + theme_void() + + theme(plot.title = element_text(hjust = 0.5)) # Center Title +``` + +# Appendix + +## Homework Solution +\scriptsize +```{r, eval=F} +kenya_adm0 <- getData('GADM', country='KEN', level=0) + +ggplot() + + geom_polygon(data=kenya_adm0, aes(x=long, y=lat, group=group), + fill="ivory", color="black") + + geom_point(data=subset(kenya_projects_df, dist_road < 0.1), + aes(x=long, y=lat, + size=commitments/1000000), + pch=21, # Adds border around points + fill="orange", + color="black") + + geom_path(data=trunk_roads_kenya, aes(x=long, y=lat, + group=group, + color="Trunk Road")) + + scale_color_manual(values="brown") + + labs(color="", size="World Bank\nCommitments\n(Millions)", + title = "World Bank Projects Near Trunk Roads") + + coord_quickmap() + + theme_void() + + theme(plot.title = element_text(hjust = 0.5)) # Center Title +``` + +## Distance Calculation: More Accurate +\scriptsize +To get a more accurate distance we can use the geosphere package. geosphere uses algorithms that take into account the fact that the earth isn't flat, and that distances between decimal degrees change depending on location.
+```{r, eval=F} +library(geosphere) + +# Returns dataframe that includes shortest distance from each point to the road +# NOTE: This may take a few minutes to run, so I'm only running on first 10 +# observations. Use the commented out version to run on all projects +# distance_df <- dist2Line(kenya_projects, trunk_roads_kenya) + +distance_df <- dist2Line(kenya_projects[1:10,], trunk_roads_kenya) + +head(distance_df, 5) +``` + +## Calculate Areas +To calculate the area of a polygon, we can use: +\begin{enumerate} +\item gArea() from the rgeos package. Here, units will be in units of your coordinate system (e.g., decimal degrees or meters). +\item areaPolygon from the geosphere package. This calculation takes into account the curvature of the earth and units will be in square meters. +\end{enumerate} + +```{r, eval=F} +library(rgeos) +worldmap_area_v1 <- gArea(worldmap, byid=T) + +library(geosphere) +worldmap_area_v2 <- areaPolygon(worldmap) + +``` + + +## Export Spatial Object as Shapefile +Below shows code to export a spatial object as a shapefile +\scriptsize +```{r, eval=F} +writeOGR(obj=roads_trunk_rpj, # Spatial object + dsn="~/Desktop", # Folder to save shapefile + layer="trunkroads", # Name of shapefile + driver="ESRI Shapefile", # Use ESRI shapefile format (.shp) + overwrite_layer=T) # Overwrite shapefile with same name +``` + +## Over Function +Let's say we want to add variables to the World Bank projects from the Africa polygon dataset. For this, we can use the \texttt{over()} function. We can use this to extract data from the polygons to each point. +\scriptsize +```{r, eval=F} +# Spatially define wb_projects +coordinates(wb_projects) <- ~longitude+latitude +crs(wb_projects) <- CRS("+init=epsg:4326") + +# This yields a dataframe that has the same number of rows as wb_projects, +# but has data from worldmap. 
+wb_projects_OVER_worldmap <- over(wb_projects, worldmap) + +# Add variables from worldmap to wb_projects +wb_projects$POP_EST <- wb_projects_OVER_worldmap$POP_EST +wb_projects$GDP <- wb_projects_OVER_worldmap$GDP_MD_ +wb_projects$hppy_sc <- wb_projects_OVER_worldmap$hppy_sc +``` + +```{r, echo=F, warning=FALSE, message=F, results='hide'} +# Spatially define wb_projects +coordinates(wb_projects) <- ~longitude+latitude +crs(wb_projects) <- CRS("+init=epsg:4326") +``` + + +## Buffer Spatial Objects +We can create buffers around spatial objects using \texttt{gBuffer()} from \textit{rgeos}. This function works with points, lines or polygons. The width is in the units of your coordinate reference system. Consequently, units here are in decimal degrees. However, you could project your data so that units are in meters. +```{r, eval=F} +library(rgeos) +wb_projects_buff <- gBuffer(wb_projects, width=0.1, byid=T) +plot(wb_projects_buff[1:20,]) +``` + +## Buffer Spatial Objects +```{r, echo=F, warning=FALSE, message=F, results='hide'} +library(rgeos) +wb_projects_buff <- gBuffer(wb_projects, width=0.1, byid=T) +plot(wb_projects_buff[1:20,]) +``` + +## Add Text to Map +\scriptsize +We can also add text to maps. Let's say we wanted to add the project start date to the maps-- this comes from the start\_actual\_isodate variable, where data is stored as a string. To map text values, we use geom\_text\_repel() from the ggrepel package. You can also use geom\_label\_repel(), which will add a border around the text, or the "non-repel" versions: geom\_text() or geom\_label. 
+ +```{r, eval=F, warning=FALSE, message=F, results='hide'} +library(ggrepel) + +wb_projects <- read.csv(file.path(finalData, "wb_projects.csv")) + +ggplot() + + geom_point(data=wb_projects[1:10,], aes(x=longitude, + y=latitude)) + + geom_text_repel(data=wb_projects[1:10,], aes(x=longitude, + y=latitude, + label=start_actual_isodate)) + + theme_void() +``` + +## Add Text to Map +```{r, echo=F, warning=FALSE, message=F, results='hide'} +library(ggrepel) + +wb_projects <- read.csv(file.path(finalData, "wb_projects.csv")) + +ggplot() + + geom_point(data=wb_projects[1:10,], aes(x=longitude, + y=latitude)) + + geom_text_repel(data=wb_projects[1:10,], aes(x=longitude, + y=latitude, + label=start_actual_isodate)) + + theme_void() +``` + +## Extract Centroid of Spatial Object +To obtain the centroid of a spatial object, such as the centroid of a polygon or line, use \texttt{gCentroid} from \textit{rgeos}. + +```{r, warning=FALSE, message=F, results='hide'} +worldmap_centroids <- gCentroid(worldmap) +head(worldmap_centroids) +``` + +## Useful Projections +\scriptsize +### Geographic Coordinate System +The most common geographic coordinate system is the [World Geodetic System (WGS84)](https://en.wikipedia.org/wiki/World_Geodetic_System). +```{r, , eval=F} +"+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0" +``` + +### Projected Coordinate Systems +Below are projections for equal area and equal distance. Replace [LATITUDE VALUE] and [LONGITUDE VALUE] with the center of the area that you're working with. 
+ +Equal Area: [Lambert Azimuthal Equal Area](https://en.wikipedia.org/wiki/Lambert_azimuthal_equal-area_projection) +```{r, eval=F} +"+proj=laea +lat_0=[LATITUDE VALUE] +lon_0=[LONGITUDE VALUE]" +``` + + +Equal Distance: [Azimuthal Equidistant Projection](https://en.wikipedia.org/wiki/Azimuthal_equidistant_projection) +```{r, eval=F} +"+proj=aeqd +lat_0=[LATITUDE VALUE] +lon_0=[LONGITUDE VALUE]" +``` + +## ggplot Colors Scale Summary +Below summarizes defining color scales or palettes in ggplot, with either continuous or discrete data. These are elements that will be included in your ggplot code. +\scriptsize + +[Defining own color scale](https://ggplot2.tidyverse.org/reference/scale_gradient.html) + +\begin{enumerate} +\item scale\_*\_gradient(low, high): [For continuous variable] Manually define low/high colors +\item scale\_*\_gradientn(colors): [For continuous variable] Use a defined list of colors (e.g., c("purple","blue","yellow","white")) +\item scale\_*\_manual(values): [For discrete variable] Use a defined list of colors, where the length of the list equals the number of unique values in the discrete variable. +\end{enumerate} + +[Using pre-defined palettes](https://ggplot2.tidyverse.org/reference/scale_brewer.html) + +\begin{enumerate} +\item scale\_*\_distiller(palette): [For continuous variables] +\item scale\_*\_brewer(palette): [For discrete variables] +\end{enumerate} + +Where * is either color or fill. Discrete variables = factor variables. + +## ggplot Colors Scale Summary +RColorBrewer provides a number of palettes to use. +\scriptsize +```{r} +library(RColorBrewer) +display.brewer.all() +``` + +## Useful Resources +* [Rspatial](http://rspatial.org/) provides tutorials for many topics in GIS. + +* [Nick Eubank Tutorials](http://www.nickeubank.com/gis-in-r/) -- another great set of tutorials. + +* [Spatial Features](https://r-spatial.github.io/sf/articles/sf1.html). 
There's an entirely different way of defining spatial objects in R that we didn't cover, so see this website for an overview of "spatial features" that treat spatial objects more similarly to dataframes. + +* [This](https://gist.github.com/sboysel/fc661f26ef51eae6377b) provides useful links to a bunch of other resources. + diff --git a/Presentations/Lab 6 - Data Processing.Rmd b/Presentations/Lab 6 - Data Processing.Rmd new file mode 100644 index 0000000..f799161 --- /dev/null +++ b/Presentations/Lab 6 - Data Processing.Rmd @@ -0,0 +1,864 @@ +--- +title: "Data Processing" +subtitle: "R for Stata Users" +date: "November-December 2018" +author: "Luiza Andrade, Leonardo Viotti & Rob Marty " +output: + beamer_presentation: + #theme: "Pittsburgh" + theme: "Madrid" + colortheme: "whale" + fonttheme: "default" + toc: true + includes: + in_header: header.tex +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` +```{r, echo = F, eval=T} +def.chunk.hook <- knitr::knit_hooks$get("chunk") +knitr::knit_hooks$set(chunk = function(x, options) { + x <- def.chunk.hook(x, options) + ifelse(options$size != "normalsize", paste0("\\", options$size,"\n\n", x, "\n\n \\normalsize"), x) +}) + + +# File paths + +if (Sys.getenv("USERNAME") == "luiza"){ + projectFolder <- "C:/Users/luiza/Documents/GitHub/dime-r-training" + +} + +if (Sys.getenv("USERNAME") == "WB501238"){ + projectFolder <- "C:/Users/WB501238/Documents/GitHub/dime-r-training" + +} + +if (Sys.getenv("USERNAME") == "Leonardo"){ + projectFolder <- "C:/Users/Leonardo/Documents/GitHub/dime-r-training" + +} + +if (Sys.getenv("USERNAME") == "WB519128"){ + projectFolder <- file.path("C:/Users/WB519128/Documents/GitHub/dime-r-training") +} + +library(tidyverse, verbose = F) +library(readstata13) +library(data.table) + +# File paths +dataWorkFolder <- file.path(projectFolder,"DataWork") + +Data <- file.path(dataWorkFolder,"DataSets") +finalData <- file.path(Data,"Final") +rawData <- file.path(Data,"Raw") + +``` + 
+# Introduction + + +## Introduction + +* The goal of this session is to recreate the WHR data set that we've been using for this training +* We'll take you through the same steps we've taken when we were preparing it +* We'll use a set of packages that are bundled into something called the `tidyverse` + +## Introduction + +* In this session, you'll be introduced to some basic concepts of data cleaning in R. The contents covered are: + * Exploring a data set + * Creating new variables + * Filtering and subsetting data sets + * Merging and reshaping data sets + * Dealing with factor variables + * Saving data +* There are many other tasks that we usually perform as part of data cleaning that are beyond the scope of this session + + + +## Introduction + +Before we start, let's make sure we're all set: + + 1. Start a fresh session. + 2. Make sure the `tidyverse` package is listed in the `packages` vector of your Master script. If it's not installed, install it. + 3. Run the Master script to load all the necessary packages and set file paths. + 4. Remember to disable the `PACKAGES` switch in the Master script if you already installed the necessary packages. This will save you a lot of time. + + +## Introduction + +Here's a shortcut if you missed the last session: + +\scriptsize + +```{r, eval= F} + # Install packages + install.packages("tidyverse", + dependencies = TRUE) + + # Load packages + library(tidyverse) + + # Set folder paths + projectFolder <- "YOUR/FOLDER/PATH" + finalData <- file.path(projectFolder, "DataWork", "DataSets", "Final") + rawData <- file.path(projectFolder, "DataWork", "DataSets", "Raw") + +``` + +## Loading a data set from CSV + +### ``read.csv(file, header = TRUE)`` + + * **file**: is the path to the file you want to open, including its name and format (``.csv``) + * **header**: if `TRUE`, will read the first row as variable names + * **stringsAsFactors:** logical. See next slide for more. 
+ +## Loading a data set from CSV + +* R reads string variables as factors as default +* This format saves memory, but can be tricky if you actually want to use the variables as strings +* You can specify the option ``stringsAsFactors = FALSE`` to prevent R from turning strings into factors + +## Loading a data set from CSV + +### Exercise 1: Load data +Use the `read.csv` function to load the three `WHR` data sets from `DataWork > DataSets > Raw`. Create an object called `whrYY` with each data set. + + * TIP 1: use the ``file.path()`` function and the ``rawData`` object created in the master to simplify the folder path. + * TIP 2: for this data set, we want to read strings as strings, not factors. + + +## Loading a data set from CSV + +```{r} +# Load the data sets (we'll discuss why the +# stringsAsFactors argument is necessary soon) +whr15 <- read.csv(file.path(rawData,"WHR2015.csv"), + header = TRUE, + stringsAsFactors = FALSE) + +whr16 <- read.csv(file.path(rawData,"WHR2016.csv"), + header = TRUE, + stringsAsFactors = FALSE) + +whr17 <- read.csv(file.path(rawData,"WHR2017.csv"), + header = TRUE, + stringsAsFactors = FALSE) + +``` + +# Exploring a data set + +## Exploring a data set + +Some useful functions: + +* **``View()``:** open the data set +* **``class()``:** reports object type or type of data stored +* **``dim()``:** reports the size of each one of an object's dimension +* **``names()``:** returns the variable names of a data set +* **``str()``:** general information on an R object +* **``summary()``:** summary information about the variables in a data frame +* **``head()``:** shows the first few observations in the dataset +* **``tail()``:** shows the last few observations in the dataset + +## Exploring a data set + +### Exercise 2: Explore a data set + +Use some of the functions listed above to explore the `whr15` data set. 
+ +## Exploring a data set + +\footnotesize +```{r, eval = F} +# View the data set (same as clicking on it in the Environment pane) +View(whr15) +``` +\begin{figure} +\centering + \includegraphics[scale=0.4]{img/View.png} +\end{figure} + +## Exploring a data set + +\footnotesize +```{r} +class(whr15) +dim(whr15) +``` + +## Exploring a data set + +\footnotesize +```{r} +str(whr15) +``` + +## Exploring a data set + +```{r, size = "tiny"} +summary(whr15) +``` + +## Exploring a data set + +\footnotesize +```{r, size = "tiny"} +head(whr15) +``` + +# ID variables + +## ID variables + +Desired properties of an ID variable: *uniquely and fully identifying* + +* An ID variable cannot have duplicates +* An ID variable may never be missing +* The ID variable must be constant across a project +* The ID variable must be anonymous + +## ID variables + + +### ``n_distinct(..., na.rm = FALSE)`` + +Counts the number of unique values in a vector + + * **...**: a vector of values + * **na.rm**: if `TRUE`, missing values don't count + +### Exercise 3: identify the ID + +Using the `n_distinct` function, can you tell if the following variables are IDs of the `whr15` data set? + +1. Region +2. Country + +## ID variables + +```{r} +dim(whr15) +n_distinct(whr15$Region, na.rm = TRUE) +n_distinct(whr15$Country, na.rm = TRUE) +``` + +## ID variables + +We did the same for the other two data sets: +```{r, size = "footnotesize"} +n_distinct(whr16$Country, na.rm = TRUE) == nrow(whr16) + +n_distinct(whr17$Country, na.rm = TRUE) == nrow(whr17) +``` + +## ID variables + + * The data set we've been using in the last few sessions combines all three data sets we have now + * Before we combine them, we should take a better look at the ID variables to find out if they are consistent + + +## Comparing vectors + +### ``setdiff(first, second)`` +Prints all the elements of the `first` object that are not in the `second` object (ignores duplicates). 
+ +* **first:** an object +* **second:** an (other?) object + +## Comparing vectors + +### Exercise 4: compare vectors +Use the `setdiff()` function to see which countries are coming in and out of the WHR data set between 2015 and 2016. + +## Comparing vectors + +```{r} +# Any countries in 2015 that are not in 2016? +setdiff(whr15$Country, whr16$Country) + +# And vice-versa +setdiff(whr16$Country, whr15$Country) +``` + +## Replacing values + +Wait, "Somaliland region" and "Somaliland Region" are not the same?! + +### Exercise 5: replacing values +Replace the occurrences of "Somaliland region" in whr15 with "Somaliland **R**egion". + +- TIP: use indexing to select only the observations of whr15$Country that are equal to "Somaliland region" + +## Replacing values + +Wait, "Somaliland region" and "Somaliland Region" are not the same?! +```{r} +# Now they are: +whr15$Country[whr15$Country == "Somaliland region"] <- + "Somaliland Region" +``` + +## Replacing values + +We also did the same for 2017: + +```{r, size = "footnotesize"} +# Compare countries +setdiff(whr16$Country, whr17$Country) +setdiff(whr17$Country, whr16$Country) + +# Fix names +whr17$Country[whr17$Country == "Hong Kong S.A.R., China"] <- "Hong Kong" +whr17$Country[whr17$Country == "Taiwan Province of China"] <- "Taiwan" +``` + +## Creating variables + + * Ok, the ID variables are consistent now + * But once we merge the data sets, we need to still be able to identify them + * So let's add a `year` variable so we can tell them apart + +### Exercise 6: creating variables +Create a variable called `year` in each WHR data set identifying what year it refers to. + +## Creating variables + +```{r} +# Piece of cake! 
+whr15$year <- 2015 +whr16$year <- 2016 +whr17$year <- 2017 +``` + +# Appending and merging data sets + +## Appending data sets + + * Now that we can identify the observations, we can combine the data set + * Here's a function to append objects by row: + +### ``rbind(...)`` +Take a sequence of vector, matrix or data-frame arguments and combine by rows. + +* **...:** vectors or matrices to be combined (separated by comma) + +## Appending and merging + +### Exercise 7: append data sets +Use the `rbind` function to append the three WHR datasets into a data set called `whr_panel`. + +## Appending and merging data sets + +```{r, eval = F} +# Append data sets +whr_panel <- rbind(whr15, whr16, whr17) +``` +\textcolor{red}{\texttt{Error in rbind(deparse.level, ...) : \\ numbers of columns of arguments do not match}} + +## Appending data sets + + * Our data sets are still too different to use this function, as it has very strong requirements + * There's a number of ways to fix this, and we will explore them soon + * But first, here's how we could append the data as is if we wanted to + +```{r} +# This is the quick fix +whr_panel <- bind_rows(whr15, whr16, whr17) +``` + + * Now, let's take a closer look at the variables in our data sets and see how we can make them compatible + +## Exploring a data set (again) + +```{r, size = "tiny"} +names(whr15) +names(whr16) +names(whr17) +``` + +## Exploring a data set (again) + +There are a few issues here: + +1. The data set for 2017 doesn't include a region identifier +2. The names for the same variables are different in 2017 and 2016 (and the variable names are terrible in general) +3. The data for 2015 only includes the standard error, not the confidence interval + +## Subsetting + +To fix the first issue, we will merge the region variable from 2016 to the 2017 data set. But first, we need to isolate the variables we actually want to merge to 2017. 
To do this, we'll use our first `tidyverse` function: + +### ``select(.data, ...)`` +Keeps only the variables you mention. + + * **.data: ** a data set + * **...:** one or more unquoted expressions separated by commas indicating the names of the variables you want to keep + +## Subsetting + +### Exercise 8: subset the data +Create a new object called `regions` containing only the columns `Country` and `Region` of the `whr16` data set. + +## Subsetting + +```{r} +# Subset the whr 16 data set +regions <- select(whr16, Country, Region) + +# Here's what the new dataset looks like +str(regions) +``` + +## Subsetting +```{r, size = "footnotesize"} +# This also works with a vector of variables +keepVars <- c("Country", + "Region", + "year", + "Happiness.Rank", + "Happiness.Score", + "Economy..GDP.per.Capita.", + "Family", + "Health..Life.Expectancy.", + "Freedom", + "Trust..Government.Corruption.", + "Generosity", + "Dystopia.Residual") + +whr15 <- whr15[, keepVars] +whr16 <- select(whr16, keepVars) +``` +```{r, eval = FALSE, size = "footnotesize"} +whr17 <- select(whr17, keepVars) +``` +\footnotesize +\textcolor{red}{\texttt{Error: Unkown column `Region`}} + + +## Tidyverse + + * As you might have noticed, some `tidyverse` functions have a syntax that is a little bit different from most R functions + * Its first argument is the name of a data set + * The following arguments are variable names + * You don't need to write variable names in quotes + * You don't need to name your arguments (and it will break if they are in the wrong order) + +## Merging + + * The `tidyverse` package `dplyr` has a whole family of functions to do merging. 
You can look them up by typing `?join` + * The different `join` functions have a similar function to the `keep` and `keepusing` options of Stata's `merge` function + * We now want to merge the values in the `regions` object to the `whr17` object, keeping all observations in `whr17`, regardless of them having matches or not + +## Merging + +### ``left_join(x, y, by)`` +Return all rows from x, and all columns from x and y. Rows in x with no match in y will have NA values in the new columns. If there are multiple matches between x and y, all combinations of the matches are returned. + +* **x, y:** data sets to join +* **by:** a character vector of variables to join by. If NULL, the default, *_join() will do a natural join, using all variables with common names across the two tables. + +## Merging + +### Exercise 9: Merge +Merge the `regions` data set into the `whr17` data set. + +## Merging + +```{r, size = "tiny"} +# Merge +whr17 <- left_join(whr17, regions) + +# See the result +str(whr17) +``` + +## Missing values + +```{r, size = "tiny"} +# Did that solve it? +any(is.na(whr17$Region)) +sum(is.na(whr17$Region)) + +# Where is it still missing? +whr17$Country[is.na(whr17$Region)] + +# Let's fix that +whr17$Region[whr17$Country %in% c("Mozambique", + "Lesotho", + "Central African Republic")] <- + "Sub-Saharan Africa" + +# That's better +any(is.na(whr17$Region)) +``` + +## Renaming variables + +The second problem we found, of different names for the same variable in different data sets, can be easily fixed with the rename function: + +```{r} +# Rename function: new name on the left! +whr17 <- rename(whr17, + Lower.Confidence.Interval = Whisker.low, + Upper.Confidence.Interval = Whisker.high) +``` + +## Renaming + + * If you need to change the name of variables in bulk, setting the whole vector of variable names would be easier, as in the example below + * However, to do that you need to be sure that the variables are in the right order! 
+ +## Renaming +```{r, size = "tiny"} +# Bulk rename +names(whr15) + +newnames <- c("country", + "region", + "year", + "happy_rank", + "happy_score", + "gdp_pc", + "family", + "health", + "freedom", + "trust_gov_corr", + "generosity", + "dystopia_res") + +names(whr15) <- newnames + +``` + +## Ordering variables + +Fortunately, `select` also does that for you +```{r} +# Subset the 2017 data set and order variables +whr17 <- select(whr17, keepVars) + +# Rename variables +names(whr16) <- newnames +names(whr17) <- newnames + +# Now we can append safely +whr_panel <- rbind(whr15, whr16, whr17) +``` + +# Saving a data set + +## Saving a data set to csv + +* The data set you have now is the same data set we've been using for earlier sessions, so we can save it now +* As mentioned before, R data sets are often save as csv +* To save a data set, we use the ``write.csv()`` function: + +### ``write.csv(x, file, row.names = TRUE)`` +* **``x``:** the object (usually a data frame) you want to export to CSV +* **``file``:** the file path to where you want to save it, including the file name and the format (".csv") +* **``row.names``**: by default, R adds a column to the CSV file with the names (or numbers) of the rows in the data frame. Set it to ``FALSE`` if you don't want that column to be exported + + +## Saving a data set to csv + +### Exercise 12: save the data set + +Save the ``whr_panel`` data set to DataWork > DataSets > Final. + +* TIP: Use the ``file.path()`` function and the object ``finalData`` created in the master to simplify the folder path. 
+ +## Saving a data set to csv + +```{r, eval = F} +# Save the whr data set +write.csv(whr_panel, + file.path(finalData,"whr_panel.csv"), + row.names = F) +``` + +# Adding variables + + +## Creating variables based on a formula + + * When we created the `year` variable, we just assigned a value to a column in the data frames using `$` + * This is great for simple variables, but can get tricky if it takes more complex values + * For example, to create a dummy variable that shows if the happy score is above the median, we would write the following + +```{r, eval = F} +# Using $ +whr_panel$happy_high <- + whr_panel$happy_score > median(whr_panel$happy_score) +``` + + * The `tidyverse` function `mutate` makes this process simpler + +## Creating variables based on a formula + +### `mutate(.data, ...)` + +Adds new variables and preserves existing ones + + * **.data:** the data set you want to add a variable to + * **...:** name-value pairs of expressions. Use NULL to drop a variable + + +## Creating variables based on a formula + +### Exercise 10: Create a variable based on a formula +Use the `mutate` function to create a variable called `happy_high` in the `whr_panel` data set indicating whether the `happy_score` is above the median. 
+ +- TIP: as usual in `tidyverse`, you can refer to variables by their names, without quotes or `$` + +## Creating variables based on a formula + +```{r} +# Adding a new variable +whr_panel <- + mutate(whr_panel, + happy_high = happy_score > median(happy_score)) +``` + +## Creating variables based on a formula + +```{r} +# You can do this for multiple variables at a time +whr_panel <- + mutate(whr_panel, + happy_high = happy_score > median(happy_score), + happy_low = happy_score < median(happy_score), + dystopia_res = NULL) # NULL drops the variable +``` + +## Creating factor variables + + * When we imported this data set, we told R explicitly to not read strings as factor + * We did that because we knew that we'd have to fix the country names + * The region variable, however, should be a factor: + +```{r, size = "tiny"} +str(whr_panel$region) +unique(whr_panel$region) +``` + +## Creating a factor + +To create a factor variable, we use the ``factor`` function: + +### ``factor(x, levels, labels)`` : turns numeric or string vector ``x`` into a factor vector +* **``x``:** the vector you want to turn into a factor +* **``levels``:** a vector containing the possible values of ``x`` +* **``labels``:** a vector of strings containing the labels you want to apply to your factor variable +* **``ordered``:** logical flag to determine if the levels should be regarded as ordered (in the order given). + +## Converting strings into factors + +### Exercise 11: turn a variable into a factor +Use the `mutate` function to create a variable called `region_cat` containing a categorical version of the `region` variable. 
+ +- TIP: to do this, you only need the first argument of the `factor` function + +## Converting strings into factors +```{r, size = "footnotesize"} +# Create categorical region +whr_panel <- + mutate(whr_panel, + region_cat = factor(region)) + + +class(whr_panel$region_cat) +levels(whr_panel$region_cat) +``` + +## Labelling values + +The `labels` argument of the `factor` function can be used to assign labels to specific values + +```{r, size = "footnotesize"} +# Assign 'not so happy' and 'happy' as labels +# for happy scores at/below and above the median, respectively +whr_panel <- + mutate(whr_panel, + happy_cat = factor(happy_high, + levels = c(FALSE, TRUE), + labels = c("Not so happy", + "Happy"))) + +table(whr_panel$happy_cat) +``` + +## Ordering factors + +The `ordered` argument of the `factor` function can be used to tell R to always display the categories in a certain order. + +```{r, size = "footnotesize"} +# Assign 'not so happy' and 'happy' as labels +# for happy scores at/below and above the median, respectively +whr_panel <- + mutate(whr_panel, + happy_ord = factor(happy_high, + levels = c(FALSE, TRUE), + labels = c("Not so happy", + "Happy"), + ordered = TRUE)) + +table(whr_panel$happy_ord) +``` + +# Appendix + +# Reshaping + +## `spread` and `gather` + + * These are the `tidyverse` functions to reshape data + * We've used `spread` in Lab 2 to create a table of the happy score by year + +```{r, size = "footnotesize"} +# Spread data +happy_table <- + spread(select(whr_panel, country, year, happy_score), # data + key = year, # column to be used as column headings + value = happy_score) # column to populate the cells + +# See result +head(happy_table) +``` + +## `spread` and `gather` + +If we wanted to make the resulting data set long again, we'd use `gather`: +```{r, size = "footnotesize"} +# Gather the happy_table +happy_table_long <- + gather(happy_table, + key = year, + value = happy_score, + `2015`, `2016`, `2017`) # name of columns that will be gathered + +# See result 
+head(happy_table_long) + +``` + +## `dcast`: reshape from long to wide + +### `dcast(data, formula, value.var)` + +* **data:** a **data.table** +* **formula:** a formula of the format `LHS ~ RHS`, where LHS is the unique ID in the final data set and `RHS` are the `j` variables in Stata's `reshape` +* **value.var:** name of the columns to be reshaped + +## `dcast`: reshape from long to wide + +```{r, eval = F} +library(data.table) +``` + +```{r, size = "tiny"} +# Making the data wide (despite the object's name) +happy_long <- + dcast(setDT(whr_panel), + country ~ year, + value.var = c("happy_score", + "happy_rank")) + +# See result +head(happy_long) +``` + +## `melt`: reshape from wide to long + +### `melt(data, id.vars, measure.vars)` + + * **data:** a **data.table** object to melt + * **id.vars:** a vector of unique IDs in `data` + * **measure.vars:** a list of variables to melt + * **variable.name:** the name of the new ID var (the one that was in wide) + * **value.name:** a vector of names for the reshaped variables + +## `melt`: reshape from wide to long +```{r, size = "tiny"} +# Reshape +happy_wide <- + melt(happy_long, + id.vars = "country", + measure.vars = patterns("^happy_score", "^happy_rank"), + variable.name = "year", + value.name = c("happy_score", "happy_rank")) + +# See result +head(happy_wide) +``` + +## Saving a data set as R data + +* The problem with CSVs is that they cannot differentiate between strings and factors +* They also don't save factor orders +* Data attributes (which are beyond the scope of this training, but also useful to document data sets) are also lost in csv data + + +## Saving a data set as R data + +The R equivalent of a `.dta` file is a `.Rda` file. It can be saved and loaded using the following commands: + +### `saveRDS(object, file = "")` +Writes a single R object to a file. + +* **object:** the R object to be saved +* **file:** the file path to where it should be saved + +### `readRDS(file)` +Loads a single R object from a file. 
+ +* **file:** the file path to the data set + + +## Saving a data set as R data + +```{r} +# Save the data set +saveRDS(whr_panel, + file = file.path(finalData, + "whr_panel.Rda")) + +# And load it again +whr_panel <- + readRDS(file.path(finalData, + "whr_panel.Rda")) + +``` +## Saving a data set to Stata + +### ``save.dta13(data, file)`` +Writes a Stata dta-file bytewise and saves the data into a dta-file. + +* **data:** A data frame +* **file:** Path to the dta file you want to export + +## Saving a data set to Stata + +```{r} +# This command doesn't handle ordered factors well +whr_panel$happy_ord <- NULL + +# Export it +save.dta13(whr_panel, + file = file.path(finalData, + "whr_panel.dta")) + +# Load it again +whr_panel <- read.dta13(file.path(finalData, + "whr_panel.dta")) +``` + + diff --git a/Presentations/Lab 6 - Spatial Data.Rmd b/Presentations/Lab 6 - Spatial Data.Rmd deleted file mode 100644 index eb64ff5..0000000 --- a/Presentations/Lab 6 - Spatial Data.Rmd +++ /dev/null @@ -1,935 +0,0 @@ ---- -title: "Spatial Data in R" -author: "DIME" -output: - ioslides_presentation: - transition: 0 - smaller: true - fig_caption: yes ---- - - - - - - -```{r setup, include=FALSE} -knitr::opts_chunk$set(echo = TRUE) -``` - -# GIS Overview - -## GIS Data -There are two main types of spatial data: vector and raster data. - -* __Vector:__ Vectors (also called shapefiles) consist of points, lines and polygons. These shapes are attached to a dataframe, where each row corresponds to a different spatial element. - -* __Raster:__ Rasters are spatially-referenced grids where each cell has one value. - -
- -
-__Shapefile__ -```{r fig.align="center", echo=F} -knitr::include_graphics("img/shapefile.png", dpi = 120) -``` - -__Raster__ -```{r fig.align="center", echo=F} -knitr::include_graphics("img/raster.png", dpi = 130) -``` -
- -## Coordinate Reference Systems - -* __Coordinate reference systems__ map pairs of numbers to a location on the earth. -* __Geographic Coordinate Systems__ live on a sphere; here, the units are in decimal degrees - + Using the WGS84 coordinate system the World Bank MC building is located at 38.89 degrees latitude and -77.04 degrees longitude. -* __Projected Coordinate Systems__ project the earth onto a flat surface. Doing this [distorts](http://geoawesomeness.com/5-tools-will-let-master-map-projections/) the earth in some way (shape, area, distance or direction) - + Using to the World Mercator projection, the World Bank is located 4680364.64 north and -8576320.73 east. - -```{r fig.align="center", echo=F} -knitr::include_graphics("img/coordinate_systems.jpg", dpi = 120) -``` - -## GIS in R System - -Here, we'll use the following packages - -* __sp:__ Defines classes and methods for spatial objects. -* __rgdal:__ For processing spatial data, particularly for reading and writing spatial data. -* __rgeos:__ A vector processing library. -* __raster:__ Reading, writing, manipulating, analyzing and modeling of gridded spatial data. 
- -```{r fig.align="center", echo=F} -knitr::include_graphics("img/GISsystem.png", dpi = 380) -``` - -```{r, include=F} - -projectFolder <- "~/Dropbox/R-Training" -dataWorkFolder <- file.path(projectFolder,"DataWork") -Data <- file.path(dataWorkFolder,"DataSets") -rawData <- file.path(Data,"Raw") -intData <- file.path(Data,"Intermediate") -finalData <- file.path(Data,"Final") -Code <- file.path(dataWorkFolder,"Code") -Doc <- file.path(dataWorkFolder,"Documentation") -Output <- file.path(dataWorkFolder,"Output") -rawOutput <- file.path(Output,"Raw") -finalOutput <- file.path(Output,"Final") -``` - - -## Load and Map HH-Level Data -```{r, echo=T, warning=F, message=F} -# Packages and Filepaths ------------------------------------------------------- -library(sp) -library(rgdal) -library(rgeos) -library(raster) - -hh_data <- read.csv(file.path(finalData,"HH_data.csv")) - -str(hh_data) -``` - -## Make a Map -```{r, fig.width=5, fig.height=4, fig.align="center"} -library(ggplot2) -ggplot() + - geom_point(data=hh_data, - aes(x=longitude_scramble, y=latitude_scramble, color=food_security), - size=.7) -``` - -## Make a Better Map -```{r, eval=T, echo=T, warning=F, message=F} -hh_map1 <- ggplot() + - - # Points - geom_point(data=hh_data, - aes(x=longitude_scramble, y=latitude_scramble, color=food_security), - size=.7) + - - # Other elements to improve map - coord_quickmap() + # make sure the map doesn't look distorted. Can also use - # coord_map(), but sometimes makes the process slow - theme_void() + - scale_color_manual(values=c("green", "orange", "red")) + - labs(title="Household Food Security", color="Food Security") + - theme(plot.title = element_text(hjust = 0.5, face="bold")) # center and bold title -``` - -## -```{r} -hh_map1 -``` - -## Add a Basemap -```{r, warning=F, message=F} -library(ggmap) -basemap <- get_map(location = c(lon=mean(hh_data$longitude_scramble), - lat=mean(hh_data$latitude_scramble)), - zoom=10, - maptype="roadmap") # roadmap, satellite, etc. 
See help(get_map) - -hh_map2 <- ggmap(basemap) + - geom_point(data=hh_data, - aes(x=longitude_scramble, - y=latitude_scramble, - color=food_security), - size=.7) + - coord_quickmap() + # make sure the map doesn't look distorted. Can also use - # coord_map(), but sometimes makes the process slow - theme_void() + - labs(title="Household Food Security", color="Food Security") + - theme(plot.title = element_text(hjust = 0.5, face="bold")) + - scale_color_manual(values=c("green", "orange", "red")) -``` - -## -```{r} -hh_map2 -``` - -## -```{r, warning=F, message=F} -hh_map2 + - scale_x_continuous(limits = c(min(hh_data$longitude_scramble), - max(hh_data$longitude_scramble)), - expand = c(.03, .03)) + - scale_y_continuous(limits = c(min(hh_data$latitude_scramble), - max(hh_data$latitude_scramble)), - expand = c(.03, .03)) -``` - -## Spatial Dataframe -So far, we've just been working with a dataframe and using the longitude and latitude variables. However, for many other uses of spatial data we need to convert the dataframe into a _spatial_ dataframe. - -* `coordinates` transforms a dataframe into a spatial dataframe. The syntax is: `coordinates(DATA FRAME) <- ~LONGITUDE+LATITUDE`, where LONGITUDE and LATITUDE are the names of variables in DATA FRAME. -* After creating a _spatial_ dataframe we need to tell it what coordinate reference system (CRS) the coordinates are in. We define the crs using `crs`. - -```{r, fig.width=4, fig.height=4, fig.align="center"} -hh_data_sdf <- hh_data -coordinates(hh_data_sdf) <- ~longitude_scramble+latitude_scramble -crs(hh_data_sdf) <- CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0") -``` - -## Spatial Dataframe Structure -Here, the variables are separated from the coordinates. Specifically, a spatial dataframe is a list, where one element of the list is a dataframe of variables and other is a dataframe of coordinates. 
- -```{r} -hh_data_sdf -``` - -## -Access a dataframe containing the variables (except the coordinates) -```{r, eval=F} -hh_data_sdf@data -``` -
-Access the coordinates of the points -```{r, eval=F} -hh_data_sdf@coords -``` -
-We can still access variables like a normal dataframe -```{r, eval=F} -hh_data_sdf@data$food_security -hh_data_sdf$food_security # short-cut -``` - -## Quickly Plot Spatial Data -We can quickly plot spatial dataframes. - -```{r, warning=F, message=F} -plot(hh_data_sdf) -``` - -## Perform Spatial Operations -See [rgeos](https://www.rdocumentation.org/packages/rgeos/versions/0.3-28) for a list of common functions to apply on vector data. - -### Buffering -```{r, warning=F, message=F} -library(rgeos) -hh_data_sdf_buff <- gBuffer(hh_data_sdf, width= 0.5/111, byid=T) #buffer by about 10km -plot(hh_data_sdf_buff) -``` - -## -### Reproject -```{r, warning=F, message=F} -head(hh_data_sdf@coords) -hh_data_sdf_newproj <- spTransform(hh_data_sdf, CRS("+init=epsg:3857")) -head(hh_data_sdf_newproj@coords) -``` - -## -### Calculating Distances -_Note:_ See appendix for ways to calculate distances in a more accurate way (e.g., by using an equal area projection or by taking into account the curvator of the earth) -```{r, warning=F, message=F} -dist_matrix <- gDistance(hh_data_sdf_newproj, byid=T) -dist_matrix[1:7,1:7] -``` - -## Interactive Maps -So far, we've been making static maps. But those are boring. Let's make an interactive map using [Leaflet](https://rstudio.github.io/leaflet/). There's a bunch of different basemaps to choose from; from the list [here](http://leaflet-extras.github.io/leaflet-providers/preview/), enter the name of the basemap in quotes in the `addProviderTiles` function. - -```{r, eval=T, warning=F, message=F} -library(leaflet) -library(dplyr) - -imap_1 <- leaflet() %>% - addProviderTiles("OpenStreetMap") %>% - addCircles(data=hh_data_sdf) -``` - -_NOTE:_ `leaflet` assumes you are using the following coordinate reference system: -```{r, eval=F} -"+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0" -``` - -## -```{r} -imap_1 -``` - -## -We can save our interactive map outside of R. Here, we use `saveWidget` from the `htmlwidgets` package. 
Doubleclicking the html file will open the file in any browser. -```{r, eval=T, warning=F, message=F} -library(htmlwidgets) -saveWidget(imap_1, file=file.path(Output,"interactive_map.html")) -``` - -## Lets make a better interactive map -```{r} -# Define Palette -pal <- colorFactor( - palette = c("Green","Yellow","Red"), - domain = hh_data_sdf$food_security) - -imap_2 <- leaflet() %>% - addProviderTiles("Stamen.Terrain") %>% - addCircleMarkers(data=hh_data_sdf, - radius=3, - fillOpacity=1, - color=~pal(food_security), - stroke=F, # remove outer-circle around circle - popup=~food_security) %>% # variable to display when click feature - # Add legend for points layer - addLegend(pal = pal, - values = hh_data_sdf$food_security, - title = "Food Security") -``` - -## -```{r} -imap_2 -``` - -## Polygon Data -```{r, echo=T, warning=F, message=F} -# Load and Prep Plot-Level Data ------------------------------------------------ -setwd(finalData) # set filepath to folder with shapefile -ag_fields <- readOGR(dsn=".", layer="allsitessubset", verbose = FALSE) - -# Plot File Projection -crs(ag_fields) - -# HH Survey Location Projection -crs(hh_data_sdf) - -# Reproject plot data to have same projection as HH survey data -hh_dd_proj <- as.character(crs(hh_data_sdf)) -ag_fields <- spTransform(ag_fields, CRS(hh_dd_proj)) -``` - -## -```{r} -plot(ag_fields) -``` - -## -```{r} -ag_fields -``` - -```{r, eval=F} -ag_fields@polygons Each polyon is defined by a matrix of vertices -``` - -## -### Simplify the Shapefile -```{r} -head(ag_fields) -``` - -You'll notice that multiple features have the same name. Here, let's simplify the polygon so that each site name represents one feature. - -```{r fig.align="center", echo=F} -knitr::include_graphics("img/dissolve.png", dpi = 190) -``` - -## -To aggregate (or dissolve) the polygon by the variable `site`, use the `aggregate` function from the `raster` package. 
-```{r} -ag_fields <- aggregate(ag_fields, by = "site") -``` - -```{r} -plot(ag_fields) -``` - -## -In the previous slide, you'll notice there's a weird tic mark in one of the polygons. To get rid of this, let's buffer the polygon by a very small amount -- which will help cover the hole in the field. - -```{r, warning=F, message=F} -ag_fields <- gBuffer(ag_fields, width=0.000001/111, byid=T) -plot(ag_fields) -``` - -## Prep Data for Map -Before we make a map, let's merge in plot some data for each agriculture field. I've already prepared this data, but it includes average food securtiy and average food expenditure per agriculture field based on the household data. - -```{r} -ag_fields_data <- read.csv(file.path(finalData, "plot_data.csv")) -ag_fields <- merge(ag_fields, ag_fields_data, by="site") -summary(ag_fields@data) -``` - -## -ggplot can't interpret spatial dataframes. Consequently, we need to transform the data into a format that ggplot can understand. Here, we make a dataframe where each vertice of the polygon is an observation. - -```{r, warning=F, message=F} -library(broom) - -ag_fields_df <- tidy(ag_fields, region="site") -head(ag_fields_df) -``` - -The resulting dataframe from `tidy`: - -* Has the variable `id`, which is taken from the the variable in the `region` argument. -* Doesn't have any of our other variables (e.g., food security). -* We can merge our other variables (in `ag_fields`) into this dataframe using the `site` variable from `ag_fields` and the `id` variable from `ag_fields_df`. 
- -## -```{r} -ag_fields_df <- merge(ag_fields_df, ag_fields, by.x="id", by.y="site") -head(ag_fields_df) -``` - -## Now, let's make a map -```{r, echo=T} -ggplot() + - geom_polygon(data = ag_fields_df, aes(x = long, y=lat, group=group, - fill=expend_food_yearly_mean)) -``` - -## Let's make a better map -```{r} -ag_map1 <- ggplot() + - geom_polygon(data = ag_fields_df, aes(x = long, y=lat, group=group, - fill=expend_food_yearly_mean), - color="black", size=0.25) + # Borders of polygons - - coord_quickmap() + # Make sure map doesn't look distorted - theme_void() + # Set theme of plot - scale_fill_gradient(low = "orangered1", high = "green1") + # Color Gradient - labs(fill="Food\nExpenditure", title="Annual Food Expenditure") + # Labels - theme(plot.title = element_text(hjust = 0.5, face="bold")) # Center title -``` - -## -```{r} -ag_map1 -``` - -## Lets use a different color palette {#brewerslide} - - -```{r, warning=F, message=F} -library(RColorBrewer) -``` - -```{r fig.height=6.5, fig.height=6.5, echo=F} -RColorBrewer::display.brewer.all() -``` - -## -```{r} -ag_map2 <- ggplot() + - geom_polygon(data = ag_fields_df, aes(x = long, y=lat, group=group, - fill=expend_food_yearly_mean), - color="black", size=0.25) + # Borders of polygons - - coord_quickmap() + - theme_void() + - scale_fill_distiller(palette = "Spectral", direction = 1) + # Color Gradient - labs(fill="Food\nExpenditure", title="Annual Food Expenditure") + - theme(plot.title = element_text(hjust = 0.5, face="bold")) -``` - -## -```{r} -ag_map2 -``` - -## ggplot colors scale summary - -[Defing own color scale](http://ggplot2.tidyverse.org/reference/scale_gradient.html) - -* `scale_*_gradient(low, high)`: [For continuous variable] Manually define low/high colors -* `scale_*_gradientn(colors)`: [For continuous variable] Use a defined list of colors (e.g., `c("purple","blue","yellow","white")`) -* `scale_*_manual(colors)`: [For discrete variable] Use a defined list of colors, where the list is equal to the 
number of unique observations in the discrete variable. - -[Using pre-defined palettes](http://ggplot2.tidyverse.org/reference/scale_brewer.html) - -* `scale_*_distiller(palette)`: [For contiuous variables] -* `scale_*_brewer(palette)`: [For discrete variables] - -Where * is either `color` or `fill`. Discrete variables = factor variables. - -## Add text to map -Now, let's add [text](http://ggplot2.tidyverse.org/reference/geom_text.html) to the map showing regions. To do this, we need to create a dataframe that includes the lat/lon of the center of each plot and a variable for the site name. To do this, we use: - -* `gCentroid()` which returns a shapefile of the center of each region -* `coordinates` which returns a matrix of the coordinates of a shapefile, where the first column is longitude and the second column is latitude. -```{r} -# Create dataframe of center of each plot with site name as variable -ag_fields_center <- gCentroid(ag_fields, byid=T) -ag_fields_center <- as.data.frame(coordinates(ag_fields_center)) - -ag_fields_center$site <- ag_fields$site -names(ag_fields_center) <- c("longitude","latitude","site") -``` - -## -Now, lets use `geom_text` to add text to our map. -```{r} -ag_map2 + geom_text(data=ag_fields_center, aes(label=site, x=longitude, y=latitude), - check_overlap = TRUE) # makes sure text doesn't overlap -``` - -## -If you have lots of cluttered text, use `geom_text_repel` from `ggrepel`. 
-```{r} -library(ggrepel) - -ag_map2 + geom_text_repel(data=ag_fields_center, aes(label=site, x=longitude, y=latitude)) -``` - -## Let's add HH locations to the map -```{r} -ag_map2 <- ggplot() + - geom_polygon(data = ag_fields_df, aes(x = long, y=lat, group=group, - fill=expend_food_yearly_mean), - color="black", size=0.25) + # Borders of polygons - geom_point(data=hh_data, - aes(x=longitude_scramble, - y=latitude_scramble, - color="HH Location"), # Name an aesthetic want you want to - # appear on legend - size=.1, alpha=.6) + - scale_color_manual(values="black") + # Manually define color of points - coord_quickmap() + - theme_void() + - scale_fill_gradient(low = "orangered1", high = "green1") + - labs(fill="Food\nExpenditure", title="Annual Food Expenditure", - color="") + # Setting color="" makes the title above the legend item blank - theme(plot.title = element_text(hjust = 0.5, face="bold")) -``` - -## -```{r} -ag_map2 -``` - -## Add Polygon to Interactive Map -```{r} -leaflet() %>% - addProviderTiles("OpenStreetMap") %>% - addPolygons(data=ag_fields) -``` - -## -```{r} -pal_foodexpend <- colorNumeric("RdYlGn", ag_fields$expend_food_yearly_mean) - -imap_3 <- leaflet() %>% - addProviderTiles("OpenStreetMap") %>% - addPolygons(data=ag_fields, - fillColor = ~pal_foodexpend(expend_food_yearly_mean), - fillOpacity = 0.6, - color="black", # color of line around polygons - weight=1, # width of line around polygons - popup=~site) -``` - -## -```{r} -imap_3 -``` - -## Map Using Admin Data - -Now, let's make a map using administrative-level data. The `get_data` function from the `raster` package allows us to download data from [GADM](https://gadm.org/) (the Database of Global Administrative Areas). Let's download the third administrative division for Rwanda. 
- -```{r} -rwa_adm <- getData('GADM', country='RWA', level=3) -rwa_adm -``` - -## -```{r} -head(rwa_adm) -``` - -```{r, include=F, eval=F} -# Extract vegetation data to district data -library(velox) -rwa_adm_NDVI <- rwa_adm -ndvi_2014_0910 <- raster(file.path(finalData, "ndvi_2014_0910.tif")) -rwa_adm_NDVI$NDVI <- velox(ndvi_2014_0910)$extract(rwa_adm_NDVI, function(x) mean(x, na.rm=T)) -rwa_adm_NDVI$NDVI <- rwa_adm_NDVI$NDVI / 10000 -rwa_adm_NDVI$vegetation <- round(rwa_adm_NDVI$NDVI, 3) -rwa_adm_NDVI <- subset(rwa_adm_NDVI, select=c(OBJECTID, vegetation)) -rwa_adm_NDVI <- rwa_adm_NDVI@data -rwa_adm_NDVI <- rwa_adm_NDVI[rwa_adm_NDVI$vegetation > 0.2,] -write.csv(rwa_adm_NDVI, file.path(finalData, "rwanda_vegetation.csv")) -``` - -## -I've preppared a dataset of average vegetation levels per administrative unit in Rwanda. Here, 0 means no vegetation and 1 means significant vegetation. -```{r} -rwa_veg <- read.csv(file.path(finalData, "rwanda_vegetation.csv")) -head(rwa_veg) -``` - -## Exercise -Make an administrative-level map of vegetation levels. Here's a broad outline of steps you'll need to implement: - -* Merge the vegetation data with the Rwanda spatial dataframe. -* Convert the spatial dataframe into a dataframe that is interpretable by ggplot -* Make the map using ggplot. -* __Note__: There are some NA values; think about effective ways to map those. -* __Hint__: If your computer is talking a long time to plot, run the below code which simplifies the polygon (i.e., removes some of the vertices to simplify the shape). Larger values of `tol` make the polygon more simplified. I played around with values and found `0.003` to be adequate. Note that large values of `tol` might reduce the number of polygons! -```{r} -rwa_adm_simple <- gSimplify(rwa_adm, tol=0.003) # simplifies the polygon - -# Resulting layer from gSimplify has no variables, so need to add a variable back in -rwa_adm_simple$OBJECTID <- rwa_adm$OBJECTID -``` - -## Solution -```{r} -# 1. 
Merge data together ------------------------------------------------------- -rwa_adm_simple <- merge(rwa_adm_simple, rwa_veg, by="OBJECTID") - -# 2. Make dataframe interpretable by ggplot ------------------------------------ -rwa_adm_df <- tidy(rwa_adm_simple, region="OBJECTID") -rwa_adm_df <- merge(rwa_adm_df, rwa_adm_simple, by.x="id", by.y="OBJECTID") - -# 3. Make Plot ----------------------------------------------------------------- -veg_map <- ggplot() + - geom_polygon(data=rwa_adm_df, aes(x=long, y=lat, group=group), - fill="white", color="black",size=.1) + - geom_polygon(data=rwa_adm_df[!is.na(rwa_adm_df$vegetation),], - aes(x=long, y=lat, group=group, fill=vegetation), - color="black",size=.1) + - theme_void() + - scale_fill_distiller(palette="RdYlGn", direction=1) + - labs(fill="Vegetation", title="Vegetation in Rwanda") + - theme(plot.title = element_text(hjust = 0.5, face="bold")) -``` - -## -```{r} -veg_map -``` - -## APPENDIX - -## Raster Files -Satellite imagery often comes in multiple bands. Each band captures a different part of the electromagnetic spectrum. Different combinations of bands are used to create _spectral indices_ which are used to highlight different land cover elements, such as vegetation, built-up area, or burnt areas. - -
-```{r fig.align="center", echo=F} -knitr::include_graphics("img/landsat_bands.gif", dpi = 240) -``` - -```{r fig.align="center", echo=F} -knitr::include_graphics("img/spectral_profile.jpg", dpi = 210) -``` -
- -$$NDVI = \frac{NIR-Red}{NIR+Red}$$ - -## Some common satellites include - -* [Landsat](https://explorer.earthengine.google.com/#detail/LANDSAT%2FLC08%2FC01%2FT1_TOA): Captures images across the earth every 16 days at a 30 meter resolution across multiple spectral bands. Landsat began collecting images in 1972 and continues to the present. - -* [Sentinel](https://explorer.earthengine.google.com/#detail/COPERNICUS%2FS2): Captures images across the earth every 5 days (at the equator) at a 10 meter resolution across multiple sepctral bands. Began in 2014 and continues to the present. - -* [MODIS](https://en.wikipedia.org/wiki/Moderate_Resolution_Imaging_Spectroradiometer): Captures daily to bi-weekly imagery ranging from a 250m to 500m resolution. - -* [VIIRS](https://ngdc.noaa.gov/eog/viirs/download_dnb_composites.html): Monthly nighttime lights at a 750m resolution from April 2012 to present. - -## Load and Plot NDVI Around Rwanda -```{r, eval=T} -ndvi_2012_a <- raster(file.path(finalData, "ndvi_2012_0203.tif")) -plot(ndvi_2012_a) -plot(ag_fields,add=T) -``` - -## Crop Image to Agricultural Fields -```{r, eval=T} -ndvi_2012_a <- crop(ndvi_2012_a, ag_fields) -plot(ndvi_2012_a) -plot(ag_fields, add=T) -``` - -## Mask Image to Agricultural Fields -```{r, eval=T} -ndvi_2012_a <- mask(ndvi_2012_a, ag_fields) -plot(ndvi_2012_a) -``` - -## Plot Raster Using ggplot - -You can also plot rasters using ggplot. Here, we need to convert the raster into a dataframe with latitude, longitude and the values. Then, we use `geom_tile`. 
- -```{r, eval=F} -ndvi_2012_a_df <- as(ndvi_2012_a, "SpatialPixelsDataFrame") -ndvi_2012_a_df <- as.data.frame(ndvi_2012_a_df) - -ggplot() + - geom_tile(data=ndvi_2012_a_df, aes(x=x, y=y, fill=ndvi_2012_0203),alpha=1) + - scale_fill_gradient(low="red",high="green") + - coord_quickmap() + - theme_void() + - labs(fill="NDVI", title="NDVI") -``` - -## -```{r, eval=T, echo=F} -ndvi_2012_a_df <- as(ndvi_2012_a, "SpatialPixelsDataFrame") -ndvi_2012_a_df <- as.data.frame(ndvi_2012_a_df) - -ggplot() + - geom_tile(data=ndvi_2012_a_df, aes(x=x, y=y, fill=ndvi_2012_0203), alpha=1) + - scale_fill_gradient(low="red",high="green") + - coord_quickmap() + - theme_void() + - labs(fill="NDVI", title="Vegetation") -``` - -## Plot Raster Using rasterVis {#brewerslide} - - -We can also plot the raster using other packages that don't require first converting the object to a dataframe. Here, we use [rastervis](https://oscarperpinan.github.io/rastervis/). - -```{r, warning=F, message=F} -library(rasterVis) - -cols <- colorRampPalette(brewer.pal(9,"RdYlGn")) - -levelplot(ndvi_2012_a, - col.regions = cols) -``` - -## -### Make better map -```{r, eval=F} -levelplot(ndvi_2012_a, - col.regions = cols, - margin=F, # No histograms on side - par.settings= - list(axis.line=list(col='transparent')), # No box around image - scales=list(draw=FALSE), # No latitude and longitude markers - xlab="", - ylab="", - main="NDVI 2012 (Season A)", - colorkey=list(space="right"), # legend location. can change to bottom, - # left and top. 
If you don't want a - # legend, use: colorkey = F - maxpixels=1e6 # sometimes blends pixels together - ) + - layer(sp.polygons(ag_fields, - col="black", # border color - lwd=1, # line width - alpha=1)) -``` - -## -```{r, eval=T, echo=F, warning=F, message=F} -levelplot(ndvi_2012_a, - col.regions = cols, - margin=F, # No histograms on side - par.settings= - list(axis.line=list(col='transparent')), # No box around image - scales=list(draw=FALSE), # No latitude and longitude markers - xlab="", - ylab="", - main="NDVI 2012 (Season A)", - colorkey=list(space="right"), # legend location. can change to bottom, - # left and top. If you don't want a - # legend, use: colorkey = F - maxpixels=1e6 # sometimes blends pixels together - ) + - layer(sp.polygons(ag_fields, - col="black", # border color - lwd=1, # line width - alpha=1)) -``` - - -## Raster Arithmetic -Note that NDVI goes from -1 to 1; MODIS scales this value by 10000 so we'll scale this back to -1 to 1 by multiplying values by 0.0001. There are two ways to do arithmetic on rasters: - -```{r, eval=F} -raster_new <- raster * 0.0001 -``` -OR -```{r, eval=F} -raster_new <- calc(raster, fun = function(x) x * 0.0001) -``` - -The second way looks ugly, but it is more efficient. 
- -## Plot Multiple Images - -```{r, eval=T, warning=F, message=F} -library(rasterVis) -library(RColorBrewer) - -ndvi_2012_a <- raster(file.path(finalData, "ndvi_2012_0203.tif")) %>% - crop(ag_fields) %>% - mask(ag_fields) %>% - calc(fun=function(x) x*0.0001) -ndvi_2012_b <- raster(file.path(finalData, "ndvi_2012_0910.tif")) %>% - crop(ag_fields) %>% - mask(ag_fields) %>% - calc(fun=function(x) x*0.0001) -ndvi_2016_a <- raster(file.path(finalData, "ndvi_2016_0203.tif")) %>% - crop(ag_fields) %>% - mask(ag_fields) %>% - calc(fun=function(x) x*0.0001) -ndvi_2016_b <- raster(file.path(finalData, "ndvi_2016_0910.tif")) %>% - crop(ag_fields) %>% - mask(ag_fields) %>% - calc(fun=function(x) x*0.0001) - -ndvi_stack <- stack(ndvi_2012_a, ndvi_2012_b, - ndvi_2016_a, ndvi_2016_b) - -cols <- colorRampPalette(brewer.pal(9,"RdYlGn")) -``` - -## -```{r} -levelplot(ndvi_stack, - main="NDVI Across Seasons", - col.regions=cols, - layout=c(2, 2)) -``` - -## Interactive Map with Rasters - -```{r, echo=T, warning=F, message=F} -pal <- colorNumeric(c("red3", "yellow", "palegreen4"), values(ndvi_stack), - na.color = "transparent") - -NDVI_interactive <- leaflet() %>% - addProviderTiles("OpenStreetMap") %>% - addRasterImage(ndvi_2012_a, colors = pal, opacity = 1, group="2012 A") %>% - addRasterImage(ndvi_2012_b, colors = pal, opacity = 1, group="2012 B") %>% - addRasterImage(ndvi_2016_a, colors = pal, opacity = 1, group="2016 A") %>% - addRasterImage(ndvi_2016_b, colors = pal, opacity = 1, group="2016 B") %>% - addLegend(pal = pal, values = values(ndvi_stack), - title = "NDVI") %>% - addLayersControl( - overlayGroups = c("2012 A","2012 B","2016 A","2016 B"), - options = layersControlOptions(collapsed = FALSE)) -``` - -## -```{r} -NDVI_interactive -``` - -## Summarize NDVI By Plot -Let's say we want to determine average NDVI values for each plot or the NDVI value under each household location. To do this, we use `extract` from the `raster` package. 
-```{r} -# Average NDVI Value per Plot -ag_fields$NDVI_2012_a <- extract(ndvi_2012_a, ag_fields, fun=mean) - -# NDVI Value per HH Location -hh_data_sdf$NDVI_2012_a <- extract(ndvi_2012_a, hh_data_sdf) -``` -Sometimes this process can be slow. To (dramatically) increase the speed of this task, use the [velox package](https://hunzikp.github.io/velox/). This is built to work with spatial _polygons_ (not points). - -* `velox(raster)` converts the raster object to a velox raster object. From the raster object you can use the extract method, where the inputs are the shapefile and summarize function. -```{r} -library(velox) -ag_fields$ndvi_2012_a_velox <- velox(ndvi_2012_a)$extract(ag_fields, fun=mean) -``` - -## -```{r} -subset(ag_fields@data, select=c(site, NDVI_2012_a, ndvi_2012_a_velox)) -``` - -## Useful Projections - -__Geographic Coordinate System__ - -The most common geographic coordinate system is the [World Geodetic System (WGS84)](https://en.wikipedia.org/wiki/World_Geodetic_System). - -```{r, eval=F} -"+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0" -``` - -__Projected Coordinate Systems__ - -Below are projections for equal area and equal distance. Replace [LATITUDE VALUE] and [LONGITUDE VALUE] with the center of the area that you're working with. - -_Equal Area:_ [Lambert Azimuthal Equal Area](https://en.wikipedia.org/wiki/Lambert_azimuthal_equal-area_projection) -```{r, eval=F} -"+proj=laea +lat_0=[LATITUDE VALUE] +lon_0=[LONGITUDE VALUE]" -``` - -_Equal Distance:_ [Azimuthal equidistant projection](https://en.wikipedia.org/wiki/Azimuthal_equidistant_projection) -```{r, eval=F} -"+proj=aeqd +lat_0=[LATITUDE VALUE] +lon_0=[LONGITUDE VALUE]" -``` - -## Calculating Distances: Way 1 - -A common task in GIS is to calculate distances. Here, let's calculate the distance of each household location to the nearest city (using a dataset of Rwanda's 3 largest cities). 
This same syntax can be used to calculate the shortest distance to polygons (e.g., a lake) or polylines (e.g., roads). - -Import city-level dataset -```{r, warning=F, message=F} -rwa_cities <- read.csv(file.path(finalData, "rwa_cities.csv")) -coordinates(rwa_cities) <- ~lon+lat -crs(rwa_cities) <- CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0") -``` - -Re-project to equal distant projection -```{r, warning=F, message=F} -# Here, I used Kigali as the center -eq_dist_projection <- "+proj=aeqd +lat_0=-1.943889 +lon_0=30.059444" - -rwa_cities_ed <- spTransform(rwa_cities, CRS(eq_dist_projection)) -hh_data_sdf_ed <- spTransform(hh_data_sdf, CRS(eq_dist_projection)) -``` - -## -Creates distance matrix -```{r, warning=F, message=F} -gDistance(rwa_cities_ed, hh_data_sdf_ed, byid=T) %>% head -``` - -Calculate shortest distance to any city. -```{r, warning=F, message=F} -gDistance_cities_i <- function(i) gDistance(hh_data_sdf_ed[i,], rwa_cities_ed, byid=F) -hh_data_sdf_ed$dist_city <- lapply(1:nrow(hh_data_sdf_ed), gDistance_cities_i) %>% unlist -``` - -## Calculating Distances: Way 2 -There are also ways to calculate distances by using a geographic coordinate system (ie, keeping the world a sphere) and taking into account the curvator of the earth. __Haversine__ (good for short distances -- think the distance of a short haul flight) and __vincenty__ (slower but better for longer distances) [are common formulas](https://www.r-bloggers.com/great-circle-distance-calculations-in-r/) to calculate these distances. - -The below code uses the haversine method to calculate the distance between all the household locations and the first Rwandan city. The output is in meters. -```{r, eval=F} -library(geosphere) -distHaversine(hh_data_sdf, rwa_cities[1,]) -``` - -We can also calculate the distance between a point and a line using the `dist2Line` function from the `geosphere` package. 
Below shows an example between an arbitrary point and line shapefile using the haversine formula. -```{r, eval=F} -dist2Line(points, line, distfun=distHaversine) -``` - - - - diff --git a/Presentations/Lab 6 - Spatial Data.html b/Presentations/Lab 6 - Spatial Data.html deleted file mode 100644 index f9a9652..0000000 --- a/Presentations/Lab 6 - Spatial Data.html +++ /dev/null @@ -1,1136 +0,0 @@ - - - - Spatial Data in R - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-

-

-

-
-
- - - - - -

GIS Overview

- -

GIS Data

- -

There are two main types of spatial data: vector and raster data.

- -
    -
  • Vector: Vectors (also called shapefiles) consist of points, lines and polygons. These shapes are attached to a dataframe, where each row corresponds to a different spatial element.

  • -
  • Raster: Rasters are spatially-referenced grids where each cell has one value.

  • -
- -


- -
-

Shapefile

- -

Raster

- -

Coordinate Reference Systems

- -
    -
  • Coordinate reference systems map pairs of numbers to a location on the earth.
  • -
  • Geographic Coordinate Systems live on a sphere; here, the units are in decimal degrees
  • -
  • Using the WGS84 coordinate system the World Bank MC building is located at 38.89 degrees latitude and -77.04 degrees longitude.
  • -
  • Projected Coordinate Systems project the earth onto a flat surface. Doing this distorts the earth in some way (shape, area, distance or direction)
  • -
  • Using to the World Mercator projection, the World Bank is located 4680364.64 north and -8576320.73 east.
  • -
- -

- -

GIS in R System

- -

Here, we'll use the following packages

- -
    -
  • sp: Defines classes and methods for spatial objects.
  • -
  • rgdal: For processing spatial data, particularly for reading and writing spatial data.
  • -
  • rgeos: A vector processing library.
  • -
  • raster: Reading, writing, manipulating, analyzing and modeling of gridded spatial data.
  • -
- -

- -

Load and Map HH-Level Data

- -
# Packages and Filepaths -------------------------------------------------------
-library(sp)
-library(rgdal)
-library(rgeos)
-library(raster)
-
-hh_data <- read.csv(file.path(finalData,"HH_data.csv"))
-
-str(hh_data)
- -
## 'data.frame':    422 obs. of  5 variables:
-##  $ X                 : int  1 3 4 5 6 7 8 10 11 12 ...
-##  $ expend_food_yearly: num  250452 420029 484468 326109 131487 ...
-##  $ food_security     : Factor w/ 3 levels "Little Hunger",..: 1 2 1 1 3 3 1 2 3 1 ...
-##  $ longitude_scramble: num  30.4 30.4 30.4 30.4 30.4 ...
-##  $ latitude_scramble : num  -1.97 -2 -1.99 -1.98 -1.95 ...
- -

Make a Map

- -
library(ggplot2)
-ggplot() +
-  geom_point(data=hh_data, 
-             aes(x=longitude_scramble, y=latitude_scramble, color=food_security),
-             size=.7)
- -

- -

Make a Better Map

- -
hh_map1 <- ggplot() +
-
-  # Points
-  geom_point(data=hh_data, 
-             aes(x=longitude_scramble, y=latitude_scramble, color=food_security),
-             size=.7) +
-  
-  # Other elements to improve map
-  coord_quickmap() + # make sure the map doesn't look distorted. Can also use
-                     # coord_map(), but sometimes makes the process slow
-  theme_void() +
-  scale_color_manual(values=c("green", "orange", "red")) +
-  labs(title="Household Food Security", color="Food Security") +
-  theme(plot.title = element_text(hjust = 0.5, face="bold")) # center and bold title
- -

- -
hh_map1
- -

- -

Add a Basemap

- -
library(ggmap)
-basemap <- get_map(location = c(lon=mean(hh_data$longitude_scramble), 
-                                lat=mean(hh_data$latitude_scramble)), 
-                   zoom=10,
-                   maptype="roadmap") # roadmap, satellite, etc. See help(get_map)
-
-hh_map2 <- ggmap(basemap) +
-  geom_point(data=hh_data, 
-             aes(x=longitude_scramble, 
-                 y=latitude_scramble, 
-                 color=food_security),
-             size=.7) +
-  coord_quickmap() + # make sure the map doesn't look distorted. Can also use
-                     # coord_map(), but sometimes makes the process slow
-  theme_void() +
-  labs(title="Household Food Security", color="Food Security") +
-  theme(plot.title = element_text(hjust = 0.5, face="bold")) +
-  scale_color_manual(values=c("green", "orange", "red"))
- -

- -
hh_map2
- -

- -

- -
hh_map2 + 
-  scale_x_continuous(limits = c(min(hh_data$longitude_scramble), 
-                                max(hh_data$longitude_scramble)), 
-                     expand = c(.03, .03)) +
-  scale_y_continuous(limits = c(min(hh_data$latitude_scramble), 
-                                max(hh_data$latitude_scramble)), 
-                     expand = c(.03, .03))
- -

- -

Spatial Dataframe

- -

So far, we've just been working with a dataframe and using the longitude and latitude variables. However, for many other uses of spatial data we need to convert the dataframe into a spatial dataframe.

- -
    -
  • coordinates transforms a dataframe into a spatial dataframe. The syntax is: coordinates(DATA FRAME) <- ~LONGITUDE+LATITUDE, where LONGITUDE and LATITUDE are the names of variables in DATA FRAME.
  • -
  • After creating a spatial dataframe we need to tell it what coordinate reference system (CRS) the coordinates are in. We define the crs using crs.
  • -
- -
hh_data_sdf <- hh_data
-coordinates(hh_data_sdf) <- ~longitude_scramble+latitude_scramble
-crs(hh_data_sdf) <- CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0")
- -

Spatial Dataframe Structure

- -

Here, the variables are separated from the coordinates. Specifically, a spatial dataframe is a list, where one element of the list is a dataframe of variables and the other is a dataframe of coordinates.

- -
hh_data_sdf
- -
## class       : SpatialPointsDataFrame 
-## features    : 422 
-## extent      : 30.26138, 30.67035, -2.094064, -1.934281  (xmin, xmax, ymin, ymax)
-## coord. ref. : +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0 
-## variables   : 3
-## names       :   X, expend_food_yearly, food_security 
-## min values  :   1,              0.000, Little Hunger 
-## max values  : 553,        3034121.500, Severe Hunger
- -

- -

Access a dataframe containing the variables (except the coordinates)

- -
hh_data_sdf@data
- -


Access the coordinates of the points

- -
hh_data_sdf@coords
- -


We can still access variables like a normal dataframe

- -
hh_data_sdf@data$food_security
-hh_data_sdf$food_security # short-cut
- -

Quickly Plot Spatial Data

- -

We can quickly plot spatial dataframes.

- -
plot(hh_data_sdf)
- -

- -

Perform Spatial Operations

- -

See rgeos for a list of common functions to apply on vector data.

- -

Buffering

- -
library(rgeos)
-hh_data_sdf_buff <- gBuffer(hh_data_sdf, width= 0.5/111, byid=T) #buffer by about 0.5km (1 degree is roughly 111km)
-plot(hh_data_sdf_buff)
- -

- -

- -

Reproject

- -
head(hh_data_sdf@coords)
- -
##   longitude_scramble latitude_scramble
-## 1           30.41310         -1.965660
-## 2           30.42626         -2.003319
-## 3           30.42944         -1.986508
-## 4           30.42331         -1.976507
-## 5           30.42285         -1.952778
-## 6           30.41069         -2.006962
- -
hh_data_sdf_newproj <- spTransform(hh_data_sdf, CRS("+init=epsg:3857"))
-head(hh_data_sdf_newproj@coords)
- -
##   longitude_scramble latitude_scramble
-## 1            3385571         -218859.2
-## 2            3387035         -223053.9
-## 3            3387390         -221181.4
-## 4            3386707         -220067.4
-## 5            3386657         -217424.3
-## 6            3385302         -223459.7
- -

- -

Calculating Distances

- -

Note: See appendix for ways to calculate distances in a more accurate way (e.g., by using an equal area projection or by taking into account the curvature of the earth)

- -
dist_matrix <- gDistance(hh_data_sdf_newproj, byid=T)
-dist_matrix[1:7,1:7]
- -
##          1         2        3        4        5        6         7
-## 1    0.000 4442.9586 2950.070 1658.693 1799.332 4608.262 4019.3068
-## 2 4442.959    0.0000 1905.795 3004.456 5642.253 1779.875  850.3531
-## 3 2950.070 1905.7947    0.000 1306.708 3828.044 3090.272 1193.2667
-## 4 1658.693 3004.4565 1306.708    0.000 2643.513 3671.725 2442.9956
-## 5 1799.332 5642.2528 3828.044 2643.513    0.000 6185.365 5019.0051
-## 6 4608.262 1779.8748 3090.272 3671.725 6185.365    0.000 2486.2690
-## 7 4019.307  850.3531 1193.267 2442.996 5019.005 2486.269    0.0000
- -

Interactive Maps

- -

So far, we've been making static maps. But those are boring. Let's make an interactive map using Leaflet. There's a bunch of different basemaps to choose from; from the list here, enter the name of the basemap in quotes in the addProviderTiles function.

- -
library(leaflet)
-library(dplyr)
-
-imap_1 <- leaflet() %>%
-  addProviderTiles("OpenStreetMap") %>% 
-  addCircles(data=hh_data_sdf)
- -

NOTE: leaflet assumes you are using the following coordinate reference system:

- -
"+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0" 
- -

- -
imap_1
- -
- - -

- -

We can save our interactive map outside of R. Here, we use saveWidget from the htmlwidgets package. Double-clicking the HTML file will open it in any browser.

- -
library(htmlwidgets)
-saveWidget(imap_1, file=file.path(Output,"interactive_map.html"))
- -

Let's make a better interactive map

- -
# Define Palette
-pal <- colorFactor(
-  palette = c("Green","Yellow","Red"),
-  domain = hh_data_sdf$food_security)
-
-imap_2 <- leaflet() %>%
-  addProviderTiles("Stamen.Terrain") %>% 
-  addCircleMarkers(data=hh_data_sdf,
-                 radius=3,
-                 fillOpacity=1,
-                 color=~pal(food_security),
-                 stroke=F, # remove outer-circle around circle
-                 popup=~food_security) %>% # variable to display when click feature
-  # Add legend for points layer
-  addLegend(pal = pal, 
-          values = hh_data_sdf$food_security,
-          title = "Food Security")
- -

- -
imap_2
- -
- - -

Polygon Data

- -
# Load and Prep Plot-Level Data ------------------------------------------------
-setwd(file.path(finalData,"Shapefiles")) # set filepath to folder with shapefile
-ag_fields <- readOGR(dsn=".", layer="allsitessubset", verbose = FALSE)
-
-# Plot File Projection
-crs(ag_fields)
- -
## CRS arguments:
-##  +proj=tmerc +lat_0=0 +lon_0=30 +k=0.9999 +x_0=500000 +y_0=5000000
-## +ellps=GRS80 +units=m +no_defs
- -
# HH Survey Location Projection
-crs(hh_data_sdf)
- -
## CRS arguments:
-##  +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0
- -
# Reproject plot data to have same projection as HH survey data
-hh_dd_proj <- as.character(crs(hh_data_sdf))
-ag_fields <- spTransform(ag_fields, CRS(hh_dd_proj))
- -

- -
plot(ag_fields)
- -

- -

- -
ag_fields
- -
## class       : SpatialPolygonsDataFrame 
-## features    : 19 
-## extent      : 30.25664, 30.67621, -2.095173, -1.930931  (xmin, xmax, ymin, ymax)
-## coord. ref. : +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0 
-## variables   : 1
-## names       :        site 
-## min values  :   Kayonza15 
-## max values  : Rwamagana35
- -
ag_fields@polygons Each polygon is defined by a matrix of vertices
- -

- -

Simplify the Shapefile

- -
head(ag_fields)
- -
##          site
-## 0 Rwamagana33
-## 1 Rwamagana33
-## 2   Kayonza15
-## 3   Kayonza15
-## 4 Rwamagana34
-## 5 Rwamagana34
- -

You'll notice that multiple features have the same name. Here, let's simplify the polygon so that each site name represents one feature.

- -

- -

- -

To aggregate (or dissolve) the polygon by the variable site, use the aggregate function from the raster package.

- -
ag_fields <- aggregate(ag_fields, by = "site")
- -
plot(ag_fields)
- -

- -

- -

In the previous slide, you'll notice there's a weird tic mark in one of the polygons. To get rid of this, let's buffer the polygon by a very small amount – which will help cover the hole in the field.

- -
ag_fields <- gBuffer(ag_fields, width=0.000001/111, byid=T)
-plot(ag_fields)
- -

- -

Prep Data for Map

- -

Before we make a map, let's merge in some plot-level data for each agricultural field. I've already prepared this data; it includes average food security and average food expenditure per agricultural field, based on the household data.

- -
ag_fields_data <- read.csv(file.path(finalData, "plot_data.csv"))
-ag_fields <- merge(ag_fields, ag_fields_data, by="site")
-summary(ag_fields@data)
- -
##           site   expend_food_yearly_mean food_security_mean
-##  Kayonza15  :1   Min.   :189101          Min.   :1.468     
-##  Kayonza4   :1   1st Qu.:262529          1st Qu.:1.599     
-##  Rwamagana2 :1   Median :321472          Median :1.736     
-##  Rwamagana33:1   Mean   :304092          Mean   :1.726     
-##  Rwamagana34:1   3rd Qu.:354403          3rd Qu.:1.898     
-##  Rwamagana35:1   Max.   :384243          Max.   :1.915
- -

- -

ggplot can't interpret spatial dataframes. Consequently, we need to transform the data into a format that ggplot can understand. Here, we make a dataframe where each vertex of the polygon is an observation.

- -
library(broom)
-
-ag_fields_df <- tidy(ag_fields, region="site")
-head(ag_fields_df)
- -
##       long       lat order  hole piece       group        id
-## 1 30.58359 -1.936452     1 FALSE     1 Kayonza15.1 Kayonza15
-## 2 30.58364 -1.936513     2 FALSE     1 Kayonza15.1 Kayonza15
-## 3 30.58364 -1.936496     3 FALSE     1 Kayonza15.1 Kayonza15
-## 4 30.58366 -1.936071     4 FALSE     1 Kayonza15.1 Kayonza15
-## 5 30.58366 -1.936071     5 FALSE     1 Kayonza15.1 Kayonza15
-## 6 30.58366 -1.936032     6 FALSE     1 Kayonza15.1 Kayonza15
- -

The resulting dataframe from tidy:

- -
    -
  • Has the variable id, which is taken from the variable in the region argument.
  • -
  • Doesn't have any of our other variables (e.g., food security).
  • -
  • We can merge our other variables (in ag_fields) into this dataframe using the site variable from ag_fields and the id variable from ag_fields_df.
  • -
- -

- -
ag_fields_df <- merge(ag_fields_df, ag_fields, by.x="id", by.y="site")
-head(ag_fields_df)
- -
##          id     long       lat order  hole piece       group
-## 1 Kayonza15 30.58359 -1.936452     1 FALSE     1 Kayonza15.1
-## 2 Kayonza15 30.58364 -1.936513     2 FALSE     1 Kayonza15.1
-## 3 Kayonza15 30.58364 -1.936496     3 FALSE     1 Kayonza15.1
-## 4 Kayonza15 30.58366 -1.936071     4 FALSE     1 Kayonza15.1
-## 5 Kayonza15 30.58366 -1.936071     5 FALSE     1 Kayonza15.1
-## 6 Kayonza15 30.58366 -1.936032     6 FALSE     1 Kayonza15.1
-##   expend_food_yearly_mean food_security_mean
-## 1                189101.4           1.914634
-## 2                189101.4           1.914634
-## 3                189101.4           1.914634
-## 4                189101.4           1.914634
-## 5                189101.4           1.914634
-## 6                189101.4           1.914634
- -

Now, let's make a map

- -
ggplot() + 
-  geom_polygon(data = ag_fields_df, aes(x = long, y=lat, group=group, 
-                                    fill=expend_food_yearly_mean))
- -

- -

Let's make a better map

- -
ag_map1 <- ggplot() + 
-  geom_polygon(data = ag_fields_df, aes(x = long, y=lat, group=group, 
-                                    fill=expend_food_yearly_mean),
-               color="black", size=0.25) + # Borders of polygons
-  
-  coord_quickmap() + # Make sure map doesn't look distorted
-  theme_void() + # Set theme of plot 
-  scale_fill_gradient(low = "orangered1", high = "green1") + # Color Gradient
-  labs(fill="Food\nExpenditure", title="Annual Food Expenditure") + # Labels
-  theme(plot.title = element_text(hjust = 0.5, face="bold")) # Center title
- -

- -
ag_map1
- -

- -

Let's use a different color palette

- - - -
library(RColorBrewer)
- -

- -

- -
ag_map2 <- ggplot() + 
-  geom_polygon(data = ag_fields_df, aes(x = long, y=lat, group=group, 
-                                    fill=expend_food_yearly_mean),
-               color="black", size=0.25) + # Borders of polygons
-  
-  coord_quickmap() + 
-  theme_void() + 
-  scale_fill_distiller(palette = "Spectral", direction = 1) + # Color Gradient
-  labs(fill="Food\nExpenditure", title="Annual Food Expenditure") + 
-  theme(plot.title = element_text(hjust = 0.5, face="bold")) 
- -

- -
ag_map2
- -

- -

ggplot colors scale summary

- -

Defining your own color scale

- -
    -
  • scale_*_gradient(low, high): [For continuous variable] Manually define low/high colors
  • -
  • scale_*_gradientn(colors): [For continuous variable] Use a defined list of colors (e.g., c("purple","blue","yellow","white"))
  • -
  • scale_*_manual(colors): [For discrete variable] Use a defined list of colors, where the list is equal to the number of unique observations in the discrete variable.
  • -
- -

Using pre-defined palettes

- -
    -
  • scale_*_distiller(palette): [For continuous variables]
  • -
  • scale_*_brewer(palette): [For discrete variables]
  • -
- -

Where * is either color or fill. Discrete variables = factor variables.

- -

Add text to map

- -

Now, let's add text to the map showing regions. To do this, we need to create a dataframe that includes the lat/lon of the center of each plot and a variable for the site name. To do this, we use:

- -
    -
  • gCentroid() which returns a shapefile of the center of each region
  • -
  • coordinates which returns a matrix of the coordinates of a shapefile, where the first column is longitude and the second column is latitude.
  • -
- -
# Create dataframe of center of each plot with site name as variable
-ag_fields_center <- gCentroid(ag_fields, byid=T) 
-ag_fields_center <- as.data.frame(coordinates(ag_fields_center))
-
-ag_fields_center$site <- ag_fields$site
-names(ag_fields_center) <- c("longitude","latitude","site")
- -

- -

Now, let's use geom_text to add text to our map.

- -
ag_map2 + geom_text(data=ag_fields_center, aes(label=site, x=longitude, y=latitude),
-            check_overlap = TRUE) # makes sure text doesn't overlap
- -

- -

- -

If you have lots of cluttered text, use geom_text_repel from ggrepel.

- -
library(ggrepel)
-
-ag_map2 + geom_text_repel(data=ag_fields_center, aes(label=site, x=longitude, y=latitude))
- -

- -

Let's add HH locations to the map

- -
ag_map2 <- ggplot() + 
-  geom_polygon(data = ag_fields_df, aes(x = long, y=lat, group=group, 
-                                    fill=expend_food_yearly_mean),
-               color="black", size=0.25) + # Borders of polygons
-  geom_point(data=hh_data, 
-           aes(x=longitude_scramble, 
-               y=latitude_scramble, 
-               color="HH Location"), # Label the aesthetic as you want it to 
-                                     # appear on legend
-           size=.1, alpha=.6) +
-  scale_color_manual(values="black") + # Manually define color of points
-  coord_quickmap() + 
-  theme_void() +
-  scale_fill_gradient(low = "orangered1", high = "green1") + 
-  labs(fill="Food\nExpenditure", title="Annual Food Expenditure", 
-       color="") + # Setting color="" makes the title above the legend item blank
-  theme(plot.title = element_text(hjust = 0.5, face="bold")) 
- -

- -
ag_map2
- -

- -

Add Polygon to Interactive Map

- -
leaflet() %>%
-  addProviderTiles("OpenStreetMap") %>% 
-  addPolygons(data=ag_fields)
- -
- - -

- -
pal_foodexpend <- colorNumeric("RdYlGn", ag_fields$expend_food_yearly_mean)
-
-imap_3 <- leaflet() %>%
-  addProviderTiles("OpenStreetMap") %>%
-  addPolygons(data=ag_fields,
-            fillColor = ~pal_foodexpend(expend_food_yearly_mean),
-            fillOpacity = 0.6,
-            color="black", # color of line around polygons
-            weight=1, # width of line around polygons
-            popup=~site)
- -

- -
imap_3
- -
- - -

Map Using Admin Data

- -

Now, let's make a map using administrative-level data. The getData function from the raster package allows us to download data from GADM (the Database of Global Administrative Areas). Let's download the third administrative division for Rwanda.

- -
rwa_adm <- getData('GADM', country='RWA', level=3)
-rwa_adm
- -
## class       : SpatialPolygonsDataFrame 
-## features    : 422 
-## extent      : 28.86171, 30.89907, -2.839973, -1.04745  (xmin, xmax, ymin, ymax)
-## coord. ref. : +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0 
-## variables   : 16
-## names       : OBJECTID, ID_0, ISO, NAME_0, ID_1,           NAME_1, ID_2,    NAME_2, ID_3, NAME_3, CCN_3, CCA_3,     TYPE_3,  ENGTYPE_3, NL_NAME_3, ... 
-## min values  :        1,  189, RWA, Rwanda,    1,     Amajyaruguru,    1,  Bugesera,    1,   Base,     0,     0,     Sector,     Sector,          , ... 
-## max values  :      422,  189, RWA, Rwanda,    5, Umujyi wa Kigali,   30, Rwamagana,  422,   Zaza,  5715,  5715, Water body, Water body,          , ...
- -

- -
head(rwa_adm)
- -
##   OBJECTID ID_0 ISO NAME_0 ID_1       NAME_1 ID_2 NAME_2 ID_3  NAME_3
-## 1        1  189 RWA Rwanda    1 Amajyaruguru    1 Burera    1  Bungwe
-## 2        2  189 RWA Rwanda    1 Amajyaruguru    1 Burera    2  Butaro
-## 3        3  189 RWA Rwanda    1 Amajyaruguru    1 Burera    3 Cyanika
-## 4        4  189 RWA Rwanda    1 Amajyaruguru    1 Burera    4   Cyeru
-## 5        5  189 RWA Rwanda    1 Amajyaruguru    1 Burera    5 Gahunga
-## 6        6  189 RWA Rwanda    1 Amajyaruguru    1 Burera    6  Gatebe
-##   CCN_3 CCA_3 TYPE_3 ENGTYPE_3 NL_NAME_3 VARNAME_3
-## 1  4401  4401 Sector    Sector                    
-## 2  4402  4402 Sector    Sector                    
-## 3  4403  4403 Sector    Sector                    
-## 4  4404  4404 Sector    Sector                    
-## 5  4405  4405 Sector    Sector                    
-## 6  4406  4406 Sector    Sector
- -

- -

I've prepared a dataset of average vegetation levels per administrative unit in Rwanda. Here, 0 means no vegetation and 1 means significant vegetation.

- -
rwa_veg <- read.csv(file.path(finalData, "rwanda_vegetation.csv"))
-head(rwa_veg)
- -
##   X OBJECTID vegetation
-## 1 1        1      0.542
-## 2 2        2      0.602
-## 3 3        3      0.677
-## 4 4        4      0.604
-## 5 5        5      0.591
-## 6 6        6      0.546
- -

Exercise

- -

Make an administrative-level map of vegetation levels. Here's a broad outline of steps you'll need to implement:

- -
    -
  • Merge the vegetation data with the Rwanda spatial dataframe.
  • -
  • Convert the spatial dataframe into a dataframe that is interpretable by ggplot
  • -
  • Make the map using ggplot.
  • -
  • Note: There are some NA values; think about effective ways to map those.
  • -
  • Hint: If your computer is taking a long time to plot, run the below code which simplifies the polygon (i.e., removes some of the vertices to simplify the shape). Larger values of tol make the polygon more simplified. I played around with values and found 0.003 to be adequate. Note that large values of tol might reduce the number of polygons!
  • -
- -
rwa_adm_simple <- gSimplify(rwa_adm, tol=0.003) # simplifies the polygon
-
-# Resulting layer from gSimplify has no variables, so need to add a variable back in
-rwa_adm_simple$OBJECTID <- rwa_adm$OBJECTID 
- -

Solution

- -
# 1. Merge data together -------------------------------------------------------
-rwa_adm_simple <- merge(rwa_adm_simple, rwa_veg, by="OBJECTID")
-
-# 2. Make dataframe interpretable by ggplot ------------------------------------
-rwa_adm_df <- tidy(rwa_adm_simple, region="OBJECTID")
-rwa_adm_df <- merge(rwa_adm_df, rwa_adm_simple, by.x="id", by.y="OBJECTID")
-
-# 3. Make Plot -----------------------------------------------------------------
-veg_map <- ggplot() +
-  geom_polygon(data=rwa_adm_df, aes(x=long, y=lat, group=group), 
-               fill="white", color="black",size=.1) +
-  geom_polygon(data=rwa_adm_df[!is.na(rwa_adm_df$vegetation),], 
-             aes(x=long, y=lat, group=group, fill=vegetation),
-             color="black",size=.1) +
-  theme_void() +
-  scale_fill_distiller(palette="RdYlGn", direction=1) +
-  labs(fill="Vegetation", title="Vegetation in Rwanda") +
-  theme(plot.title = element_text(hjust = 0.5, face="bold"))
- -

- -
veg_map
- -

- -

APPENDIX

- -

Raster Files

- -

Satellite imagery often comes in multiple bands. Each band captures a different part of the electromagnetic spectrum. Different combinations of bands are used to create spectral indices which are used to highlight different land cover elements, such as vegetation, built-up area, or burnt areas.

- -
-

- -

- -

\[NDVI = \frac{NIR-Red}{NIR+Red}\]

- -

Some common satellites include

- -
    -
  • Landsat: Captures images across the earth every 16 days at a 30 meter resolution across multiple spectral bands. Landsat began collecting images in 1972 and continues to the present.

  • -
  • Sentinel: Captures images across the earth every 5 days (at the equator) at a 10 meter resolution across multiple spectral bands. Began in 2014 and continues to the present.

  • -
  • MODIS: Captures daily to bi-weekly imagery ranging from a 250m to 500m resolution.

  • -
  • VIIRS: Monthly nighttime lights at a 750m resolution from April 2012 to present.

  • -
- -

Load and Plot NDVI Around Rwanda

- -
ndvi_2012_a <- raster(file.path(finalData, "Tiffs", "ndvi_2012_0203.tif"))
-plot(ndvi_2012_a) 
-plot(ag_fields,add=T)
- -

- -

Crop Image to Agricultural Fields

- -
ndvi_2012_a <- crop(ndvi_2012_a,  ag_fields)
-plot(ndvi_2012_a)
-plot(ag_fields, add=T)
- -

- -

Mask Image to Agricultural Fields

- -
ndvi_2012_a <- mask(ndvi_2012_a,  ag_fields)
-plot(ndvi_2012_a)
- -

- -

Plot Raster Using ggplot

- -

You can also plot rasters using ggplot. Here, we need to convert the raster into a dataframe with latitude, longitude and the values. Then, we use geom_tile.

- -
ndvi_2012_a_df <- as(ndvi_2012_a, "SpatialPixelsDataFrame")
-ndvi_2012_a_df <- as.data.frame(ndvi_2012_a_df)
-
-ggplot() +
-  geom_tile(data=ndvi_2012_a_df, aes(x=x, y=y, fill=ndvi_2012_0203),alpha=1) +
-  scale_fill_gradient(low="red",high="green") +
-  coord_quickmap() + 
-  theme_void() +
-  labs(fill="NDVI", title="NDVI")
- -

- -

- -

Plot Raster Using rasterVis

- - - -

We can also plot the raster using other packages that don't require first converting the object to a dataframe. Here, we use rasterVis.

- -
library(rasterVis)
-
-cols <- colorRampPalette(brewer.pal(9,"RdYlGn"))
-
-levelplot(ndvi_2012_a,
-          col.regions = cols)
- -

- -

- -

Make better map

- -
levelplot(ndvi_2012_a,
-          col.regions = cols,
-          margin=F, # No histograms on side
-          par.settings=
-             list(axis.line=list(col='transparent')), # No box around image
-          scales=list(draw=FALSE), # No latitude and longitude markers
-          xlab="",
-          ylab="",
-          main="NDVI 2012 (Season A)",
-          colorkey=list(space="right"), # legend location. can change to bottom,
-                                        # left and top. If you don't want a 
-                                        # legend, use: colorkey = F
-          maxpixels=1e6 # sometimes blends pixels together
-          ) +
-  layer(sp.polygons(ag_fields, 
-                    col="black", # border color
-                    lwd=1, # line width
-                    alpha=1))
- -

- -

- -

Raster Arithmetic

- -

Note that NDVI goes from -1 to 1; MODIS scales this value by 10000 so we'll scale this back to -1 to 1 by multiplying values by 0.0001. There are two ways to do arithmetic on rasters:

- -
raster_new <- raster * 0.0001
- -

OR

- -
raster_new <- calc(raster, fun = function(x) x * 0.0001)
- -

The second way looks ugly, but it is more efficient.

- -

Plot Multiple Images

- -
library(rasterVis)
-library(RColorBrewer)
-
-ndvi_2012_a <- raster(file.path(finalData, "Tiffs", "ndvi_2012_0203.tif")) %>%
-  crop(ag_fields) %>%
-  mask(ag_fields) %>%
-  calc(fun=function(x) x*0.0001)
-ndvi_2012_b <- raster(file.path(finalData, "Tiffs", "ndvi_2012_0910.tif")) %>%
-  crop(ag_fields) %>%
-  mask(ag_fields) %>%
-  calc(fun=function(x) x*0.0001)
-ndvi_2016_a <- raster(file.path(finalData, "Tiffs", "ndvi_2016_0203.tif")) %>%
-  crop(ag_fields) %>%
-  mask(ag_fields) %>%
-  calc(fun=function(x) x*0.0001)
-ndvi_2016_b <- raster(file.path(finalData, "Tiffs", "ndvi_2016_0910.tif")) %>%
-  crop(ag_fields) %>%
-  mask(ag_fields) %>%
-  calc(fun=function(x) x*0.0001)
-
-ndvi_stack <- stack(ndvi_2012_a, ndvi_2012_b, 
-                    ndvi_2016_a, ndvi_2016_b)
-
-cols <- colorRampPalette(brewer.pal(9,"RdYlGn"))
- -

- -
levelplot(ndvi_stack,
-          main="NDVI Across Seasons",
-          col.regions=cols,
-          layout=c(2, 2))
- -

- -

Interactive Map with Rasters

- -
pal <- colorNumeric(c("red3", "yellow", "palegreen4"), values(ndvi_stack),
-                    na.color = "transparent")
-
-NDVI_interactive <- leaflet() %>% 
-  addProviderTiles("OpenStreetMap") %>%
-  addRasterImage(ndvi_2012_a, colors = pal, opacity = 1, group="2012 A") %>%
-  addRasterImage(ndvi_2012_b, colors = pal, opacity = 1, group="2012 B") %>%
-  addRasterImage(ndvi_2016_a, colors = pal, opacity = 1, group="2016 A") %>%
-  addRasterImage(ndvi_2016_b, colors = pal, opacity = 1, group="2016 B") %>%
-  addLegend(pal = pal, values = values(ndvi_stack),
-    title = "NDVI") %>%
-  addLayersControl(
-    overlayGroups = c("2012 A","2012 B","2016 A","2016 B"),
-    options = layersControlOptions(collapsed = FALSE))
- -

- -
NDVI_interactive
- -
- - -

Summarize NDVI By Plot

- -

Let's say we want to determine average NDVI values for each plot or the NDVI value under each household location. To do this, we use extract from the raster package.

- -
# Average NDVI Value per Plot
-ag_fields$NDVI_2012_a <- extract(ndvi_2012_a, ag_fields, fun=mean)
-
-# NDVI Value per HH Location
-hh_data_sdf$NDVI_2012_a <- extract(ndvi_2012_a, hh_data_sdf)
- -

Sometimes this process can be slow. To (dramatically) increase the speed of this task, use the velox package. This is built to work with spatial polygons (not points).

- -
    -
  • velox(raster) converts the raster object to a velox raster object. From the velox object you can use the extract method, where the inputs are the shapefile and the summarizing function.
  • -
- -
library(velox)
-ag_fields$ndvi_2012_a_velox <- velox(ndvi_2012_a)$extract(ag_fields, fun=mean)
- -

- -
subset(ag_fields@data, select=c(site, NDVI_2012_a, ndvi_2012_a_velox))
- -
##          site NDVI_2012_a ndvi_2012_a_velox
-## 1   Kayonza15   0.4951754         0.4951754
-## 2    Kayonza4   0.5363880         0.5363880
-## 3  Rwamagana2   0.5958732         0.5958732
-## 4 Rwamagana33   0.5472275         0.5472275
-## 5 Rwamagana34   0.5667049         0.5667049
-## 6 Rwamagana35   0.5793644         0.5793644
- -

Useful Projections

- -

Geographic Coordinate System

- -

The most common geographic coordinate system is the World Geodetic System (WGS84).

- -
"+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
- -

Projected Coordinate Systems

- -

Below are projections for equal area and equal distance. Replace [LATITUDE VALUE] and [LONGITUDE VALUE] with the center of the area that you're working with.

- -

Equal Area: Lambert Azimuthal Equal Area

- -
"+proj=laea +lat_0=[LATITUDE VALUE] +lon_0=[LONGITUDE VALUE]"
- -

Equal Distance: Azimuthal equidistant projection

- -
"+proj=aeqd +lat_0=[LATITUDE VALUE] +lon_0=[LONGITUDE VALUE]"
- -

Calculating Distances: Way 1

- -

A common task in GIS is to calculate distances. Here, let's calculate the distance of each household location to the nearest city (using a dataset of Rwanda's 3 largest cities). This same syntax can be used to calculate the shortest distance to polygons (e.g., a lake) or polylines (e.g., roads).

- -

Import city-level dataset

- -
rwa_cities <- read.csv(file.path(finalData, "rwa_cities.csv"))
-coordinates(rwa_cities) <- ~lon+lat
-crs(rwa_cities) <- CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0")
- -

Re-project to an equidistant projection

- -
# Here, I used Kigali as the center
-eq_dist_projection <- "+proj=aeqd +lat_0=-1.943889 +lon_0=30.059444"
-
-rwa_cities_ed <- spTransform(rwa_cities, CRS(eq_dist_projection))
-hh_data_sdf_ed <- spTransform(hh_data_sdf, CRS(eq_dist_projection))
- -

- -

Creates distance matrix

- -
gDistance(rwa_cities_ed, hh_data_sdf_ed, byid=T) %>% head
- -
##          1         2        3
-## 1 39419.69 101785.29 74845.93
-## 2 41335.03 100056.47 75709.40
-## 3 41433.12 101555.78 76291.83
-## 4 40642.18 101794.90 75775.02
-## 5 40443.28 103553.65 76166.05
-## 6 39694.79  98492.55 73943.48
- -

Calculate shortest distance to any city.

- -
gDistance_cities_i <- function(i) gDistance(hh_data_sdf_ed[i,], rwa_cities_ed, byid=F)
-hh_data_sdf_ed$dist_city <- lapply(1:nrow(hh_data_sdf_ed), gDistance_cities_i) %>% unlist
- -

Calculating Distances: Way 2

- -

There are also ways to calculate distances by using a geographic coordinate system (i.e., keeping the world a sphere) and taking into account the curvature of the earth. Haversine (good for short distances – think the distance of a short haul flight) and Vincenty (slower but better for longer distances) are common formulas to calculate these distances.

- -

The below code uses the haversine method to calculate the distance between all the household locations and the first Rwandan city. The output is in meters.

- -
library(geosphere)
-distHaversine(hh_data_sdf, rwa_cities[1,]) 
- -

We can also calculate the distance between a point and a line using the dist2Line function from the geosphere package. Below shows an example between an arbitrary point and line shapefile using the haversine formula.

- -
dist2Line(points, line, distfun=distHaversine)
- - - - -
- - - - - - - - - diff --git a/Presentations/Lab_1_-_Intro_to_R.pdf b/Presentations/Lab_1_-_Intro_to_R.pdf new file mode 100644 index 0000000..67db1c2 Binary files /dev/null and b/Presentations/Lab_1_-_Intro_to_R.pdf differ diff --git a/Presentations/Lab_2_-_Descriptive_analysis.pdf b/Presentations/Lab_2_-_Descriptive_analysis.pdf new file mode 100644 index 0000000..86ae4e7 Binary files /dev/null and b/Presentations/Lab_2_-_Descriptive_analysis.pdf differ diff --git a/Presentations/Lab_3_-_Data_visualization.pdf b/Presentations/Lab_3_-_Data_visualization.pdf new file mode 100644 index 0000000..153c9a3 Binary files /dev/null and b/Presentations/Lab_3_-_Data_visualization.pdf differ diff --git a/Presentations/Lab_4_-_Coding_for_Reproducible_Research.pdf b/Presentations/Lab_4_-_Coding_for_Reproducible_Research.pdf new file mode 100644 index 0000000..982b66f Binary files /dev/null and b/Presentations/Lab_4_-_Coding_for_Reproducible_Research.pdf differ diff --git a/Presentations/Lab_5_-_Spatial_Data.pdf b/Presentations/Lab_5_-_Spatial_Data.pdf new file mode 100644 index 0000000..5abcecc Binary files /dev/null and b/Presentations/Lab_5_-_Spatial_Data.pdf differ diff --git a/Presentations/Lab_6_-_Data_Processing.pdf b/Presentations/Lab_6_-_Data_Processing.pdf new file mode 100644 index 0000000..2361c89 Binary files /dev/null and b/Presentations/Lab_6_-_Data_Processing.pdf differ diff --git a/Presentations/Lab_6_-_Spatial_Data.html b/Presentations/Lab_6_-_Spatial_Data.html deleted file mode 100644 index a96dbea..0000000 --- a/Presentations/Lab_6_-_Spatial_Data.html +++ /dev/null @@ -1,1136 +0,0 @@ - - - - Spatial Data in R - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-

-

-

-
-
- - - - - -

GIS Overview

- -

GIS Data

- -

There are two main types of spatial data: vector and raster data.

- -
    -
  • Vector: Vectors (also called shapefiles) consist of points, lines and polygons. These shapes are attached to a dataframe, where each row corresponds to a different spatial element.

  • -
  • Raster: Rasters are spatially-referenced grids where each cell has one value.

  • -
- -


- -
-

Shapefile

- -

Raster

- -

Coordinate Reference Systems

- -
    -
  • Coordinate reference systems map pairs of numbers to a location on the earth.
  • -
  • Geographic Coordinate Systems live on a sphere; here, the units are in decimal degrees
  • -
  • Using the WGS84 coordinate system the World Bank MC building is located at 38.89 degrees latitude and -77.04 degrees longitude.
  • -
  • Projected Coordinate Systems project the earth onto a flat surface. Doing this distorts the earth in some way (shape, area, distance or direction)
  • -
  • Using the World Mercator projection, the World Bank is located at 4680364.64 north and -8576320.73 east.
  • -
- -

- -

GIS in R System

- -

Here, we'll use the following packages

- -
    -
  • sp: Defines classes and methods for spatial objects.
  • -
  • rgdal: For processing spatial data, particularly for reading and writing spatial data.
  • -
  • rgeos: A vector processing library.
  • -
  • raster: Reading, writing, manipulating, analyzing and modeling of gridded spatial data.
  • -
- -

- -

Load and Map HH-Level Data

- -
# Packages and Filepaths -------------------------------------------------------
-library(sp)
-library(rgdal)
-library(rgeos)
-library(raster)
-
-hh_data <- read.csv(file.path(finalData,"HH_data.csv"))
-
-str(hh_data)
- -
## 'data.frame':    422 obs. of  5 variables:
-##  $ X                 : int  1 3 4 5 6 7 8 10 11 12 ...
-##  $ expend_food_yearly: num  250452 420029 484468 326109 131487 ...
-##  $ food_security     : Factor w/ 3 levels "Little Hunger",..: 1 2 1 1 3 3 1 2 3 1 ...
-##  $ longitude_scramble: num  30.4 30.4 30.4 30.4 30.4 ...
-##  $ latitude_scramble : num  -1.97 -2 -1.99 -1.98 -1.95 ...
- -

Make a Map

- -
library(ggplot2)
-ggplot() +
-  geom_point(data=hh_data, 
-             aes(x=longitude_scramble, y=latitude_scramble, color=food_security),
-             size=.7)
- -

- -

Make a Better Map

- -
hh_map1 <- ggplot() +
-
-  # Points
-  geom_point(data=hh_data, 
-             aes(x=longitude_scramble, y=latitude_scramble, color=food_security),
-             size=.7) +
-  
-  # Other elements to improve map
-  coord_quickmap() + # make sure the map doesn't look distorted. Can also use
-                     # coord_map(), but sometimes makes the process slow
-  theme_void() +
-  scale_color_manual(values=c("green", "orange", "red")) +
-  labs(title="Household Food Security", color="Food Security") +
-  theme(plot.title = element_text(hjust = 0.5, face="bold")) # center and bold title
- -

- -
hh_map1
- -

- -

Add a Basemap

- -
library(ggmap)
-basemap <- get_map(location = c(lon=mean(hh_data$longitude_scramble), 
-                                lat=mean(hh_data$latitude_scramble)), 
-                   zoom=10,
-                   maptype="roadmap") # roadmap, satellite, etc. See help(get_map)
-
-hh_map2 <- ggmap(basemap) +
-  geom_point(data=hh_data, 
-             aes(x=longitude_scramble, 
-                 y=latitude_scramble, 
-                 color=food_security),
-             size=.7) +
-  coord_quickmap() + # make sure the map doesn't look distorted. Can also use
-                     # coord_map(), but sometimes makes the process slow
-  theme_void() +
-  labs(title="Household Food Security", color="Food Security") +
-  theme(plot.title = element_text(hjust = 0.5, face="bold")) +
-  scale_color_manual(values=c("green", "orange", "red"))
- -

- -
hh_map2
- -

- -

- -
hh_map2 + 
-  scale_x_continuous(limits = c(min(hh_data$longitude_scramble), 
-                                max(hh_data$longitude_scramble)), 
-                     expand = c(.03, .03)) +
-  scale_y_continuous(limits = c(min(hh_data$latitude_scramble), 
-                                max(hh_data$latitude_scramble)), 
-                     expand = c(.03, .03))
- -

- -

Spatial Dataframe

- -

So far, we've just been working with a dataframe and using the longitude and latitude variables. However, for many other uses of spatial data we need to convert the dataframe into a spatial dataframe.

- -
    -
  • coordinates transforms a dataframe into a spatial dataframe. The syntax is: coordinates(DATA FRAME) <- ~LONGITUDE+LATITUDE, where LONGITUDE and LATITUDE are the names of variables in DATA FRAME.
  • -
  • After creating a spatial dataframe we need to tell it what coordinate reference system (CRS) the coordinates are in. We define the crs using crs.
  • -
- -
hh_data_sdf <- hh_data
-coordinates(hh_data_sdf) <- ~longitude_scramble+latitude_scramble
-crs(hh_data_sdf) <- CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0")
- -

Spatial Dataframe Structure

- -

Here, the variables are separated from the coordinates. Specifically, a spatial dataframe is a list, where one element of the list is a dataframe of variables and the other is a dataframe of coordinates.

- -
hh_data_sdf
- -
## class       : SpatialPointsDataFrame 
-## features    : 422 
-## extent      : 30.26138, 30.67035, -2.094064, -1.934281  (xmin, xmax, ymin, ymax)
-## coord. ref. : +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0 
-## variables   : 3
-## names       :   X, expend_food_yearly, food_security 
-## min values  :   1,              0.000, Little Hunger 
-## max values  : 553,        3034121.500, Severe Hunger
- -

- -

Access a dataframe containing the variables (except the coordinates)

- -
hh_data_sdf@data
- -


Access the coordinates of the points

- -
hh_data_sdf@coords
- -


We can still access variables like a normal dataframe

- -
hh_data_sdf@data$food_security
-hh_data_sdf$food_security # short-cut
- -

Quickly Plot Spatial Data

- -

We can quickly plot spatial dataframes.

- -
plot(hh_data_sdf)
- -

- -

Perform Spatial Operations

- -

See rgeos for a list of common functions to apply on vector data.

- -

Buffering

- -
library(rgeos)
-hh_data_sdf_buff <- gBuffer(hh_data_sdf, width= 0.5/111, byid=T) # buffer by about 0.5 km (1 degree is roughly 111 km)
-plot(hh_data_sdf_buff)
- -

- -

- -

Reproject

- -
head(hh_data_sdf@coords)
- -
##   longitude_scramble latitude_scramble
-## 1           30.41310         -1.965660
-## 2           30.42626         -2.003319
-## 3           30.42944         -1.986508
-## 4           30.42331         -1.976507
-## 5           30.42285         -1.952778
-## 6           30.41069         -2.006962
- -
hh_data_sdf_newproj <- spTransform(hh_data_sdf, CRS("+init=epsg:3857"))
-head(hh_data_sdf_newproj@coords)
- -
##   longitude_scramble latitude_scramble
-## 1            3385571         -218859.2
-## 2            3387035         -223053.9
-## 3            3387390         -221181.4
-## 4            3386707         -220067.4
-## 5            3386657         -217424.3
-## 6            3385302         -223459.7
- -

- -

Calculating Distances

- -

Note: See the appendix for more accurate ways to calculate distances (e.g., by using an equidistant projection or by taking into account the curvature of the earth).

- -
dist_matrix <- gDistance(hh_data_sdf_newproj, byid=T)
-dist_matrix[1:7,1:7]
- -
##          1         2        3        4        5        6         7
-## 1    0.000 4442.9586 2950.070 1658.693 1799.332 4608.262 4019.3068
-## 2 4442.959    0.0000 1905.795 3004.456 5642.253 1779.875  850.3531
-## 3 2950.070 1905.7947    0.000 1306.708 3828.044 3090.272 1193.2667
-## 4 1658.693 3004.4565 1306.708    0.000 2643.513 3671.725 2442.9956
-## 5 1799.332 5642.2528 3828.044 2643.513    0.000 6185.365 5019.0051
-## 6 4608.262 1779.8748 3090.272 3671.725 6185.365    0.000 2486.2690
-## 7 4019.307  850.3531 1193.267 2442.996 5019.005 2486.269    0.0000
- -

Interactive Maps

- -

So far, we've been making static maps. But those are boring. Let's make an interactive map using Leaflet. There's a bunch of different basemaps to choose from; from the list here, enter the name of the basemap in quotes in the addProviderTiles function.

- -
library(leaflet)
-library(dplyr)
-
-imap_1 <- leaflet() %>%
-  addProviderTiles("OpenStreetMap") %>% 
-  addCircles(data=hh_data_sdf)
- -

NOTE: leaflet assumes you are using the following coordinate reference system:

- -
"+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0" 
- -

- -
imap_1
- -
- - -

- -

We can save our interactive map outside of R. Here, we use saveWidget from the htmlwidgets package. Doubleclicking the html file will open the file in any browser.

- -
library(htmlwidgets)
-saveWidget(imap_1, file=file.path(Output,"interactive_map.html"))
- -

Let's make a better interactive map

- -
# Define Palette
-pal <- colorFactor(
-  palette = c("Green","Yellow","Red"),
-  domain = hh_data_sdf$food_security)
-
-imap_2 <- leaflet() %>%
-  addProviderTiles("Stamen.Terrain") %>% 
-  addCircleMarkers(data=hh_data_sdf,
-                 radius=3,
-                 fillOpacity=1,
-                 color=~pal(food_security),
-                 stroke=F, # remove outer-circle around circle
-                 popup=~food_security) %>% # variable to display when click feature
-  # Add legend for points layer
-  addLegend(pal = pal, 
-          values = hh_data_sdf$food_security,
-          title = "Food Security")
- -

- -
imap_2
- -
- - -

Polygon Data

- -
# Load and Prep Plot-Level Data ------------------------------------------------
-setwd(file.path(finalData,"Shapefiles")) # set filepath to folder with shapefile
-ag_fields <- readOGR(dsn=".", layer="allsitessubset", verbose = FALSE)
-
-# Plot File Projection
-crs(ag_fields)
- -
## CRS arguments:
-##  +proj=tmerc +lat_0=0 +lon_0=30 +k=0.9999 +x_0=500000 +y_0=5000000
-## +ellps=GRS80 +units=m +no_defs
- -
# HH Survey Location Projection
-crs(hh_data_sdf)
- -
## CRS arguments:
-##  +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0
- -
# Reproject plot data to have same projection as HH survey data
-hh_dd_proj <- as.character(crs(hh_data_sdf))
-ag_fields <- spTransform(ag_fields, CRS(hh_dd_proj))
- -

- -
plot(ag_fields)
- -

- -

- -
ag_fields
- -
## class       : SpatialPolygonsDataFrame 
-## features    : 19 
-## extent      : 30.25664, 30.67621, -2.095173, -1.930931  (xmin, xmax, ymin, ymax)
-## coord. ref. : +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0 
-## variables   : 1
-## names       :        site 
-## min values  :   Kayonza15 
-## max values  : Rwamagana35
- -
ag_fields@polygons Each polygon is defined by a matrix of vertices
- -

- -

Simplify the Shapefile

- -
head(ag_fields)
- -
##          site
-## 0 Rwamagana33
-## 1 Rwamagana33
-## 2   Kayonza15
-## 3   Kayonza15
-## 4 Rwamagana34
-## 5 Rwamagana34
- -

You'll notice that multiple features have the same name. Here, let's simplify the polygon so that each site name represents one feature.

- -

- -

- -

To aggregate (or dissolve) the polygon by the variable site, use the aggregate function from the raster package.

- -
ag_fields <- aggregate(ag_fields, by = "site")
- -
plot(ag_fields)
- -

- -

- -

In the previous slide, you'll notice there's a weird tic mark in one of the polygons. To get rid of this, let's buffer the polygon by a very small amount – which will help cover the hole in the field.

- -
ag_fields <- gBuffer(ag_fields, width=0.000001/111, byid=T)
-plot(ag_fields)
- -

- -

Prep Data for Map

- -

Before we make a map, let's merge in some data for each agricultural field. I've already prepared this data; it includes average food security and average food expenditure per agricultural field, based on the household data.

- -
ag_fields_data <- read.csv(file.path(finalData, "plot_data.csv"))
-ag_fields <- merge(ag_fields, ag_fields_data, by="site")
-summary(ag_fields@data)
- -
##           site   expend_food_yearly_mean food_security_mean
-##  Kayonza15  :1   Min.   :189101          Min.   :1.468     
-##  Kayonza4   :1   1st Qu.:262529          1st Qu.:1.599     
-##  Rwamagana2 :1   Median :321472          Median :1.736     
-##  Rwamagana33:1   Mean   :304092          Mean   :1.726     
-##  Rwamagana34:1   3rd Qu.:354403          3rd Qu.:1.898     
-##  Rwamagana35:1   Max.   :384243          Max.   :1.915
- -

- -

ggplot can't interpret spatial dataframes. Consequently, we need to transform the data into a format that ggplot can understand. Here, we make a dataframe where each vertex of the polygon is an observation.

- -
library(broom)
-
-ag_fields_df <- tidy(ag_fields, region="site")
-head(ag_fields_df)
- -
##       long       lat order  hole piece       group        id
-## 1 30.58359 -1.936452     1 FALSE     1 Kayonza15.1 Kayonza15
-## 2 30.58364 -1.936513     2 FALSE     1 Kayonza15.1 Kayonza15
-## 3 30.58364 -1.936496     3 FALSE     1 Kayonza15.1 Kayonza15
-## 4 30.58366 -1.936071     4 FALSE     1 Kayonza15.1 Kayonza15
-## 5 30.58366 -1.936071     5 FALSE     1 Kayonza15.1 Kayonza15
-## 6 30.58366 -1.936032     6 FALSE     1 Kayonza15.1 Kayonza15
- -

The resulting dataframe from tidy:

- -
    -
  • Has the variable id, which is taken from the variable in the region argument.
  • -
  • Doesn't have any of our other variables (e.g., food security).
  • -
  • We can merge our other variables (in ag_fields) into this dataframe using the site variable from ag_fields and the id variable from ag_fields_df.
  • -
- -

- -
ag_fields_df <- merge(ag_fields_df, ag_fields, by.x="id", by.y="site")
-head(ag_fields_df)
- -
##          id     long       lat order  hole piece       group
-## 1 Kayonza15 30.58359 -1.936452     1 FALSE     1 Kayonza15.1
-## 2 Kayonza15 30.58364 -1.936513     2 FALSE     1 Kayonza15.1
-## 3 Kayonza15 30.58364 -1.936496     3 FALSE     1 Kayonza15.1
-## 4 Kayonza15 30.58366 -1.936071     4 FALSE     1 Kayonza15.1
-## 5 Kayonza15 30.58366 -1.936071     5 FALSE     1 Kayonza15.1
-## 6 Kayonza15 30.58366 -1.936032     6 FALSE     1 Kayonza15.1
-##   expend_food_yearly_mean food_security_mean
-## 1                189101.4           1.914634
-## 2                189101.4           1.914634
-## 3                189101.4           1.914634
-## 4                189101.4           1.914634
-## 5                189101.4           1.914634
-## 6                189101.4           1.914634
- -

Now, let's make a map

- -
ggplot() + 
-  geom_polygon(data = ag_fields_df, aes(x = long, y=lat, group=group, 
-                                    fill=expend_food_yearly_mean))
- -

- -

Let's make a better map

- -
ag_map1 <- ggplot() + 
-  geom_polygon(data = ag_fields_df, aes(x = long, y=lat, group=group, 
-                                    fill=expend_food_yearly_mean),
-               color="black", size=0.25) + # Borders of polygons
-  
-  coord_quickmap() + # Make sure map doesn't look distorted
-  theme_void() + # Set theme of plot 
-  scale_fill_gradient(low = "orangered1", high = "green1") + # Color Gradient
-  labs(fill="Food\nExpenditure", title="Annual Food Expenditure") + # Labels
-  theme(plot.title = element_text(hjust = 0.5, face="bold")) # Center title
- -

- -
ag_map1
- -

- -

Let's use a different color palette

- - - -
library(RColorBrewer)
- -

- -

- -
ag_map2 <- ggplot() + 
-  geom_polygon(data = ag_fields_df, aes(x = long, y=lat, group=group, 
-                                    fill=expend_food_yearly_mean),
-               color="black", size=0.25) + # Borders of polygons
-  
-  coord_quickmap() + 
-  theme_void() + 
-  scale_fill_distiller(palette = "Spectral", direction = 1) + # Color Gradient
-  labs(fill="Food\nExpenditure", title="Annual Food Expenditure") + 
-  theme(plot.title = element_text(hjust = 0.5, face="bold")) 
- -

- -
ag_map2
- -

- -

ggplot colors scale summary

- -

Defining your own color scale

- -
    -
  • scale_*_gradient(low, high): [For continuous variable] Manually define low/high colors
  • -
  • scale_*_gradientn(colors): [For continuous variable] Use a defined list of colors (e.g., c("purple","blue","yellow","white"))
  • -
  • scale_*_manual(colors): [For discrete variable] Use a defined list of colors, where the length of the list equals the number of unique values in the discrete variable.
  • -
- -

Using pre-defined palettes

- -
    -
  • scale_*_distiller(palette): [For continuous variables]
  • -
  • scale_*_brewer(palette): [For discrete variables]
  • -
- -

Where * is either color or fill. Discrete variables = factor variables.

- -

Add text to map

- -

Now, let's add text to the map showing regions. To do this, we need to create a dataframe that includes the lat/lon of the center of each plot and a variable for the site name. To do this, we use:

- -
    -
  • gCentroid() which returns a shapefile of the center of each region
  • -
  • coordinates which returns a matrix of the coordinates of a shapefile, where the first column is longitude and the second column is latitude.
  • -
- -
# Create dataframe of center of each plot with site name as variable
-ag_fields_center <- gCentroid(ag_fields, byid=T) 
-ag_fields_center <- as.data.frame(coordinates(ag_fields_center))
-
-ag_fields_center$site <- ag_fields$site
-names(ag_fields_center) <- c("longitude","latitude","site")
- -

- -

Now, let's use geom_text to add text to our map.

- -
ag_map2 + geom_text(data=ag_fields_center, aes(label=site, x=longitude, y=latitude),
-            check_overlap = TRUE) # makes sure text doesn't overlap
- -

- -

- -

If you have lots of cluttered text, use geom_text_repel from ggrepel.

- -
library(ggrepel)
-
-ag_map2 + geom_text_repel(data=ag_fields_center, aes(label=site, x=longitude, y=latitude))
- -

- -

Let's add HH locations to the map

- -
ag_map2 <- ggplot() + 
-  geom_polygon(data = ag_fields_df, aes(x = long, y=lat, group=group, 
-                                    fill=expend_food_yearly_mean),
-               color="black", size=0.25) + # Borders of polygons
-  geom_point(data=hh_data, 
-           aes(x=longitude_scramble, 
-               y=latitude_scramble, 
-               color="HH Location"), # Name an aesthetic want you want to 
-                                     # appear on legend
-           size=.1, alpha=.6) +
-  scale_color_manual(values="black") + # Manually define color of points
-  coord_quickmap() + 
-  theme_void() +
-  scale_fill_gradient(low = "orangered1", high = "green1") + 
-  labs(fill="Food\nExpenditure", title="Annual Food Expenditure", 
-       color="") + # Setting color="" makes the title above the legend item blank
-  theme(plot.title = element_text(hjust = 0.5, face="bold")) 
- -

- -
ag_map2
- -

- -

Add Polygon to Interactive Map

- -
leaflet() %>%
-  addProviderTiles("OpenStreetMap") %>% 
-  addPolygons(data=ag_fields)
- -
- - -

- -
pal_foodexpend <- colorNumeric("RdYlGn", ag_fields$expend_food_yearly_mean)
-
-imap_3 <- leaflet() %>%
-  addProviderTiles("OpenStreetMap") %>%
-  addPolygons(data=ag_fields,
-            fillColor = ~pal_foodexpend(expend_food_yearly_mean),
-            fillOpacity = 0.6,
-            color="black", # color of line around polygons
-            weight=1, # width of line around polygons
-            popup=~site)
- -

- -
imap_3
- -
- - -

Map Using Admin Data

- -

Now, let's make a map using administrative-level data. The getData function from the raster package allows us to download data from GADM (the Database of Global Administrative Areas). Let's download the third-level administrative divisions for Rwanda.

- -
rwa_adm <- getData('GADM', country='RWA', level=3)
-rwa_adm
- -
## class       : SpatialPolygonsDataFrame 
-## features    : 422 
-## extent      : 28.86171, 30.89907, -2.839973, -1.04745  (xmin, xmax, ymin, ymax)
-## coord. ref. : +proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0 
-## variables   : 16
-## names       : OBJECTID, ID_0, ISO, NAME_0, ID_1,           NAME_1, ID_2,    NAME_2, ID_3, NAME_3, CCN_3, CCA_3,     TYPE_3,  ENGTYPE_3, NL_NAME_3, ... 
-## min values  :        1,  189, RWA, Rwanda,    1,     Amajyaruguru,    1,  Bugesera,    1,   Base,     0,     0,     Sector,     Sector,          , ... 
-## max values  :      422,  189, RWA, Rwanda,    5, Umujyi wa Kigali,   30, Rwamagana,  422,   Zaza,  5715,  5715, Water body, Water body,          , ...
- -

- -
head(rwa_adm)
- -
##   OBJECTID ID_0 ISO NAME_0 ID_1       NAME_1 ID_2 NAME_2 ID_3  NAME_3
-## 1        1  189 RWA Rwanda    1 Amajyaruguru    1 Burera    1  Bungwe
-## 2        2  189 RWA Rwanda    1 Amajyaruguru    1 Burera    2  Butaro
-## 3        3  189 RWA Rwanda    1 Amajyaruguru    1 Burera    3 Cyanika
-## 4        4  189 RWA Rwanda    1 Amajyaruguru    1 Burera    4   Cyeru
-## 5        5  189 RWA Rwanda    1 Amajyaruguru    1 Burera    5 Gahunga
-## 6        6  189 RWA Rwanda    1 Amajyaruguru    1 Burera    6  Gatebe
-##   CCN_3 CCA_3 TYPE_3 ENGTYPE_3 NL_NAME_3 VARNAME_3
-## 1  4401  4401 Sector    Sector                    
-## 2  4402  4402 Sector    Sector                    
-## 3  4403  4403 Sector    Sector                    
-## 4  4404  4404 Sector    Sector                    
-## 5  4405  4405 Sector    Sector                    
-## 6  4406  4406 Sector    Sector
- -

- -

I've prepared a dataset of average vegetation levels per administrative unit in Rwanda. Here, 0 means no vegetation and 1 means significant vegetation.

- -
rwa_veg <- read.csv(file.path(finalData, "rwanda_vegetation.csv"))
-head(rwa_veg)
- -
##   X OBJECTID vegetation
-## 1 1        1      0.542
-## 2 2        2      0.602
-## 3 3        3      0.677
-## 4 4        4      0.604
-## 5 5        5      0.591
-## 6 6        6      0.546
- -

Exercise

- -

Make an administrative-level map of vegetation levels. Here's a broad outline of steps you'll need to implement:

- -
    -
  • Merge the vegetation data with the Rwanda spatial dataframe.
  • -
  • Convert the spatial dataframe into a dataframe that is interpretable by ggplot
  • -
  • Make the map using ggplot.
  • -
  • Note: There are some NA values; think about effective ways to map those.
  • -
  • Hint: If your computer is taking a long time to plot, run the code below, which simplifies the polygon (i.e., removes some of the vertices to simplify the shape). Larger values of tol make the polygon more simplified. I played around with values and found 0.003 to be adequate. Note that large values of tol might reduce the number of polygons!
  • -
- -
rwa_adm_simple <- gSimplify(rwa_adm, tol=0.003) # simplifies the polygon
-
-# Resulting layer from gSimplify has no variables, so need to add a variable back in
-rwa_adm_simple$OBJECTID <- rwa_adm$OBJECTID 
- -

Solution

- -
# 1. Merge data together -------------------------------------------------------
-rwa_adm_simple <- merge(rwa_adm_simple, rwa_veg, by="OBJECTID")
-
-# 2. Make dataframe interpretable by ggplot ------------------------------------
-rwa_adm_df <- tidy(rwa_adm_simple, region="OBJECTID")
-rwa_adm_df <- merge(rwa_adm_df, rwa_adm_simple, by.x="id", by.y="OBJECTID")
-
-# 3. Make Plot -----------------------------------------------------------------
-veg_map <- ggplot() +
-  geom_polygon(data=rwa_adm_df, aes(x=long, y=lat, group=group), 
-               fill="white", color="black",size=.1) +
-  geom_polygon(data=rwa_adm_df[!is.na(rwa_adm_df$vegetation),], 
-             aes(x=long, y=lat, group=group, fill=vegetation),
-             color="black",size=.1) +
-  theme_void() +
-  scale_fill_distiller(palette="RdYlGn", direction=1) +
-  labs(fill="Vegetation", title="Vegetation in Rwanda") +
-  theme(plot.title = element_text(hjust = 0.5, face="bold"))
- -

- -
veg_map
- -

- -

APPENDIX

- -

Raster Files

- -

Satellite imagery often comes in multiple bands. Each band captures a different part of the electromagnetic spectrum. Different combinations of bands are used to create spectral indices which are used to highlight different land cover elements, such as vegetation, built-up area, or burnt areas.

- -
-

- -

- -

\[NDVI = \frac{NIR-Red}{NIR+Red}\]

- -

Some common satellites include

- -
    -
  • Landsat: Captures images across the earth every 16 days at a 30 meter resolution across multiple spectral bands. Landsat began collecting images in 1972 and continues to the present.

  • -
  • Sentinel: Captures images across the earth every 5 days (at the equator) at a 10 meter resolution across multiple spectral bands. Began in 2014 and continues to the present.

  • -
  • MODIS: Captures daily to bi-weekly imagery ranging from a 250m to 500m resolution.

  • -
  • VIIRS: Monthly nighttime lights at a 750m resolution from April 2012 to present.

  • -
- -

Load and Plot NDVI Around Rwanda

- -
ndvi_2012_a <- raster(file.path(finalData, "Tiffs", "ndvi_2012_0203.tif"))
-plot(ndvi_2012_a) 
-plot(ag_fields,add=T)
- -

- -

Crop Image to Agricultural Fields

- -
ndvi_2012_a <- crop(ndvi_2012_a,  ag_fields)
-plot(ndvi_2012_a)
-plot(ag_fields, add=T)
- -

- -

Mask Image to Agricultural Fields

- -
ndvi_2012_a <- mask(ndvi_2012_a,  ag_fields)
-plot(ndvi_2012_a)
- -

- -

Plot Raster Using ggplot

- -

You can also plot rasters using ggplot. Here, we need to convert the raster into a dataframe with latitude, longitude and the values. Then, we use geom_tile.

- -
ndvi_2012_a_df <- as(ndvi_2012_a, "SpatialPixelsDataFrame")
-ndvi_2012_a_df <- as.data.frame(ndvi_2012_a_df)
-
-ggplot() +
-  geom_tile(data=ndvi_2012_a_df, aes(x=x, y=y, fill=ndvi_2012_0203),alpha=1) +
-  scale_fill_gradient(low="red",high="green") +
-  coord_quickmap() + 
-  theme_void() +
-  labs(fill="NDVI", title="NDVI")
- -

- -

- -

Plot Raster Using rasterVis

- - - -

We can also plot the raster using other packages that don't require first converting the object to a dataframe. Here, we use rasterVis.

- -
library(rasterVis)
-
-cols <- colorRampPalette(brewer.pal(9,"RdYlGn"))
-
-levelplot(ndvi_2012_a,
-          col.regions = cols)
- -

- -

- -

Make better map

- -
levelplot(ndvi_2012_a,
-          col.regions = cols,
-          margin=F, # No histograms on side
-          par.settings=
-             list(axis.line=list(col='transparent')), # No box around image
-          scales=list(draw=FALSE), # No latitude and longitude markers
-          xlab="",
-          ylab="",
-          main="NDVI 2012 (Season A)",
-          colorkey=list(space="right"), # legend location. can change to bottom,
-                                        # left and top. If you don't want a 
-                                        # legend, use: colorkey = F
-          maxpixels=1e6 # sometimes blends pixels together
-          ) +
-  layer(sp.polygons(ag_fields, 
-                    col="black", # border color
-                    lwd=1, # line width
-                    alpha=1))
- -

- -

- -

Raster Arithmetic

- -

Note that NDVI goes from -1 to 1; MODIS scales this value by 10000 so we'll scale this back to -1 to 1 by multiplying values by 0.0001. There are two ways to do arithmetic on rasters:

- -
raster_new <- raster * 0.0001
- -

OR

- -
raster_new <- calc(raster, fun = function(x) x * 0.0001)
- -

The second way looks ugly, but it is more efficient.

- -

Plot Multiple Images

- -
library(rasterVis)
-library(RColorBrewer)
-
-ndvi_2012_a <- raster(file.path(finalData, "Tiffs", "ndvi_2012_0203.tif")) %>%
-  crop(ag_fields) %>%
-  mask(ag_fields) %>%
-  calc(fun=function(x) x*0.0001)
-ndvi_2012_b <- raster(file.path(finalData, "Tiffs", "ndvi_2012_0910.tif")) %>%
-  crop(ag_fields) %>%
-  mask(ag_fields) %>%
-  calc(fun=function(x) x*0.0001)
-ndvi_2016_a <- raster(file.path(finalData, "Tiffs", "ndvi_2016_0203.tif")) %>%
-  crop(ag_fields) %>%
-  mask(ag_fields) %>%
-  calc(fun=function(x) x*0.0001)
-ndvi_2016_b <- raster(file.path(finalData, "Tiffs", "ndvi_2016_0910.tif")) %>%
-  crop(ag_fields) %>%
-  mask(ag_fields) %>%
-  calc(fun=function(x) x*0.0001)
-
-ndvi_stack <- stack(ndvi_2012_a, ndvi_2012_b, 
-                    ndvi_2016_a, ndvi_2016_b)
-
-cols <- colorRampPalette(brewer.pal(9,"RdYlGn"))
- -

- -
levelplot(ndvi_stack,
-          main="NDVI Across Seasons",
-          col.regions=cols,
-          layout=c(2, 2))
- -

- -

Interactive Map with Rasters

- -
pal <- colorNumeric(c("red3", "yellow", "palegreen4"), values(ndvi_stack),
-                    na.color = "transparent")
-
-NDVI_interactive <- leaflet() %>% 
-  addProviderTiles("OpenStreetMap") %>%
-  addRasterImage(ndvi_2012_a, colors = pal, opacity = 1, group="2012 A") %>%
-  addRasterImage(ndvi_2012_b, colors = pal, opacity = 1, group="2012 B") %>%
-  addRasterImage(ndvi_2016_a, colors = pal, opacity = 1, group="2016 A") %>%
-  addRasterImage(ndvi_2016_b, colors = pal, opacity = 1, group="2016 B") %>%
-  addLegend(pal = pal, values = values(ndvi_stack),
-    title = "NDVI") %>%
-  addLayersControl(
-    overlayGroups = c("2012 A","2012 B","2016 A","2016 B"),
-    options = layersControlOptions(collapsed = FALSE))
- -

- -
NDVI_interactive
- -
- - -

Summarize NDVI By Plot

- -

Let's say we want to determine average NDVI values for each plot or the NDVI value under each household location. To do this, we use extract from the raster package.

- -
# Average NDVI Value per Plot
-ag_fields$NDVI_2012_a <- extract(ndvi_2012_a, ag_fields, fun=mean)
-
-# NDVI Value per HH Location
-hh_data_sdf$NDVI_2012_a <- extract(ndvi_2012_a, hh_data_sdf)
- -

Sometimes this process can be slow. To (dramatically) increase the speed of this task, use the velox package. This is built to work with spatial polygons (not points).

- -
    -
  • velox(raster) converts the raster object to a velox raster object. From the velox object you can call the extract method, where the inputs are the shapefile and a summary function.
  • -
- -
library(velox)
-ag_fields$ndvi_2012_a_velox <- velox(ndvi_2012_a)$extract(ag_fields, fun=mean)
- -

- -
subset(ag_fields@data, select=c(site, NDVI_2012_a, ndvi_2012_a_velox))
- -
##          site NDVI_2012_a ndvi_2012_a_velox
-## 1   Kayonza15   0.4951754         0.4951754
-## 2    Kayonza4   0.5363880         0.5363880
-## 3  Rwamagana2   0.5958732         0.5958732
-## 4 Rwamagana33   0.5472275         0.5472275
-## 5 Rwamagana34   0.5667049         0.5667049
-## 6 Rwamagana35   0.5793644         0.5793644
- -

Useful Projections

- -

Geographic Coordinate System

- -

The most common geographic coordinate system is the World Geodetic System (WGS84).

- -
"+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
- -

Projected Coordinate Systems

- -

Below are projections for equal area and equal distance. Replace [LATITUDE VALUE] and [LONGITUDE VALUE] with the center of the area that you're working with.

- -

Equal Area: Lambert Azimuthal Equal Area

- -
"+proj=laea +lat_0=[LATITUDE VALUE] +lon_0=[LONGITUDE VALUE]"
- -

Equal Distance: Azimuthal equidistant projection

- -
"+proj=aeqd +lat_0=[LATITUDE VALUE] +lon_0=[LONGITUDE VALUE]"
- -

Calculating Distances: Way 1

- -

A common task in GIS is to calculate distances. Here, let's calculate the distance of each household location to the nearest city (using a dataset of Rwanda's 3 largest cities). This same syntax can be used to calculate the shortest distance to polygons (e.g., a lake) or polylines (e.g., roads).

- -

Import city-level dataset

- -
rwa_cities <- read.csv(file.path(finalData, "rwa_cities.csv"))
-coordinates(rwa_cities) <- ~lon+lat
-crs(rwa_cities) <- CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0")
- -

Re-project to an equidistant projection

- -
# Here, I used Kigali as the center
-eq_dist_projection <- "+proj=aeqd +lat_0=-1.943889 +lon_0=30.059444"
-
-rwa_cities_ed <- spTransform(rwa_cities, CRS(eq_dist_projection))
-hh_data_sdf_ed <- spTransform(hh_data_sdf, CRS(eq_dist_projection))
- -

- -

Creates distance matrix

- -
gDistance(rwa_cities_ed, hh_data_sdf_ed, byid=T) %>% head
- -
##          1         2        3
-## 1 39419.69 101785.29 74845.93
-## 2 41335.03 100056.47 75709.40
-## 3 41433.12 101555.78 76291.83
-## 4 40642.18 101794.90 75775.02
-## 5 40443.28 103553.65 76166.05
-## 6 39694.79  98492.55 73943.48
- -

Calculate shortest distance to any city.

- -
gDistance_cities_i <- function(i) gDistance(hh_data_sdf_ed[i,], rwa_cities_ed, byid=F)
-hh_data_sdf_ed$dist_city <- lapply(1:nrow(hh_data_sdf_ed), gDistance_cities_i) %>% unlist
- -

Calculating Distances: Way 2

- -

There are also ways to calculate distances using a geographic coordinate system (i.e., treating the earth as a sphere) and taking into account the curvature of the earth. Haversine (good for short distances – think the distance of a short-haul flight) and Vincenty (slower but better for longer distances) are common formulas for calculating these distances.

- -

The code below uses the haversine method to calculate the distance between all the household locations and the first Rwandan city. The output is in meters.

- -
library(geosphere)
-distHaversine(hh_data_sdf, rwa_cities[1,]) 
- -

We can also calculate the distance between a point and a line using the dist2Line function from the geosphere package. Below is an example that computes the distance between an arbitrary point and a line shapefile using the haversine formula.

- -
dist2Line(points, line, distfun=distHaversine)
- - - - -
- - - - - - - - - diff --git a/Presentations/header.tex b/Presentations/header.tex new file mode 100644 index 0000000..513e8d4 --- /dev/null +++ b/Presentations/header.tex @@ -0,0 +1,27 @@ +\AtBeginSection[] +{ + \begin{frame} + \frametitle{Outline} + \tableofcontents[currentsection] + \end{frame} +} + +\titlegraphic{ + \begin{figure} + \begin{minipage}[b]{0.3\textwidth} + \includegraphics[width=0.7\linewidth]{img/DIME_logo.png} + \end{minipage} + \hspace{1cm} + \begin{minipage}[b]{0.3\textwidth} + \includegraphics[width=0.9\linewidth]{img/wbg.png} + \vspace{0.5cm} + \end{minipage} + \end{figure} + } + +\usepackage{float} +\usepackage{adjustbox} +\usepackage{colortbl} + +\definecolor{myColor}{rgb}{.05,.58,.71} +\usecolortheme[named=myColor]{structure} diff --git a/Presentations/img/Attribute_Table.png b/Presentations/img/Attribute_Table.png new file mode 100644 index 0000000..8c5f61f Binary files /dev/null and b/Presentations/img/Attribute_Table.png differ diff --git a/Presentations/img/L05_fig48.gif b/Presentations/img/L05_fig48.gif new file mode 100644 index 0000000..4e50021 Binary files /dev/null and b/Presentations/img/L05_fig48.gif differ diff --git a/DataWork/DataSets/Final/placeholder.txt b/Presentations/img/New/place_holder.txt similarity index 100% rename from DataWork/DataSets/Final/placeholder.txt rename to Presentations/img/New/place_holder.txt diff --git a/Presentations/img/Options1.png b/Presentations/img/Options1.png new file mode 100644 index 0000000..8a4341f Binary files /dev/null and b/Presentations/img/Options1.png differ diff --git a/Presentations/img/Options2.png b/Presentations/img/Options2.png new file mode 100644 index 0000000..d672cd5 Binary files /dev/null and b/Presentations/img/Options2.png differ diff --git a/Presentations/img/Options3.png b/Presentations/img/Options3.png new file mode 100644 index 0000000..35797da Binary files /dev/null and b/Presentations/img/Options3.png differ diff --git a/Presentations/img/README.md 
b/Presentations/img/README.md index 0b16ae5..ce3f610 100644 --- a/Presentations/img/README.md +++ b/Presentations/img/README.md @@ -1,22 +1,22 @@ -# Placeholder file - -This file has been created automatically by the command **iegitaddmd** from Stata package [**ietoolkit**](https://worldbank.github.io/ietoolkit) to make GitHub sync this folder. GitHub does not sync empty folders or folders that only contain ignored files, but in research projects it is often important to share the full standardized folder structure along with the actual files. This command is intended to be used with **iefolder**, but it can be used in any folder structure intended to be shared on GitHub. - -In recently started projects, it is typical to create data folders, script folders (do-files, r-files) and output folders, among others. The output folders are initially empty, but the script files might include a file path to them for later use. If an output folder is empty and another collaborator clones the repository, then the output folder expected by the scripts will not be included by GitHub in the cloned repository, and the script will not run properly. - -## You should replace the content of this placeholder file - -The text in this file should be replaced with text that describes the intended use of this folder. This file is written in markdown (.md) format, which is suitable for GitHub syncing (unlike .doc/.docx). If the file is named *README.md*, GitHub automatically displays its contents when someone opens the containing folder on GitHub.com using a web browser. - -If you are new to markup languages (markdown, html etc.) then this [Markdown Tutorial](https://www.markdowntutorial.com/) is a great place to start. If you have some experience with markup languages, then this [Markdown Cheat Sheet](https://guides.github.com/pdfs/markdown-cheatsheet-online.pdf) is great place to start. 
- -## Add similar files in other folders - -The Stata command **iegitaddmd** does not add anything to folders that already have content unless this is explicitly requested using the option `all`. It is best practice to create a *README.md* file in any folders that have content, for example, in folders whose purpose might not obvious to someone using the repository for the first time. Again, if the file is named *README.md*, then the content of the file will be shown in the browser when someone explores the repository on GitHub.com. - -Another great use of a *README.md* file is to use it as a documentation on how the folder it sits in and its subfolders are organized, where its content can be found, and where new content is meant to be saved. For example, if you have a folder called `/Baseline/`, then you can give a short description of the activities conducted during the baseline and where data, scripts and outputs related to it can be found. - -## Removing this file - -Removing this file is not recommended, as GitHub may stop syncing the parent folder unless the folder now has other content. However, even when content is added and the file can be removed without breaking the GitHub functionality, our recommendation is to replace the content with more relevant content rather than deleting it. - +# Placeholder file + +This file has been created automatically by the command **iegitaddmd** from Stata package [**ietoolkit**](https://worldbank.github.io/ietoolkit) to make GitHub sync this folder. GitHub does not sync empty folders or folders that only contain ignored files, but in research projects it is often important to share the full standardized folder structure along with the actual files. This command is intended to be used with **iefolder**, but it can be used in any folder structure intended to be shared on GitHub. + +In recently started projects, it is typical to create data folders, script folders (do-files, r-files) and output folders, among others. 
The output folders are initially empty, but the script files might include a file path to them for later use. If an output folder is empty and another collaborator clones the repository, then the output folder expected by the scripts will not be included by GitHub in the cloned repository, and the script will not run properly. + +## You should replace the content of this placeholder file + +The text in this file should be replaced with text that describes the intended use of this folder. This file is written in markdown (.md) format, which is suitable for GitHub syncing (unlike .doc/.docx). If the file is named *README.md*, GitHub automatically displays its contents when someone opens the containing folder on GitHub.com using a web browser. + +If you are new to markup languages (markdown, html etc.) then this [Markdown Tutorial](https://www.markdowntutorial.com/) is a great place to start. If you have some experience with markup languages, then this [Markdown Cheat Sheet](https://guides.github.com/pdfs/markdown-cheatsheet-online.pdf) is great place to start. + +## Add similar files in other folders + +The Stata command **iegitaddmd** does not add anything to folders that already have content unless this is explicitly requested using the option `all`. It is best practice to create a *README.md* file in any folders that have content, for example, in folders whose purpose might not obvious to someone using the repository for the first time. Again, if the file is named *README.md*, then the content of the file will be shown in the browser when someone explores the repository on GitHub.com. + +Another great use of a *README.md* file is to use it as a documentation on how the folder it sits in and its subfolders are organized, where its content can be found, and where new content is meant to be saved. 
For example, if you have a folder called `/Baseline/`, then you can give a short description of the activities conducted during the baseline and where data, scripts and outputs related to it can be found. + +## Removing this file + +Removing this file is not recommended, as GitHub may stop syncing the parent folder unless the folder now has other content. However, even when content is added and the file can be removed without breaking the GitHub functionality, our recommendation is to replace the content with more relevant content rather than deleting it. + diff --git a/Presentations/img/View.png b/Presentations/img/View.png new file mode 100644 index 0000000..c1df7bd Binary files /dev/null and b/Presentations/img/View.png differ diff --git a/Presentations/img/Warning.PNG b/Presentations/img/Warning.PNG new file mode 100644 index 0000000..3f2a856 Binary files /dev/null and b/Presentations/img/Warning.PNG differ diff --git a/Presentations/img/enviroment_2vars.png b/Presentations/img/enviroment_2vars.png new file mode 100644 index 0000000..8ea24ca Binary files /dev/null and b/Presentations/img/enviroment_2vars.png differ diff --git a/Presentations/img/enviroment_factors.png b/Presentations/img/enviroment_factors.png new file mode 100644 index 0000000..de71c29 Binary files /dev/null and b/Presentations/img/enviroment_factors.png differ diff --git a/Presentations/img/geoscripting-system-overview.svg b/Presentations/img/geoscripting-system-overview.svg index 947aa54..3b776e1 100644 --- a/Presentations/img/geoscripting-system-overview.svg +++ b/Presentations/img/geoscripting-system-overview.svg @@ -1,21 +1,21 @@ - -Packages -System Overview -Integrated Development -Environment (IDE) -RStudio, -RKWard, -R GUI, ... -Engine -Bindings -Libraries -Remote -repository -Local data -rgeos -rgdal -sp, raster, maptools, ... -GEOS -OGR -proj - + +Packages +System Overview +Integrated Development +Environment (IDE) +RStudio, +RKWard, +R GUI, ... 
+Engine +Bindings +Libraries +Remote +repository +Local data +rgeos +rgdal +sp, raster, maptools, ... +GEOS +OGR +proj + diff --git a/Presentations/img/ggplotly1.png b/Presentations/img/ggplotly1.png new file mode 100644 index 0000000..8d14213 Binary files /dev/null and b/Presentations/img/ggplotly1.png differ diff --git a/Presentations/img/ggplotly2.png b/Presentations/img/ggplotly2.png new file mode 100644 index 0000000..c738ada Binary files /dev/null and b/Presentations/img/ggplotly2.png differ diff --git a/Presentations/img/happy_table_xlsx.PNG b/Presentations/img/happy_table_xlsx.PNG new file mode 100644 index 0000000..5ab77b2 Binary files /dev/null and b/Presentations/img/happy_table_xlsx.PNG differ diff --git a/Presentations/img/panel.PNG b/Presentations/img/panel.PNG index eb25e0a..c319443 100644 Binary files a/Presentations/img/panel.PNG and b/Presentations/img/panel.PNG differ diff --git a/Presentations/img/raster_r_example.png b/Presentations/img/raster_r_example.png new file mode 100644 index 0000000..8b6cd54 Binary files /dev/null and b/Presentations/img/raster_r_example.png differ diff --git a/Presentations/img/rm.PNG b/Presentations/img/rm.PNG new file mode 100644 index 0000000..2457cd5 Binary files /dev/null and b/Presentations/img/rm.PNG differ diff --git a/Presentations/img/scritpt4.png b/Presentations/img/scritpt4.png new file mode 100644 index 0000000..d928159 Binary files /dev/null and b/Presentations/img/scritpt4.png differ diff --git a/Presentations/img/worldmap_plot.png b/Presentations/img/worldmap_plot.png new file mode 100644 index 0000000..f4af4d4 Binary files /dev/null and b/Presentations/img/worldmap_plot.png differ diff --git a/README.md b/README.md index b122b63..25fb69b 100644 --- a/README.md +++ b/README.md @@ -3,22 +3,24 @@ ## Background This material was developed by the [DIME Analytics](https://worldbank.github.io/dimeanalytics/) team as an introduction to R Statistical Package for its staff. 
It builds upon knowledge of Stata to explore features of R with impact evaluation applications in mind. It also assumes some degree of familiarity with DIME's [coding practices](https://dimewiki.worldbank.org/wiki/Stata_Coding_Practices). -## Version -The master branch version was adopted in DIME's R Couse in November/December 2018. +## Current version +The development branch version is being adopted in DIME's R training in November and December 2018. As the training is ongoing, the content is being updated in real time as each presentation has a final version. -## Content -It currently includes the following content: -1. Intro to R: introduction to RStudio, R syntax, objects and classes. -1. Descriptive Analysis: how to create and export descriptive statistics and regression tables in R. -1. Data Visualization: an introduction to creating and export graphs in ggplot2. -1. Spatial Data: an overview of R resources on spatial data and maps creation using ggmap. -1. Data Processing: basic functions for processing data (subsetting, merging, appending). +Other presentations were adopted in DIME's [Field Coordinator Training](http://www.worldbank.org/en/events/2018/04/09/manage-successful-impact-evaluations) in June 2018. + +## FC training content +1. Intro to R: introduction to RStudio, R syntax, objects and classes. Designed for a 3h session. +1. Coding Best Practices in R: uses an R master script to explore code organization, loops, custom functions and if statements. Designed for a 1h30 session. +1. Data Processing: basic functions for processing data. Designed for a 2h session. +1. Descriptive Statistics: how to create and export descriptive statistics tables in R. Designed for a 1h30 session. +1. Data Visualization: an introduction to creating and exporting graphs in ggplot2. Designed for a 1h30 session. +1. Geospatial Data: an overview of R resources on GIS. Designed for a 3h session. ## License This material is developed under MIT license. 
See http://adampritchard.mit-license.org/ or see [the `LICENSE` file](https://github.com/worldbank/ietoolkit/blob/master/LICENSE) for details. ## Main Contact -Luiza Cardoso de Andrade (lcardosodeandrad@worldbank.org) +Luiza Cardoso de Andrade (lcardoso@worldbank.org) ## Authors Luiza Cardoso de Andrade, Robert A. Marty, Leonardo Teixeira Viotti