add ID data and scripts for 2013, 2014, 2018, issue #23

public-salaries · Jan 18, 2018 · f023c42 · f023c42
1 parent 413ecd0
commit f023c42
Show file tree

Hide file tree

Showing 11 changed files with 468 additions and 67 deletions.
diff --git a/README.md b/README.md
@@ -28,7 +28,7 @@ In all, our aim is to investigate the extent to which compensation for different
 |              [Florida](fl/)              |    1995--2014    |         1997--2011         | 1997--2014                 |         1997--2014         |             2014             |
 |              [Georgia](ga/)              |    2010--2016    |                            |                            |                            |                              |
 |              [Hawaii](hi/)               |       2016       |                            |                            |                            |                              |
-|               [Idaho](id/)               | 2008, 2013, 2017 |                            |                            |                            |                              |
+|               [Idaho](id/)               | 2008, 2013--2014, 2017--2018 |                            |                            |                            |                              |
 |             [Illinois](il/)              |                  |                            |                            |         2009--2012         |                              |
 | [Indiana](https://github.com/public-salaries/in_salaries) |                  |                            |                            |                            |                              |
 |               [Iowa](ia/)                |    2006--2016    |                            |                            |                            |                              |

diff --git a/id/2013/scripts/id_read_pdf_state_2013.R b/id/2013/scripts/id_read_pdf_state_2013.R
@@ -1,5 +1,4 @@
 
-library(pdftools)
 library(tibble)
 library(magrittr)
 
@@ -10,13 +9,16 @@ cwd <- getwd()
 
 # Define data directory file path, and path to the PDF file.
 data_dir <- file.path(cwd, "id", "2013")
-pdf_file <- file.path(data_dir, "state.pdf")
+pdf_file <- file.path(data_dir, "state_2013.pdf")
 
 # Read in functions that will be used throughout this script.
 source(file.path(cwd, "id", "id_functions.R"))
 
-# Read in the pdf doc.
-txt <- pdftools::pdf_text(pdf_file)
+# Read in the pdf doc. Using a hacky function for this, as pdftools::pdf_text
+# is not working for any of the ID pdf docs. This function uses base::system2
+# as a way to call "pdftotext" via the cmd prompt on the input pdf doc. It 
+# has been tested on a PC, but it has NOT been tested on a Mac or Linux.
+txt <- manual_read_pdf(pdf_file)
 
 # Establish col headers.
 cols <- c(
@@ -31,13 +33,14 @@ cols <- c(
 )
 
 # For each page of the doc, extract each relevant observation as a char vector.
-observations <- vector(mode = "character", length = length(txt) * 30)
+observations <- vector(mode = "character", length = length(txt) * 35)
 counter <- 1
 for (page in txt) {
   obs <- page %>% 
-    strsplit("\n", fixed = TRUE) %>% 
+    strsplit("\r\n\r\n", fixed = TRUE) %>% 
     unlist(FALSE, FALSE) %>% 
-    .[!grepl("^\\s{8,}", .)]
+    .[!grepl("^\\s+classified$|^\\s{10,}|name\\s+job title\\s+agency\\s+", ., 
+             ignore.case = TRUE)]
   for (i in obs) {
     observations[counter] <- i
     counter <- counter + 1
@@ -47,21 +50,38 @@ observations <- observations[observations != ""]
 
 # For each observation within the pdf, extract relevant data and save output 
 # as a single-row data frame. Each of these df's will be compiled into a list, 
-# which will then be rbind together using do.call().
+# which will then be combined row-wise by passing rbind to do.call.
 obs_list <- lapply(observations, function(j) {
   # Split obs up into a vector of elements (as strings).
   obs <- j %>% 
     strsplit("\\s{2,}") %>% 
     unlist(., FALSE, FALSE)
 
-  # If obs is length 2 or less, skip it and move on to the next iteration.
-  if (length(obs) <= 2) {
-    return(data.frame())
+  # If length of obs is greater than 8, redo the strsplit step with a delim of 
+  # "3 or more" spaces.
+  if (length(obs) > 8) {
+    obs <- j %>% 
+      strsplit("\\s{3,}") %>% 
+      unlist(., FALSE, FALSE) %>% 
+      gsub("\\s{2,}", " ", .)
+  }
+
+  # Check to see if the name string was split up by mistake.
+  if (grepl(".*,$", obs[1]) || grepl(".*,.*\\.$|[A-Z]\\.|\\.", obs[2])) {
+    obs[1] <- paste(obs[1], obs[2])
+    obs <- obs[-2]
+  }
+
+  # If obs is length 3 or less, return the values as they are and move on.
+  if (length(obs) <= 3) {
+    return(get_single_obs_df(obs, col_names = cols))
   }
 
   # If obs is vector of length 8, this means the elements of obs were cleanly 
   # delimited, and can safely return the relevant elements.
-  if (length(obs) == 8) {
+  if (length(obs) == 8 && 
+      grepl("^NON-$|^CLASSIFIED$", obs[4], ignore.case = TRUE)) {
+    obs <- gsub("^NON-$", "non-classified", obs, ignore.case = TRUE)
     return(get_single_obs_df(obs))
   }
 
@@ -80,6 +100,15 @@ obs_list <- lapply(observations, function(j) {
   obs <- job_title$obs
   job_title <- job_title$job_title
 
+  # If length of obs is 7, this means the value for variable agency or job 
+  # title is split into two values. Combine all of the values in obs[2:4] into 
+  # a single str (the agency and job_title should get properly separated 
+  # during the "get_agency" step, which happens later).
+  if (length(obs) == 7) {
+    obs[2] <- paste(obs[2:4], collapse = " ")
+    obs <- obs[-c(3:4)]
+  }
+
   # Return obs and other values as a data frame.
   return(get_single_obs_df(obs, type_appt, comp, job_title))
 })
@@ -92,20 +121,20 @@ obs_df <- obs_list %>%
   tibble::as_data_frame() %>% 
   .[apply(., 1, function(x) !all(is.na(x))), ]
 
-# Establish an agency data dictionary.
-agency_dd <- obs_df$agency %>% 
-  unique %>% 
-  gsub("classified$|non-$", "", ., ignore.case = TRUE) %>% 
-  trimws %>% 
-  unique %>% 
-  .[!is.na(.)] %>% 
-  .[order(nchar(.), decreasing = TRUE)]
-
-## Try to fill in NA's in col agency using the newly created data dict.
-for (row in which(is.na(obs_df$agency))) {
-  #obs <- obs_df[row, ]
-  # Try to extract the job title value from the other values in vect.
-  obs_df[row, ] <- get_agency(obs_df[row, ], agency_dd)
+# Fix instances in which a portion of a person's name was split off into a new
+# row (when a string is too long for a single cell, the PDF will wrap the 
+# string into a second line within the "cell", and this causes pdftotext or R 
+# or both to read that as a completely new observation/row).
+obs_df <- stitch_broken_name_strings(obs_df)
+
+# Go back and extract/fill-in NA's within variable agency.
+obs_df <- fill_missing_agency(obs_df)
+
+# For agency values that have a leading single digit, the digit is actually 
+# part of the job_title value. Remove the digit from the agency value and 
+# append it to the end of the job_title value.
+for (row in which(grepl("^\\d", obs_df$agency))) {
+  obs_df[row, ] <- digit_move(obs_df[row, ])
 }
 
 ## Split up the values in the name variable, and create three new variables:
@@ -118,6 +147,6 @@ obs_df$middle_initial <- vapply(full_names, function(x) x[[2]], character(1))
 # Write obs_df to file.
 write.csv(
   obs_df, 
-  file.path(data_dir, "state.csv"), 
+  file.path(data_dir, "state_2013.csv"), 
   row.names = FALSE
 )
diff --git a/id/2013/state.csv.7z b/id/2013/state.csv.7z
diff --git a/id/2013/state_2013.7z b/id/2013/state_2013.7z
diff --git a/id/2013/state.pdf → id/2013/state_2013.pdf b/id/2013/state.pdf → id/2013/state_2013.pdf
diff --git a/id/2014/scripts/id_read_pdf_state_2014.R b/id/2014/scripts/id_read_pdf_state_2014.R
@@ -1,5 +1,4 @@
 
-library(pdftools)
 library(tibble)
 library(magrittr)
 
@@ -15,8 +14,11 @@ pdf_file <- file.path(data_dir, "state_2014.pdf")
 # Read in functions that will be used throughout this script.
 source(file.path(cwd, "id", "id_functions.R"))
 
-# Read in the pdf doc.
-txt <- pdftools::pdf_text(pdf_file)
+# Read in the pdf doc. Using a hacky function for this, as pdftools::pdf_text
+# is not working for any of the ID pdf docs. This function uses base::system2
+# as a way to call "pdftotext" via the cmd prompt on the input pdf doc. It 
+# has been tested on a PC, but it has NOT been tested on a Mac or Linux.
+txt <- manual_read_pdf(pdf_file)
 
 # Establish col headers.
 cols <- c(
@@ -31,13 +33,14 @@ cols <- c(
 )
 
 # For each page of the doc, extract each relevant observation as a char vector.
-observations <- vector(mode = "character", length = length(txt) * 30)
+observations <- vector(mode = "character", length = length(txt) * 35)
 counter <- 1
 for (page in txt) {
   obs <- page %>% 
-    strsplit("\n", fixed = TRUE) %>% 
+    strsplit("\r\n\r\n", fixed = TRUE) %>% 
     unlist(FALSE, FALSE) %>% 
-    .[!grepl("^\\s{8,}", .)]
+    .[!grepl("^\\s+classified$|^\\s{10,}|name\\s+job title\\s+agency\\s+", ., 
+             ignore.case = TRUE)]
   for (i in obs) {
     observations[counter] <- i
     counter <- counter + 1
@@ -47,21 +50,38 @@ observations <- observations[observations != ""]
 
 # For each observation within the pdf, extract relevant data and save output 
 # as a single-row data frame. Each of these df's will be compiled into a list, 
-# which will then be rbind together using do.call().
+# which will then be combined row-wise by passing rbind to do.call.
 obs_list <- lapply(observations, function(j) {
   # Split obs up into a vector of elements (as strings).
   obs <- j %>% 
     strsplit("\\s{2,}") %>% 
     unlist(., FALSE, FALSE)
 
-  # If obs is length 2 or less, skip it and move on to the next iteration.
-  if (length(obs) <= 2) {
-    return(data.frame())
+  # If length of obs is greater than 8, redo the strsplit step with a delim of 
+  # "3 or more" spaces.
+  if (length(obs) > 8) {
+    obs <- j %>% 
+      strsplit("\\s{3,}") %>% 
+      unlist(., FALSE, FALSE) %>% 
+      gsub("\\s{2,}", " ", .)
+  }
+
+  # Check to see if the name string was split up by mistake.
+  if (grepl(".*,$", obs[1]) || grepl(".*,.*\\.$|[A-Z]\\.|\\.", obs[2])) {
+    obs[1] <- paste(obs[1], obs[2])
+    obs <- obs[-2]
+  }
+
+  # If obs is length 3 or less, return the values as they are and move on.
+  if (length(obs) <= 3) {
+    return(get_single_obs_df(obs, col_names = cols))
   }
 
   # If obs is vector of length 8, this means the elements of obs were cleanly 
   # delimited, and can safely return the relevant elements.
-  if (length(obs) == 8) {
+  if (length(obs) == 8 && 
+      grepl("^NON-$|^CLASSIFIED$", obs[4], ignore.case = TRUE)) {
+    obs <- gsub("^NON-$", "non-classified", obs, ignore.case = TRUE)
     return(get_single_obs_df(obs))
   }
 
@@ -80,6 +100,15 @@ obs_list <- lapply(observations, function(j) {
   obs <- job_title$obs
   job_title <- job_title$job_title
 
+  # If length of obs is 7, this means the value for variable agency or job 
+  # title is split into two values. Combine all of the values in obs[2:4] into 
+  # a single str (the agency and job_title should get properly separated 
+  # during the "get_agency" step, which happens later).
+  if (length(obs) == 7) {
+    obs[2] <- paste(obs[2:4], collapse = " ")
+    obs <- obs[-c(3:4)]
+  }
+
   # Return obs and other values as a data frame.
   return(get_single_obs_df(obs, type_appt, comp, job_title))
 })
@@ -92,20 +121,21 @@ obs_df <- obs_list %>%
   tibble::as_data_frame() %>% 
   .[apply(., 1, function(x) !all(is.na(x))), ]
 
-# Establish an agency data dictionary.
-agency_dd <- obs_df$agency %>% 
-  unique %>% 
-  gsub("classified$|non-$", "", ., ignore.case = TRUE) %>% 
-  trimws %>% 
-  unique %>% 
-  .[!is.na(.)] %>% 
-  .[order(nchar(.), decreasing = TRUE)]
-
-## Try to fill in NA's in col agency using the newly created data dict.
-for (row in which(is.na(obs_df$agency))) {
-  #obs <- obs_df[row, ]
-  # Try to extract the job title value from the other values in vect.
-  obs_df[row, ] <- get_agency(obs_df[row, ], agency_dd)
+
+# Fix instances in which a portion of a person's name was split off into a new
+# row (when a string is too long for a single cell, the PDF will wrap the 
+# string into a second line within the "cell", and this causes pdftotext or R 
+# or both to read that as a completely new observation/row).
+obs_df <- stitch_broken_name_strings(obs_df)
+
+# Go back and extract/fill-in NA's within variable agency.
+obs_df <- fill_missing_agency(obs_df)
+
+# For agency values that have a leading single digit, the digit is actually 
+# part of the job_title value. Remove the digit from the agency value and 
+# append it to the end of the job_title value.
+for (row in which(grepl("^\\d", obs_df$agency))) {
+  obs_df[row, ] <- digit_move(obs_df[row, ])
 }
 
 ## Split up the values in the name variable, and create three new variables:

diff --git a/id/2014/state_2014.7z b/id/2014/state_2014.7z