Skip to content

Commit

Permalink
add ID data and scripts for 2013, 2014, 2018, issue #23
Browse files Browse the repository at this point in the history
  • Loading branch information
ChrisMuir committed Jan 18, 2018
1 parent 413ecd0 commit f023c42
Show file tree
Hide file tree
Showing 11 changed files with 468 additions and 67 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ In all, our aim is to investigate the extent to which compensation for different
| [Florida](fl/) | 1995--2014 | 1997--2011 | 1997--2014 | 1997--2014 | 2014 |
| [Georgia](ga/) | 2010--2016 | | | | |
| [Hawaii](hi/) | 2016 | | | | |
| [Idaho](id/) | 2008, 2013, 2017 | | | | |
| [Idaho](id/) | 2008, 2013--2014, 2017--2018 | | | | |
| [Illinois](il/) | | | | 2009--2012 | |
| [Indiana](https://github.com/public-salaries/in_salaries) | | | | | |
| [Iowa](ia/) | 2006--2016 | | | | |
Expand Down
83 changes: 56 additions & 27 deletions id/2013/scripts/id_read_pdf_state_2013.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@

library(pdftools)
library(tibble)
library(magrittr)

Expand All @@ -10,13 +9,16 @@ cwd <- getwd()

# Define data directory file path, and path to the PDF file.
data_dir <- file.path(cwd, "id", "2013")
pdf_file <- file.path(data_dir, "state.pdf")
pdf_file <- file.path(data_dir, "state_2013.pdf")

# Read in functions that will be used throughout this script.
source(file.path(cwd, "id", "id_functions.R"))

# Read in the pdf doc.
txt <- pdftools::pdf_text(pdf_file)
# Read in the pdf doc. Using a hacky function for this, as pdftools::pdf_text
# is not working for any of the ID pdf docs. This function uses base::system2
# as a way to call "pdftotext" via the cmd prompt on the input pdf doc. It
# has been tested on a PC; it has NOT been tested on a Mac or Linux.
txt <- manual_read_pdf(pdf_file)

# Establish col headers.
cols <- c(
Expand All @@ -31,13 +33,14 @@ cols <- c(
)

# For each page of the doc, extract each relevant observation as a char vector.
observations <- vector(mode = "character", length = length(txt) * 30)
observations <- vector(mode = "character", length = length(txt) * 35)
counter <- 1
for (page in txt) {
obs <- page %>%
strsplit("\n", fixed = TRUE) %>%
strsplit("\r\n\r\n", fixed = TRUE) %>%
unlist(FALSE, FALSE) %>%
.[!grepl("^\\s{8,}", .)]
.[!grepl("^\\s+classified$|^\\s{10,}|name\\s+job title\\s+agency\\s+", .,
ignore.case = TRUE)]
for (i in obs) {
observations[counter] <- i
counter <- counter + 1
Expand All @@ -47,21 +50,38 @@ observations <- observations[observations != ""]

# For each observation within the pdf, extract relevant data and save output
# as a single-row data frame. Each of these df's will be compiled into a list,
# which will then be rbind together using do.call().
# which will then be bound together row-wise using do.call.
obs_list <- lapply(observations, function(j) {
# Split obs up into a vector of elements (as strings).
obs <- j %>%
strsplit("\\s{2,}") %>%
unlist(., FALSE, FALSE)

# If obs is length 2 or less, skip it and move on to the next iteration.
if (length(obs) <= 2) {
return(data.frame())
# If length of obs is greater than 8, redo the strsplit step with a delim of
# "3 or more" spaces.
if (length(obs) > 8) {
obs <- j %>%
strsplit("\\s{3,}") %>%
unlist(., FALSE, FALSE) %>%
gsub("\\s{2,}", " ", .)
}

# Check to see if the name string was split up by mistake.
if (grepl(".*,$", obs[1]) || grepl(".*,.*\\.$|[A-Z]\\.|\\.", obs[2])) {
obs[1] <- paste(obs[1], obs[2])
obs <- obs[-2]
}

# If obs is length 3 or less, return the values as they are and move on.
if (length(obs) <= 3) {
return(get_single_obs_df(obs, col_names = cols))
}

# If obs is a vector of length 8, this means the elements of obs were cleanly
# delimited, and the relevant elements can safely be returned.
if (length(obs) == 8) {
if (length(obs) == 8 &&
grepl("^NON-$|^CLASSIFIED$", obs[4], ignore.case = TRUE)) {
obs <- gsub("^NON-$", "non-classified", obs, ignore.case = TRUE)
return(get_single_obs_df(obs))
}

Expand All @@ -80,6 +100,15 @@ obs_list <- lapply(observations, function(j) {
obs <- job_title$obs
job_title <- job_title$job_title

# If length of obs is 7, this means the value for variable agency or job
# title is split into two values. Combine all of the values in obs[2:4] into
# a single str (the agency and job_title should get properly separated
# during the "get_agency" step, which happens later).
if (length(obs) == 7) {
obs[2] <- paste(obs[2:4], collapse = " ")
obs <- obs[-c(3:4)]
}

# Return obs and other values as a data frame.
return(get_single_obs_df(obs, type_appt, comp, job_title))
})
Expand All @@ -92,20 +121,20 @@ obs_df <- obs_list %>%
tibble::as_data_frame() %>%
.[apply(., 1, function(x) !all(is.na(x))), ]

# Establish an agency data dictionary.
agency_dd <- obs_df$agency %>%
unique %>%
gsub("classified$|non-$", "", ., ignore.case = TRUE) %>%
trimws %>%
unique %>%
.[!is.na(.)] %>%
.[order(nchar(.), decreasing = TRUE)]

## Try to fill in NA's in col agency using the newly created data dict.
for (row in which(is.na(obs_df$agency))) {
#obs <- obs_df[row, ]
# Try to extract the job title value from the other values in vect.
obs_df[row, ] <- get_agency(obs_df[row, ], agency_dd)
# Fix instances in which a portion of a person's name was split off into a new
# row (when a string is too long for a single cell, the PDF will wrap the
# string into a second line within the "cell", and this causes pdftotext or R
# or both to read that as a completely new observation/row).
obs_df <- stitch_broken_name_strings(obs_df)

# Go back and extract/fill-in NA's within variable agency.
obs_df <- fill_missing_agency(obs_df)

# For agency values that have a leading single digit, the digit is actually
# part of the job_title value. Remove the digit from the agency value and
# append it to the end of the job_title value.
for (row in which(grepl("^\\d", obs_df$agency))) {
obs_df[row, ] <- digit_move(obs_df[row, ])
}

## Split up the values in the name variable, and create three new variables:
Expand All @@ -118,6 +147,6 @@ obs_df$middle_initial <- vapply(full_names, function(x) x[[2]], character(1))
# Write obs_df to file.
write.csv(
obs_df,
file.path(data_dir, "state.csv"),
file.path(data_dir, "state_2013.csv"),
row.names = FALSE
)
Binary file removed id/2013/state.csv.7z
Binary file not shown.
Binary file added id/2013/state_2013.7z
Binary file not shown.
File renamed without changes.
80 changes: 55 additions & 25 deletions id/2014/scripts/id_read_pdf_state_2014.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@

library(pdftools)
library(tibble)
library(magrittr)

Expand All @@ -15,8 +14,11 @@ pdf_file <- file.path(data_dir, "state_2014.pdf")
# Read in functions that will be used throughout this script.
source(file.path(cwd, "id", "id_functions.R"))

# Read in the pdf doc.
txt <- pdftools::pdf_text(pdf_file)
# Read in the pdf doc. Using a hacky function for this, as pdftools::pdf_text
# is not working for any of the ID pdf docs. This function uses base::system2
# as a way to call "pdftotext" via the cmd prompt on the input pdf doc. It
# has been tested on a PC; it has NOT been tested on a Mac or Linux.
txt <- manual_read_pdf(pdf_file)

# Establish col headers.
cols <- c(
Expand All @@ -31,13 +33,14 @@ cols <- c(
)

# For each page of the doc, extract each relevant observation as a char vector.
observations <- vector(mode = "character", length = length(txt) * 30)
observations <- vector(mode = "character", length = length(txt) * 35)
counter <- 1
for (page in txt) {
obs <- page %>%
strsplit("\n", fixed = TRUE) %>%
strsplit("\r\n\r\n", fixed = TRUE) %>%
unlist(FALSE, FALSE) %>%
.[!grepl("^\\s{8,}", .)]
.[!grepl("^\\s+classified$|^\\s{10,}|name\\s+job title\\s+agency\\s+", .,
ignore.case = TRUE)]
for (i in obs) {
observations[counter] <- i
counter <- counter + 1
Expand All @@ -47,21 +50,38 @@ observations <- observations[observations != ""]

# For each observation within the pdf, extract relevant data and save output
# as a single-row data frame. Each of these df's will be compiled into a list,
# which will then be rbind together using do.call().
# which will then be bound together row-wise using do.call.
obs_list <- lapply(observations, function(j) {
# Split obs up into a vector of elements (as strings).
obs <- j %>%
strsplit("\\s{2,}") %>%
unlist(., FALSE, FALSE)

# If obs is length 2 or less, skip it and move on to the next iteration.
if (length(obs) <= 2) {
return(data.frame())
# If length of obs is greater than 8, redo the strsplit step with a delim of
# "3 or more" spaces.
if (length(obs) > 8) {
obs <- j %>%
strsplit("\\s{3,}") %>%
unlist(., FALSE, FALSE) %>%
gsub("\\s{2,}", " ", .)
}

# Check to see if the name string was split up by mistake.
if (grepl(".*,$", obs[1]) || grepl(".*,.*\\.$|[A-Z]\\.|\\.", obs[2])) {
obs[1] <- paste(obs[1], obs[2])
obs <- obs[-2]
}

# If obs is length 3 or less, return the values as they are and move on.
if (length(obs) <= 3) {
return(get_single_obs_df(obs, col_names = cols))
}

# If obs is a vector of length 8, this means the elements of obs were cleanly
# delimited, and the relevant elements can safely be returned.
if (length(obs) == 8) {
if (length(obs) == 8 &&
grepl("^NON-$|^CLASSIFIED$", obs[4], ignore.case = TRUE)) {
obs <- gsub("^NON-$", "non-classified", obs, ignore.case = TRUE)
return(get_single_obs_df(obs))
}

Expand All @@ -80,6 +100,15 @@ obs_list <- lapply(observations, function(j) {
obs <- job_title$obs
job_title <- job_title$job_title

# If length of obs is 7, this means the value for variable agency or job
# title is split into two values. Combine all of the values in obs[2:4] into
# a single str (the agency and job_title should get properly separated
# during the "get_agency" step, which happens later).
if (length(obs) == 7) {
obs[2] <- paste(obs[2:4], collapse = " ")
obs <- obs[-c(3:4)]
}

# Return obs and other values as a data frame.
return(get_single_obs_df(obs, type_appt, comp, job_title))
})
Expand All @@ -92,20 +121,21 @@ obs_df <- obs_list %>%
tibble::as_data_frame() %>%
.[apply(., 1, function(x) !all(is.na(x))), ]

# Establish an agency data dictionary.
agency_dd <- obs_df$agency %>%
unique %>%
gsub("classified$|non-$", "", ., ignore.case = TRUE) %>%
trimws %>%
unique %>%
.[!is.na(.)] %>%
.[order(nchar(.), decreasing = TRUE)]

## Try to fill in NA's in col agency using the newly created data dict.
for (row in which(is.na(obs_df$agency))) {
#obs <- obs_df[row, ]
# Try to extract the job title value from the other values in vect.
obs_df[row, ] <- get_agency(obs_df[row, ], agency_dd)

# Fix instances in which a portion of a person's name was split off into a new
# row (when a string is too long for a single cell, the PDF will wrap the
# string into a second line within the "cell", and this causes pdftotext or R
# or both to read that as a completely new observation/row).
obs_df <- stitch_broken_name_strings(obs_df)

# Go back and extract/fill-in NA's within variable agency.
obs_df <- fill_missing_agency(obs_df)

# For agency values that have a leading single digit, the digit is actually
# part of the job_title value. Remove the digit from the agency value and
# append it to the end of the job_title value.
for (row in which(grepl("^\\d", obs_df$agency))) {
obs_df[row, ] <- digit_move(obs_df[row, ])
}

## Split up the values in the name variable, and create three new variables:
Expand Down
Binary file added id/2014/state_2014.7z
Binary file not shown.
Loading

0 comments on commit f023c42

Please sign in to comment.