Skip to content

Commit

Permalink
install missing packages, script for project cycle
Browse files Browse the repository at this point in the history
  • Loading branch information
pr130 committed Sep 27, 2023
1 parent 0716543 commit 6c19aa6
Show file tree
Hide file tree
Showing 3 changed files with 348 additions and 586 deletions.
163 changes: 163 additions & 0 deletions project_cycle.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
library(kbtbr)
library(readr)
library(stringr)
library(dplyr)
library(here)
library(tidyr)
library(janitor)
# for reports
library(rmarkdown)
library(forcats)
library(ggplot2)
library(correltools)
library(patchwork)

source("utils.R") # custom functions
# Please refer to the readme for what the project id is.
# This value should be used as a label within the kobo form - for both
# questions. It is used further down to select the applications that belong
# to the current project cycle.
PROJECTCYCLE_PREFIX <- "2023-10" # EDIT HERE

# kobo instance
kobo <- kbtbr::Kobo$new("kobo.correlaid.org")
all_surveys <- kobo$get_surveys()

# get survey id: look the application form up by its display name
survey_name <- "Applications for CorrelAid Projects"
survey_id <- all_surveys %>%
  dplyr::filter(name == survey_name) %>%
  dplyr::pull(uid)

# Fail early with a readable message: zero matches (form renamed or not
# shared with this account) or several matches would otherwise only surface
# as an obscure error inside get_submissions().
if (length(survey_id) != 1) {
  stop(
    "Expected exactly one survey named '", survey_name, "', found ",
    length(survey_id), ".",
    call. = FALSE
  )
}

# download all submissions of the application form
applications <- kobo$get_submissions(survey_id)


# KoboToolbox returns the data in a very wide format, so quite a bit of data
# wrangling is needed.
# This is code from the clean_kobo function in projectutils:
# https://github.com/CorrelAid/projectutils/blob/cd118871ae5d50c5116fd86935aa11e95a4edf25/R/applications.R#L29
# It is copied here to allow for easier modifications to the code.

# rename
applications <- applications %>%
  dplyr::rename(
    applicant_id = `_id`,
    motivation_why_involved = motivation_why
  )

# If the gender self identification variable does not exist, create it and
# fill it with NA. It is created as *character* NA because it is later
# coalesced with the character `gender` column; mutate() also copes with a
# data frame that has zero rows, which `$<-` with a scalar does not.
if (!"gender_self_identification" %in% colnames(applications)) {
  applications <- applications %>%
    dplyr::mutate(gender_self_identification = NA_character_)
}

# One applicant may apply to several projects in a single submission;
# KoboToolbox stores those ids space-separated in one cell.
# --> build a data frame with one row per applicant and project id, keeping
#     the original multi-project string in `applied_to`.
project_ids_df <- local({
  applied <- dplyr::select(applications, applicant_id, project_id)
  applied <- dplyr::mutate(applied, applied_to = project_id)
  # split the space-separated ids into separate rows
  applied <- tidyr::separate_rows(applied, project_id, sep = " ")
  # bring the ids into one common format (helper from utils.R)
  applied <- dplyr::mutate(
    applied,
    project_id = unify_project_id_formats(project_id)
  )
  dplyr::distinct(applied)
})

# Project role: the form has one "project_role*" column per project.
# --> reshape into a long data frame with one row per applicant and project.
project_roles_df <- local({
  roles_wide <- dplyr::select(
    applications,
    applicant_id,
    dplyr::starts_with("project_role")
  )
  roles_long <- tidyr::pivot_longer(
    roles_wide,
    dplyr::starts_with("project_role"),
    names_to = "project_id_unclean",
    values_to = "project_role"
  )
  # derive the project id from the Kobo column name (helper from utils.R)
  roles_long <- dplyr::mutate(
    roles_long,
    project_id = extract_ids_from_kobo_columnnames(project_id_unclean)
  )
  # drop rows whose role is "DNA" (filter() also drops rows with an NA role)
  roles_long <- dplyr::filter(roles_long, project_role != "DNA")
  # de-duplicate while the raw column name is still present, then remove it
  roles_long <- dplyr::distinct(roles_long)
  dplyr::select(roles_long, -project_id_unclean)
})

# Personal information and skills: pick the relevant columns (renaming
# email_address to email), drop duplicate rows and clean the column names.
personal_info_df <- dplyr::distinct(
  dplyr::select(
    applications,
    applicant_id,
    dplyr::starts_with("gender"),
    first_name,
    last_name,
    email = email_address,
    german_skills,
    dplyr::starts_with("rating"),
    dplyr::starts_with("motivation"),
    consent_privacy_policy,
    dplyr::starts_with("past_")
  )
)
personal_info_df <- janitor::clean_names(personal_info_df)

# columns starting with "rating_technologies_tools": replace that part of
# the name with "skills"
personal_info_df <- dplyr::rename_with(
  personal_info_df,
  function(col_name) {
    stringr::str_replace_all(col_name, "rating_technologies_tools", "skills")
  },
  dplyr::starts_with("rating_technologies_tools")
)

# remaining columns starting with "rating": remove "rating_" from the name
personal_info_df <- dplyr::rename_with(
  personal_info_df,
  function(col_name) {
    stringr::str_replace_all(col_name, "rating_", "")
  },
  dplyr::starts_with("rating")
)

# gender: the value "self_identification" is only a marker, so blank it out
# and then fill the gaps with the self identification column
personal_info_df <- dplyr::mutate(
  personal_info_df,
  gender = dplyr::if_else(
    gender == "self_identification",
    NA_character_,
    gender
  ),
  gender = dplyr::coalesce(gender, gender_self_identification)
)

# join the data frames: start from the applicant/project pairs and attach
# the project role and the personal information
cleaned_df <- project_ids_df %>%
  dplyr::left_join(project_roles_df, by = c("applicant_id", "project_id")) %>%
  dplyr::left_join(personal_info_df, by = "applicant_id")

# overview of all project ids present in the data
# (explicit print() so the table also shows up when the script is source()d)
print(table(cleaned_df$project_id))

# filter for projectcycle
# fixed(): the prefix is a literal string, not a regular expression, so
# characters such as "." in an edited prefix cannot match unintended ids
applications_proj_cycle <- cleaned_df %>%
  dplyr::filter(
    stringr::str_starts(project_id, stringr::fixed(PROJECTCYCLE_PREFIX))
  )

# without any matching application the rest of the script silently does
# nothing, which usually means that PROJECTCYCLE_PREFIX is wrong
if (nrow(applications_proj_cycle) == 0) {
  warning(
    "No applications found for project cycle prefix '",
    PROJECTCYCLE_PREFIX, "'.",
    call. = FALSE
  )
}

## how many people applied
print(dplyr::n_distinct(applications_proj_cycle$applicant_id))
# number of applications for each project
print(table(applications_proj_cycle$project_id))



# FOR EACH PROJECT - CREATE REPORT AND SAVE DATASETS
# Per project id this writes applications_anon.csv and mapping.csv into
# projects/<project id>/ and renders the applications report into the same
# folder.
project_ids <- unique(applications_proj_cycle$project_id)

for (PROJECT_ID in project_ids) {
  PROJECT_FOLDER <- here::here("projects", PROJECT_ID)
  # create folder for project; recursive = TRUE also creates the parent
  # "projects" folder if it does not exist yet (e.g. on a fresh checkout)
  if (!dir.exists(PROJECT_FOLDER)) {
    dir.create(PROJECT_FOLDER, recursive = TRUE)
  }

  # now finally filter for our project!
  # .env$ makes sure the loop variable is used and not a column of that name
  appl_proj <- applications_proj_cycle %>%
    dplyr::filter(project_id == .env$PROJECT_ID)

  # reorder the columns: identifying information first, then project
  # related columns, then skills / techniques / topics, then the rest
  appl_proj <- appl_proj %>%
    dplyr::select(
      applicant_id,
      gender,
      email,
      dplyr::ends_with("name"),
      dplyr::starts_with("project"),
      applied_to,
      dplyr::starts_with("past"),
      dplyr::starts_with("skills"),
      dplyr::starts_with("techniques"),
      dplyr::starts_with("topics"),
      dplyr::everything()
    )

  # anonymize and save: drop email and names, keep applicant_id as the key
  appl_anon <- appl_proj %>%
    dplyr::select(-email, -first_name, -last_name)

  # PROJECT_FOLDER is already an absolute path, so plain file.path() is
  # enough to build the file names
  anon_path <- file.path(PROJECT_FOLDER, "applications_anon.csv")
  readr::write_csv(appl_anon, anon_path)

  # mapping from email / name to applicant_id
  mapping <- appl_proj %>%
    dplyr::select(applicant_id, email, first_name, last_name)
  mapping_path <- file.path(PROJECT_FOLDER, "mapping.csv")
  readr::write_csv(mapping, mapping_path)
  # google sheets upload
  # TODO

  # knit report: the template receives the project id and the path of the
  # anonymized csv as parameters
  rmarkdown::render(
    here::here("templates/template_applications_report.Rmd"),
    output_dir = PROJECT_FOLDER,
    params = list(project_id = PROJECT_ID, anon_path = anon_path)
  )
}
Loading

0 comments on commit 6c19aa6

Please sign in to comment.