-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathFinal Code 1.Rmd
550 lines (464 loc) · 18.2 KB
/
Final Code 1.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
---
title: "ACS Table"
author: "Shiheng Xu"
date: "2024-11-11"
output: html_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```
### load packages
```{r}
library(dplyr)
library(readr)
library(stringr)
library(purrr)
# Set your main directory path
main_dir <- "/Users/xushiheng/Downloads"
extract_year <- function(file_name) {
str_extract(file_name, "\\d{4}")
}
```
### Define a function to abstract data
```{r}
library(dplyr)
library(readr)
library(tidyr) # Ensure `replace_na()` is available
process_variable_folder <- function(folder_name, column_name, sum_columns = NULL, new_sum_column_name = NULL) {
folder_path <- file.path(main_dir, folder_name)
files <- list.files(folder_path, full.names = TRUE, pattern = "-Data.csv$")
data_list <- map(files, function(file) {
# Extract year from file name
year <- extract_year(basename(file))
# Read data, ensuring the first row is treated as column names
data <- read_csv(file, col_names = TRUE, show_col_types = FALSE)
# Check if GEO_ID column exists
if (!"GEO_ID" %in% names(data)) {
stop("The column 'GEO_ID' is missing in file:", file)
}
# Add Year column
data <- data %>%
mutate(Year = as.integer(year))
# If sum_columns are provided, calculate the sum and rename the resulting column
if (!is.null(sum_columns) && !is.null(new_sum_column_name)) {
# Check if all required columns exist in the current file
available_columns <- sum_columns[sum_columns %in% names(data)]
if (length(available_columns) > 0) {
if (folder_name == "B25106") {
# Convert columns to numeric, replacing non-numeric values with 0
data <- data %>%
mutate(across(all_of(available_columns), ~ suppressWarnings(as.numeric(.)), .names = "numeric_{col}")) %>%
mutate(across(starts_with("numeric_"), ~ replace_na(., 0))) %>%
rowwise() %>%
mutate(!!new_sum_column_name := sum(c_across(starts_with("numeric_")), na.rm = TRUE)) %>%
ungroup() %>%
select(GEO_ID, Year, !!new_sum_column_name)
} else {
# For other folders, keep NA in non-numeric entries
data <- data %>%
mutate(across(all_of(available_columns), as.numeric)) %>%
rowwise() %>%
mutate(!!new_sum_column_name := sum(c_across(all_of(available_columns)), na.rm = TRUE)) %>%
ungroup() %>%
select(GEO_ID, Year, !!new_sum_column_name)
}
} else {
warning("None of the specified columns are available in file: ", file)
data <- data %>%
mutate(!!new_sum_column_name := NA) %>%
select(GEO_ID, Year, !!new_sum_column_name)
}
} else {
# For other folders, select only the required columns
data <- data %>%
select(GEO_ID, Year, all_of(column_name))
}
return(data)
})
# Bind rows from all years
bind_rows(data_list)
}
```
```{r}
#-------------------------------------VERSION 1--------------------------------------------
# # Total Population
# population_data <- process_variable_folder("B01003", "B01003_001E")
#
# # Median Gross Rent
# rent_data <- process_variable_folder("B25064", "B25064_001E")
#
# # Median Household Income
# income_data <- process_variable_folder("B19013", "B19013_001E")
#
# # Poverty Rate
# poverty_data <- process_variable_folder("B17001", "B17001_001E")
#
# # Rental Vacancy Rate
# vacancy_data <- process_variable_folder("B25004", "B25004_001E")
#
# # Percent Renter Households
# renter_data <- process_variable_folder("B25008", "B25008_001E")
#
# # Percent Cost-Burdened Renters (summing specified columns and renaming)
# cost_burdened_data <- process_variable_folder(
# "B25106",
# column_name = NULL, # Not needed for this case
# sum_columns = c("B25106_028E", "B25106_032E", "B25106_036E", "B25106_040E", "B25106_044E"),
# new_sum_column_name = "B25106_001E"
# )
#
# # Unemployment Rate
# unemployment_data <- process_variable_folder("B23001", "B23001_001E")
```
### Only select indicated vairables
```{r}
#--------------------------------------VERSION 2----------------------------------------------
# Total Population
population_data <- process_variable_folder("B01003", "B01003_001E")
# Median Gross Rent
rent_data <- process_variable_folder("B25064", "B25064_001E")
# Median Household Income
income_data <- process_variable_folder("B19013", "B19013_001E")
# Poverty Rate
poverty_data <- process_variable_folder("B17001", "B17001_002E")
# Vacancy Rate Variables
total_housing_data <- process_variable_folder("B25002", "B25002_001E") # Total housing units
vacant_housing_data <- process_variable_folder("B25002", "B25002_003E") # Vacant housing units
# Percent Renter Households
renter_data <- process_variable_folder("B25003", "B25003_003E") # Renter-occupied housing units
total_occupied_data <- process_variable_folder("B25003", "B25003_001E") # Total occupied housing units
# Percent Cost-Burdened Renters
cost_burdened_data <- process_variable_folder(
"B25106",
column_name = NULL, # Not needed for this case
sum_columns = c("B25106_028E", "B25106_032E", "B25106_036E", "B25106_040E", "B25106_044E"),
new_sum_column_name = "B25106_001E"
)
# Unemployment Rate Variables
total_labor_force_data <- process_variable_folder("B23025", "B23025_003E") # Total labor force
unemployed_data <- process_variable_folder("B23025", "B23025_005E") # Unemployed individuals
# Duplicate 2011 data for 2010 in total labor force
total_labor_force_2011 <- total_labor_force_data %>%
filter(Year == 2011)
total_labor_force_2010 <- total_labor_force_2011 %>%
mutate(Year = 2010)
# Add duplicated 2010 data to total labor force
total_labor_force_data <- total_labor_force_data %>%
bind_rows(total_labor_force_2010)
# Duplicate 2011 data for 2010 in unemployed data
unemployed_2011 <- unemployed_data %>%
filter(Year == 2011)
unemployed_2010 <- unemployed_2011 %>%
mutate(Year = 2010)
# Add duplicated 2010 data to unemployed individuals
unemployed_data <- unemployed_data %>%
bind_rows(unemployed_2010)
```
```{r}
#----------------------------------------VERSION 1-------------------------------------------------
# # Merge all variable-specific datasets by GEO_ID and Year
# final_data <- population_data %>%
# left_join(rent_data, by = c("GEO_ID", "Year")) %>%
# left_join(income_data, by = c("GEO_ID", "Year")) %>%
# left_join(poverty_data, by = c("GEO_ID", "Year")) %>%
# left_join(vacancy_data, by = c("GEO_ID", "Year")) %>%
# left_join(renter_data, by = c("GEO_ID", "Year")) %>%
# left_join(cost_burdened_data, by = c("GEO_ID", "Year")) %>%
# left_join(unemployment_data, by = c("GEO_ID", "Year"))
#
# # Inspect the merged dataset
# head(final_data)
```
### Merge all dataset together
```{r}
#--------------------------------------VERSION 2-------------------------------------------------------------
# Merge all variable-specific datasets by GEO_ID and Year
final_data_2 <- population_data %>%
left_join(rent_data, by = c("GEO_ID", "Year")) %>%
left_join(income_data, by = c("GEO_ID", "Year")) %>%
left_join(poverty_data, by = c("GEO_ID", "Year")) %>%
left_join(total_housing_data, by = c("GEO_ID", "Year")) %>% # For Vacancy Rate
left_join(vacant_housing_data, by = c("GEO_ID", "Year")) %>% # For Vacancy Rate
left_join(renter_data, by = c("GEO_ID", "Year")) %>% # For Renter Household Rate
left_join(total_occupied_data, by = c("GEO_ID", "Year")) %>% # For Renter Household Rate
left_join(cost_burdened_data, by = c("GEO_ID", "Year")) %>% # For Cost-Burdened Rate
left_join(total_labor_force_data, by = c("GEO_ID", "Year")) %>% # For Unemployment Rate
left_join(unemployed_data, by = c("GEO_ID", "Year")) # For Unemployment Rate
# Inspect the merged dataset
head(final_data_2)
```
### Extend data 2007-2023
```{r}
# Duplicate 2010 data for 2007, 2008, and 2009
data_2010_2 <- final_data_2 %>%
filter(Year == 2010)
# Create duplicates for 2007, 2008, and 2009
data_2007_2009_2 <- data_2010_2 %>%
mutate(Year = 2007) %>%
bind_rows(mutate(data_2010_2, Year = 2008)) %>%
bind_rows(mutate(data_2010_2, Year = 2009))
# Duplicate 2022 data for 2023
data_2022_2 <- final_data_2 %>%
filter(Year == 2022)
data_2023_2 <- data_2022_2 %>%
mutate(Year = 2023)
# Combine original data with the new duplicated years
final_data_extended_2 <- final_data_2 %>%
bind_rows(data_2007_2009_2) %>%
bind_rows(data_2023_2)
# Inspect the final extended dataset
head(final_data_extended_2)
```
### Mapping GEO_ID to CoC
```{r}
# Load the mapping files and rename GEOID to GEO_ID, then select only relevant columns
geo_to_coc_2022 <- read_csv(file.path(main_dir, "tract_coc_match_2022.csv")) %>%
rename(GEO_ID = GEOID) %>%
select(GEO_ID, COCNUM)
geo_to_coc_2019 <- read_csv(file.path(main_dir, "tract_coc_match_to_2019_tracts.csv")) %>%
rename(GEO_ID = GEOID) %>%
select(GEO_ID, COCNUM)
# Modify final_data_extended to keep only the last 11 digits of GEO_ID
final_data_extended_2 <- final_data_extended_2 %>%
mutate(GEO_ID = substr(GEO_ID, nchar(GEO_ID) - 10, nchar(GEO_ID)))
# Split the dataset into two subsets: years <= 2019 and years >= 2020
final_data_pre_2020_2 <- final_data_extended_2 %>%
filter(Year <= 2019)
final_data_post_2019_2 <- final_data_extended_2 %>%
filter(Year >= 2020)
# Merge GEO_ID to CoC_Number for years <= 2019 using geo_to_coc_2019
final_data_pre_2020_2 <- final_data_pre_2020_2 %>%
left_join(geo_to_coc_2019, by = "GEO_ID") %>%
rename(CoC_Number = COCNUM)
# Merge GEO_ID to CoC_Number for years >= 2020 using geo_to_coc_2022
final_data_post_2019_2 <- final_data_post_2019_2 %>%
left_join(geo_to_coc_2022, by = "GEO_ID") %>%
rename(CoC_Number = COCNUM)
# Combine the two subsets back together
final_data_with_coc_2 <- bind_rows(final_data_pre_2020_2, final_data_post_2019_2)
# Count the number of unmatched GEO_IDs (those without a CoC Number) for each year
unmatched_geo_count_by_year_2 <- final_data_with_coc_2 %>%
filter(is.na(CoC_Number)) %>%
group_by(Year) %>%
summarise(unmatched_count = n())
# Print the count of unmatched GEO_IDs by year
print(unmatched_geo_count_by_year_2)
# Inspect a few rows of the final merged dataset
head(final_data_with_coc_2)
```
### Cleaning dataset
```{r}
library(dplyr)
# Step 1: Identify columns starting with "B"
columns_of_interest <- names(final_data_with_coc_2)[grepl("^B", names(final_data_with_coc_2))]
# Step 2: Dataset with at least one NA, non-numeric, or 0
rows_with_issues <- final_data_with_coc_2 %>%
rowwise() %>%
filter(any(sapply(across(all_of(columns_of_interest)), function(col) {
val <- suppressWarnings(as.numeric(col)) # Convert to numeric
is.na(val) || val == 0 # Check for NA or 0
}))) %>%
ungroup()
# Step 3: Dataset with all other rows
clean_rows <- final_data_with_coc_2 %>%
rowwise() %>%
filter(all(sapply(across(all_of(columns_of_interest)), function(col) {
val <- suppressWarnings(as.numeric(col)) # Convert to numeric
!is.na(val) && val != 0 # Keep only numeric and non-zero values
}))) %>%
ungroup()
```
### Aggregating dataset
```{r}
library(dplyr)
# Aggregate clean_rows to CoC level
aggregated_clean_rows <- clean_rows %>%
group_by(CoC_Number, Year) %>%
summarise(
# Sum variables (column names remain unchanged)
B01003_001E = sum(as.numeric(B01003_001E), na.rm = TRUE), # Total Population
B17001_002E = sum(as.numeric(B17001_002E), na.rm = TRUE), # Total Poverty Population
B25002_001E = sum(as.numeric(B25002_001E), na.rm = TRUE), # Total Housing Units
B25002_003E = sum(as.numeric(B25002_003E), na.rm = TRUE), # Vacant Housing Units
B25003_003E = sum(as.numeric(B25003_003E), na.rm = TRUE), # Renter Occupied Units
B25003_001E = sum(as.numeric(B25003_001E), na.rm = TRUE), # Total Occupied Units
B25106_001E = sum(as.numeric(B25106_001E), na.rm = TRUE), # Cost Burdened Households
B23025_003E = sum(as.numeric(B23025_003E), na.rm = TRUE), # Civilian Labor Force
B23025_005E = sum(as.numeric(B23025_005E), na.rm = TRUE), # Unemployed Individuals
# Population-weighted averages (column names remain unchanged)
B25064_001E = sum(as.numeric(B25064_001E) * as.numeric(B01003_001E), na.rm = TRUE) /
sum(as.numeric(B01003_001E), na.rm = TRUE), # Weighted Median Rent
B19013_001E = sum(as.numeric(B19013_001E) * as.numeric(B01003_001E), na.rm = TRUE) /
sum(as.numeric(B01003_001E), na.rm = TRUE), # Weighted Median Income
.groups = "drop" # Explicitly ungroup the data after summarisation
)
# Inspect the aggregated dataset
head(aggregated_clean_rows)
write_csv(aggregated_clean_rows, file.path(main_dir, "aggregated_clean_rows_by_CoC.csv"))
```
### Calculate Rates
```{r}
# Add calculated rate columns, skipping rows where Year is non-numeric
# final_data_with_coc_2 <- final_data_with_coc_2 %>%
library(dplyr)
# Add calculated rate columns to aggregated_clean_rows
aggregated_clean_rows <- aggregated_clean_rows %>%
mutate(
# Unemployment Rate
Unemployment_Rate = if_else(
!is.na(B23025_003E) & B23025_003E > 0,
(B23025_005E / B23025_003E) * 100,
NA_real_
),
# Vacancy Rate
Vacancy_Rate = if_else(
!is.na(B25002_001E) & B25002_001E > 0,
(B25002_003E / B25002_001E) * 100,
NA_real_
),
# Renter Household Rate
Renter_Household_Rate = if_else(
!is.na(B25003_001E) & B25003_001E > 0,
(B25003_003E / B25003_001E) * 100,
NA_real_
),
# Cost-Burdened Renter Rate
Cost_Burdened_Rate = if_else(
!is.na(B25003_003E) & B25003_003E > 0,
(B25106_001E / B25003_003E) * 100,
NA_real_
),
# Poverty Rate
Poverty_Rate = if_else(
!is.na(B01003_001E) & B01003_001E > 0,
(B17001_002E / B01003_001E) * 100,
NA_real_
)
)
# Inspect the updated dataset
head(aggregated_clean_rows)
```
### Adding descrption
```{r}
# # Convert all columns in final_data_with_coc to character for binding
# final_data_with_coc <- final_data_with_coc %>%
# mutate(across(everything(), as.character))
#
# final_data_with_coc <- final_data_with_coc %>%
# slice(-1)
# # Define the descriptive row in the correct order with uppercase column names
# descriptive_row <- data.frame(
# GEO_ID = "Geography",
# Year = "Year of data",
# B01003_001E = "Total Population",
# B25064_001E = "Median Gross Rent (Dollars)",
# B19013_001E = "Median Household Income in the Past 12 Months (in 2022 Inflation-Adjusted Dollars)",
# B17001_001E = "Poverty Status in the Past 12 Months by Sex by Age",
# B25004_001E = "Total Vacancy Status",
# B25008_001E = "Total Population in Occupied Housing Units by Tenure",
# B25106_001E = "Tenure by Housing Costs greater than 30 percent of Household Income in the Past 12 Months",
# B23001_001E = "Sex by Age by Employment Status for the Population 16 Years and Over",
# CoC_Number = "Continuum of Care (CoC) identification number"
# )
#
# # Ensure that `descriptive_row` matches the structure of `final_data_with_coc`
# descriptive_row <- descriptive_row[1, , drop = FALSE] # Ensure it is a single-row data frame
#
# # Bind the descriptive row as the second row in the dataset
# final_data_with_descriptions <- bind_rows(descriptive_row, final_data_with_coc)
#
# # View the first few rows to confirm the structure
# head(final_data_with_descriptions, n = 5)
#
# # Save the dataset with descriptions as the second row if needed
# write_csv(final_data_with_descriptions, file.path(main_dir, "final_data_with_descriptions.csv"))
```
```{r}
library(readxl)
library(dplyr)
library(openxlsx)
```
### Merging to PIT
```{r}
library(readxl)
library(dplyr)
# Path to the .xlsx file
pit_counts_path <- file.path(main_dir, "2007-2023-PIT-Counts-by-CoC.xlsx")
# Get the sheet names (years)
sheet_names <- excel_sheets(pit_counts_path)
# Initialize an empty list to store data from each sheet
pit_data_list <- list()
# Loop through each sheet, extract data, and add to the list
for (sheet in sheet_names) {
# Read data from the current sheet
sheet_data <- read_excel(pit_counts_path, sheet = sheet)
# Ensure all required columns are present
required_columns <- c(
"Overall Homeless",
"Overall Homeless Individuals",
"Overall Homeless People in Families",
"Unsheltered Homeless",
"Sheltered Total Homeless",
"CoC Number"
)
if (all(required_columns %in% colnames(sheet_data))) {
# Add Year column, rename `CoC Number` to `CoC_Number`, and select relevant columns
sheet_data <- sheet_data %>%
mutate(Year = as.integer(sheet)) %>% # Add Year from sheet name
rename(CoC_Number = `CoC Number`) %>% # Rename the column
select(
Year,
CoC_Number,
`Overall Homeless`,
`Overall Homeless Individuals`,
`Overall Homeless People in Families`,
`Unsheltered Homeless`,
`Sheltered Total Homeless`
)
# Append to the list
pit_data_list[[sheet]] <- sheet_data
}
}
# Combine all sheets into a single data frame
pit_counts_combined <- bind_rows(pit_data_list)
# Merge the PIT counts data with aggregated_clean_rows
ACS_PIT <- aggregated_clean_rows %>%
left_join(pit_counts_combined, by = c("Year", "CoC_Number"))
# Inspect the merged dataset
head(ACS_PIT)
```
```{r}
# # Convert Year in final_data_with_descriptions to integer
# final_data_with_coc_2 <- final_data_with_coc_2 %>%
# mutate(Year = as.integer(Year))
#
# # Merge datasets
# merged_data_2 <- final_data_with_coc_2 %>%
# left_join(pit_counts_combined, by = c("Year", "CoC_Number"))
# merged_data_2 <- merged_data_2 %>%
# slice(-1)
#
# # Inspect the merged dataset
# head(merged_data_2)
#
# # Save the merged dataset to a CSV file
# write_csv(merged_data_2, file.path(main_dir, "final_data_with_pit_counts_2.csv"))
```
### Mering to HIC
```{r}
# Path to the new file
hic_counts_path <- file.path(main_dir, "2007-2023-HIC-Counts-by-CoC_merged.csv")
# Load the new dataset
hic_counts_data <- read_csv(hic_counts_path)
# Rename the column in the HIC dataset to match `merged_data_2`
hic_counts_data <- hic_counts_data %>%
rename(CoC_Number = `CoC Number`)
# Inspect the updated HIC dataset
head(hic_counts_data)
# Perform the merge
ACS_PIT_HIC <- ACS_PIT %>%
left_join(hic_counts_data, by = c("Year", "CoC_Number"))
# Inspect the merged dataset
head(ACS_PIT_HIC)
write_csv(ACS_PIT_HIC, file.path(main_dir, "ACS_PIT_HIC.csv"))
```