Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

IIJA Building CA Dataset #1376

Merged
merged 1 commit into from
Feb 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions dla/iija/_data_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ def update_program_code_list_2025():
# Load September 2023 codes
program_codes_sept_2023 = load_program_codes_sept_2023()

# Merge original + September first
# Merge original + September 2023 first
m1 = pd.merge(
program_codes_sept_2023,
original_codes_df,
Expand Down Expand Up @@ -179,7 +179,7 @@ def update_program_code_list_2025():
}
)

# Add program to another program names without the string "program"
# Add the string "program" to values in the column "program_name"
m2["program_name"] = m2.apply(add_program_to_row, axis=1)
return m2

Expand Down
106 changes: 65 additions & 41 deletions dla/iija/_script_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,13 +147,8 @@ def identify_agency(df, identifier_col):

return full_df

def identify_agency2(df: pd.DataFrame) -> pd.DataFrame:
"""
Fill in locodes, using the column rk_locode first
then using the original function from Natalie.
"""
# Load dataframe with locodes
locodes_df = to_snakecase(
def load_locodes()->pd.DataFrame:
df = to_snakecase(
pd.read_excel(
f"gs://calitp-analytics-data/data-analyses/dla/e-76Obligated/locodes_updated7122021.xlsx"
)
Expand All @@ -162,7 +157,54 @@ def identify_agency2(df: pd.DataFrame) -> pd.DataFrame:
"agency_name": "implementing_agency",
}
)
return df

def load_county() -> pd.DataFrame:
    """
    Load the county reference table.

    Reads the "County" sheet of "Copy of County.xlsx" under
    GCS_FILE_PATH (header taken from row index 1), snakecases the
    column names, keeps recipient_name, county_description and
    county_code, and appends " County" to every county_description.
    """
    keep_cols = ["recipient_name", "county_description", "county_code"]

    raw = pd.read_excel(
        f"{GCS_FILE_PATH}/Copy of County.xlsx", sheet_name="County", header=[1]
    )
    counties = to_snakecase(raw)[keep_cols]

    # Suffix the description with " County"; county_district_crosswalk
    # joins this column against the locodes county_name column.
    counties["county_description"] = counties["county_description"] + " County"
    return counties

def county_district_crosswalk() -> pd.DataFrame:
    """
    Build a county-to-district lookup table.

    The locodes dataset is reduced to its (district, county_name)
    pairs, which are then attached to the county table so each
    county row carries the district(s) it falls in.
    """
    locodes = load_locodes()
    counties = load_county()

    # Keep the district/county_name pairs, skipping "Multi-County"
    # entries and district 53.
    district_pairs = (
        locodes
        >> group_by(_.district, _.county_name)
        >> count(_.county_name)
        >> select(_.district, _.county_name)
        >> filter(_.county_name != "Multi-County", _.district != 53)
    )

    # Left merge keeps every county row, matched or not; county_name
    # only served as the join key, so it is dropped afterwards.
    crosswalk = counties.merge(
        district_pairs,
        how="left",
        left_on="county_description",
        right_on="county_name",
    )
    return crosswalk.drop(columns=["county_name"])

def identify_agency2(df: pd.DataFrame) -> pd.DataFrame:
"""
Fill in locodes, using the column rk_locode first
then use the original function from Natalie.
"""
# Load dataframe with locodes
locodes_df = load_locodes()

# Filter out for rows in which rk_locode is filled
filled_locode_df = df.loc[df.rk_locode.notna()].reset_index(drop=True)

Expand All @@ -181,7 +223,6 @@ def identify_agency2(df: pd.DataFrame) -> pd.DataFrame:
# Clean
filled_locode_df2 = filled_locode_df2.rename(
columns={
"agency_name": "implementing_agency",
"rk_locode": "implementing_agency_locode",
}
).drop(
Expand All @@ -199,50 +240,33 @@ def identify_agency2(df: pd.DataFrame) -> pd.DataFrame:
)

# Fill in summary_recipient_defined_text_field_1_value
missing_locode_df.summary_recipient_defined_text_field_1_value = (
missing_locode_df.summary_recipient_defined_text_field_1_value.fillna("None")
)
#missing_locode_df.summary_recipient_defined_text_field_1_value = (
# missing_locode_df.summary_recipient_defined_text_field_1_value.fillna("None")
#)

# Try add_name_from_locode from _data_utils
missing_locode_df2 = _data_utils.add_name_from_locode(
missing_locode_df, "summary_recipient_defined_text_field_1_value"
)

# Concat all the dataframes
final_df = pd.concat([filled_locode_df2, missing_locode_df2])
display("Do the # of rows match?")
display(len(final_df) == len(df))

# More cleaning
county_base = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}/Copy of County.xlsx", sheet_name='County', header=[1]))
county_base.drop(columns =['unnamed:_0', 'unnamed:_4'], axis=1, inplace=True)
county_base['county_description'] = county_base['county_description'] + " County"
# Manually add in info for any rows that are still missing info
county_info = county_district_crosswalk()

county_district = (
locodes_df
>> group_by(_.district, _.county_name)
>> count(_.county_name)
>> select(_.district, _.county_name)
>> filter(_.county_name != "Multi-County", _.district != 53)
)
county_info = pd.merge(
county_base,
county_district,
how="left",
left_on="county_description",
right_on="county_name",
).drop(columns=["county_name"])
mapping1 = dict(county_info[["county_code", "county_description"]].values)
mapping2 = dict(county_info[["county_code", "recipient_name"]].values)
mapping3 = dict(county_info[["county_code", "district"]].values)

final_df["county_description"] = final_df.county_code.map(mapping1)
final_df["recipient_name"] = final_df.county_code.map(mapping2)
final_df["district"] = final_df.county_code.map(mapping3)

final_df.loc[
final_df.county_name == "Statewide County", "county_name"] = "Statewide"
missing_locode_df2["county_description"] = missing_locode_df2.county_code.map(mapping1)
missing_locode_df2["district"] = missing_locode_df2.county_code.map(mapping3)
missing_locode_df2["implementing_agency"] = missing_locode_df2.county_code.map(mapping2)

# Concat all the dataframes
final_df = pd.concat([filled_locode_df2, missing_locode_df2])
display("Do the # of rows match?")
display(len(final_df) == len(df))

# Clean & fill in nans with Unknown
final_df.loc[final_df.county_name == "Statewide County", "county_name"] = "Statewide"
final_df["implementing_agency"] = final_df[
"implementing_agency"
].fillna(value="Unknown")
Expand Down Expand Up @@ -599,7 +623,7 @@ def get_clean_data(df, full_or_agg = ''):
aggdf = add_new_description_col(aggdf)

##asserting that there is one row for each project id in the new dataframe
assert len(aggdf) == df.project_number.nunique()
display(len(aggdf) == df.project_number.nunique())

return aggdf

Expand Down
Loading
Loading