Skip to content

Commit

Permalink
rm duplicate organization names w/o erasing truly unique feeds
Browse files Browse the repository at this point in the history
  • Loading branch information
amandaha8 committed Nov 27, 2024
1 parent 3cf21cb commit ef69f13
Show file tree
Hide file tree
Showing 14 changed files with 2,353 additions and 2,821 deletions.
4,959 changes: 2,167 additions & 2,792 deletions gtfs_digest/42_explore_ct_district.ipynb

Large diffs are not rendered by default.

167 changes: 162 additions & 5 deletions gtfs_digest/district_report.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,163 @@
").reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e733cb10-3185-4468-9ebe-1c2cb854cd21",
"metadata": {},
"outputs": [],
"source": [
"# De-duplicate organizations.\n",
"# Step 1: count the distinct `name` values per (caltrans_district, organization_name)\n",
"# and collect every organization_name that maps to 2 or more names.\n",
"orgs_agg = (\n",
"    operator_df.groupby([\"caltrans_district\", \"organization_name\"])[\"name\"]\n",
"    .nunique()\n",
"    .reset_index()\n",
")\n",
"orgs_agg2 = orgs_agg[orgs_agg[\"name\"] > 1]\n",
"orgs_with_2_names = list(orgs_agg2[\"organization_name\"].unique())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f2197f14-0623-46b4-bdda-a6da1581a9d2",
"metadata": {},
"outputs": [],
"source": [
"# Set the multi-name organizations aside: `operator_df2` keeps only the rows whose\n",
"# organization_name is NOT listed in `orgs_with_2_names`.\n",
"operator_df2 = operator_df[\n",
"    ~operator_df[\"organization_name\"].isin(orgs_with_2_names)\n",
"].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e7ee27ca-fafc-41a5-b15c-eead1c11785f",
"metadata": {},
"outputs": [],
"source": [
"# Rows for the organizations that have 2 or more names, kept in their own df.\n",
"orgs_with_2_names_df = operator_df[\n",
"    operator_df[\"organization_name\"].isin(orgs_with_2_names)\n",
"].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ad8f31e7-f7b0-4f8e-9ee8-6479ea1af654",
"metadata": {},
"outputs": [],
"source": [
"# Cutoff date: three months before the latest service_date in operator_df2.\n",
"# NOTE(review): operator_df2 excludes the multi-name organizations at this point;\n",
"# confirm the cutoff is meant to come from it rather than from operator_df.\n",
"three_month_reference = operator_df2.service_date.max() - pd.DateOffset(months=3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9d2e2807-801a-48f5-9933-3adaad4ebfef",
"metadata": {},
"outputs": [],
"source": [
"# Keep only rows dated on or after the three-month cutoff.\n",
"orgs_with_2_names_df = orgs_with_2_names_df.loc[\n",
"    orgs_with_2_names_df.service_date >= three_month_reference\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c93e5f3d-1b87-495d-a63e-2454d7089754",
"metadata": {},
"outputs": [],
"source": [
"# Re-count the distinct names per organization within the date-filtered rows.\n",
"# The count is stored as `n_names` and is used by the filter further down.\n",
"orgs_agg = (\n",
"    orgs_with_2_names_df.groupby([\"organization_name\"])[\"name\"]\n",
"    .nunique()\n",
"    .reset_index(name=\"n_names\")\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a81b77e8-90dd-4626-9077-45cb87ea2d0f",
"metadata": {},
"outputs": [],
"source": [
"# Attach the per-organization name count (`n_names`) to every row.\n",
"# Any `n_names` column left over from a previous run of this cell is dropped first;\n",
"# otherwise a re-run would produce `n_names_x` / `n_names_y` and break the\n",
"# `.n_names` filter that follows.\n",
"orgs_with_2_names_df = orgs_with_2_names_df.drop(\n",
"    columns=\"n_names\", errors=\"ignore\"\n",
").merge(orgs_agg, on=\"organization_name\", how=\"left\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4f0eccf0-3731-4f93-9d7d-c4cc54617bbb",
"metadata": {},
"outputs": [],
"source": [
"# Keep only rows where `vp_per_min_agency` and `spatial_accuracy_agency` are both\n",
"# greater than 0 and the organization still has 2+ distinct names.\n",
"orgs_with_2_names_df2 = orgs_with_2_names_df.loc[\n",
"    orgs_with_2_names_df[\"vp_per_min_agency\"].gt(0)\n",
"    & orgs_with_2_names_df[\"spatial_accuracy_agency\"].gt(0)\n",
"    & orgs_with_2_names_df[\"n_names\"].gt(1)\n",
"].reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "15712b8e-f547-4668-8f77-c9912207ef7b",
"metadata": {},
"outputs": [],
"source": [
"# Of those rows, keep only the ones on the most recent service_date found in\n",
"# operator_df2, and record which organizations they belong to.\n",
"service_date = operator_df2[\"service_date\"].max()\n",
"orgs_with_2_names_df3 = orgs_with_2_names_df2[\n",
"    orgs_with_2_names_df2[\"service_date\"].eq(service_date)\n",
"]\n",
"final_names = list(orgs_with_2_names_df3[\"organization_name\"].unique())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "67b1e0cd-7ecf-43ca-80eb-b9139f7f49f8",
"metadata": {},
"outputs": [],
"source": [
"# Drop every row of the organizations listed in `final_names`; the rows kept for\n",
"# those organizations are the ones in `orgs_with_2_names_df3`.\n",
"orgs_with_2_names_df = orgs_with_2_names_df[\n",
"    ~orgs_with_2_names_df[\"organization_name\"].isin(final_names)\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d591e85f-18f4-4613-8330-c1c6978c0507",
"metadata": {},
"outputs": [],
"source": [
"# Recombine the organizations not in `final_names` with the rows kept in\n",
"# `orgs_with_2_names_df3`.\n",
"orgs_with_2_names_df_final = pd.concat(\n",
"    [orgs_with_2_names_df, orgs_with_2_names_df3]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fcf1647b-892e-4323-8b69-e7bed38951b7",
"metadata": {},
"outputs": [],
"source": [
"# Add the de-duplicated multi-name organizations back to the rest of the data.\n",
"# The base is rebuilt from operator_df on every run (instead of appending to the\n",
"# existing operator_df2) so that re-running this cell cannot add the same rows\n",
"# a second time.\n",
"operator_df2 = pd.concat(\n",
"    [\n",
"        operator_df.loc[\n",
"            ~operator_df.organization_name.isin(orgs_with_2_names)\n",
"        ].reset_index(drop=True),\n",
"        orgs_with_2_names_df_final,\n",
"    ]\n",
")"
]
},
{
"cell_type": "markdown",
"id": "41f0a970-4646-4147-a596-d41d15517127",
Expand All @@ -110,7 +267,7 @@
"outputs": [],
"source": [
"district_summary = _report_utils.district_stats(\n",
" operator_df, \n",
" operator_df2, \n",
" \"caltrans_district\"\n",
")"
]
Expand Down Expand Up @@ -174,7 +331,7 @@
"metadata": {},
"outputs": [],
"source": [
"operators_in_district = operator_df.schedule_gtfs_dataset_key.unique()"
"operators_in_district = operator_df2.name.unique()"
]
},
{
Expand All @@ -186,7 +343,7 @@
"source": [
"operator_route_gdf = gpd.read_parquet(\n",
" f\"{RT_SCHED_GCS}{OPERATOR_ROUTE}.parquet\",\n",
" filters = [[\"schedule_gtfs_dataset_key\", \"in\", operators_in_district]],\n",
" filters = [[\"name\", \"in\", operators_in_district]],\n",
" columns = [\"name\", \"service_date\", \n",
" \"route_combined_name\", \"geometry\"]\n",
").sort_values(\n",
Expand Down Expand Up @@ -245,9 +402,9 @@
"metadata": {},
"outputs": [],
"source": [
"gtfs_table_df = operator_df[\n",
"gtfs_table_df = operator_df2[\n",
" shared_cols + gtfs_service_cols\n",
"].pipe(_report_utils.replace_column_names)"
"].pipe(_report_utils.replace_column_names).reset_index(drop = True)"
]
},
{
Expand Down
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown
Git LFS file not shown

0 comments on commit ef69f13

Please sign in to comment.