Skip to content

Commit d107b54

Browse files
authored
Merge pull request #1750 from cal-itp/1726-mtc-copy-files-script
1726 mtc copy files script
2 parents 4a01fb6 + a40e60c commit d107b54

File tree

1 file changed

+356
-0
lines changed

1 file changed

+356
-0
lines changed
Lines changed: 356 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,356 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 2,
6+
"id": "701f2fa1-cdec-4027-a35c-22160c82869d",
7+
"metadata": {
8+
"tags": []
9+
},
10+
"outputs": [],
11+
"source": [
12+
"import geopandas as gpd\n",
13+
"import pandas as pd\n",
14+
"from calitp_data_analysis.gcs_geopandas import GCSGeoPandas\n",
15+
"from shared_utils import catalog_utils, rt_dates, portfolio_utils\n",
16+
"from segment_speed_utils import helpers\n",
17+
"g = GCSGeoPandas()"
18+
]
19+
},
20+
{
21+
"cell_type": "markdown",
22+
"id": "45583f17-cf3c-4580-9231-9e3b410d0c75",
23+
"metadata": {},
24+
"source": [
25+
"# Copy speedmaps data into public bucket to support MTC"
26+
]
27+
},
28+
{
29+
"cell_type": "code",
30+
"execution_count": 3,
31+
"id": "58fe3aeb-5490-4744-a5b5-b315166c9c92",
32+
"metadata": {
33+
"tags": []
34+
},
35+
"outputs": [
36+
{
37+
"data": {
38+
"text/plain": [
39+
"['2025-09-23',\n",
40+
" '2025-09-24',\n",
41+
" '2025-09-25',\n",
42+
" '2025-10-14',\n",
43+
" '2025-10-15',\n",
44+
" '2025-10-16']"
45+
]
46+
},
47+
"execution_count": 3,
48+
"metadata": {},
49+
"output_type": "execute_result"
50+
}
51+
],
52+
"source": [
53+
"relevant_dates = [\n",
54+
" rt_dates.DATES[i]\n",
55+
" for i in [\"sep2025a\", \"sep2025\", \"sep2025b\", \"oct2025a\", \"oct2025\", \"oct2025b\"]\n",
56+
"]\n",
57+
"relevant_dates"
58+
]
59+
},
60+
{
61+
"cell_type": "code",
62+
"execution_count": 4,
63+
"id": "698f3c19-afc0-4d32-bdfb-e934a1aebd4d",
64+
"metadata": {
65+
"tags": []
66+
},
67+
"outputs": [],
68+
"source": [
69+
"gtfs_yml = catalog_utils.get_catalog(\"gtfs_analytics_data\")\n",
70+
"aggregated_parent = (\n",
71+
" f\"{gtfs_yml.speedmap_segments.dir}{gtfs_yml.speedmap_segments.segment_timeofday}\"\n",
72+
")\n",
73+
"trip_speeds_parent = (\n",
74+
" f\"{gtfs_yml.speedmap_segments.dir}{gtfs_yml.speedmap_segments.stage4}\"\n",
75+
")"
76+
]
77+
},
78+
{
79+
"cell_type": "code",
80+
"execution_count": 5,
81+
"id": "5b2dfe84-3231-41e0-8a4c-38d89c339f98",
82+
"metadata": {
83+
"tags": []
84+
},
85+
"outputs": [
86+
{
87+
"data": {
88+
"text/plain": [
89+
"['gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-09-23.parquet',\n",
90+
" 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-09-24.parquet',\n",
91+
" 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-09-25.parquet',\n",
92+
" 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-10-14.parquet',\n",
93+
" 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-10-15.parquet',\n",
94+
" 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-10-16.parquet']"
95+
]
96+
},
97+
"execution_count": 5,
98+
"metadata": {},
99+
"output_type": "execute_result"
100+
}
101+
],
102+
"source": [
103+
"# Get aggregated speedmaps segments paths\n",
104+
"aggregated_uris = [f\"{aggregated_parent}_{date}.parquet\" for date in relevant_dates]\n",
105+
"aggregated_uris"
106+
]
107+
},
108+
{
109+
"cell_type": "code",
110+
"execution_count": 6,
111+
"id": "66efcd91-9d18-455a-82c7-0124b17ad675",
112+
"metadata": {},
113+
"outputs": [
114+
{
115+
"data": {
116+
"text/plain": [
117+
"[{'date': '2025-09-23',\n",
118+
" 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-09-23.parquet'},\n",
119+
" {'date': '2025-09-24',\n",
120+
" 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-09-24.parquet'},\n",
121+
" {'date': '2025-09-25',\n",
122+
" 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-09-25.parquet'},\n",
123+
" {'date': '2025-10-14',\n",
124+
" 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-10-14.parquet'},\n",
125+
" {'date': '2025-10-15',\n",
126+
" 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-10-15.parquet'},\n",
127+
" {'date': '2025-10-16',\n",
128+
" 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-10-16.parquet'}]"
129+
]
130+
},
131+
"execution_count": 6,
132+
"metadata": {},
133+
"output_type": "execute_result"
134+
}
135+
],
136+
"source": [
137+
"# Get trip level speeds paths\n",
138+
"# trip_speeds_uris = [f\"{trip_speeds_parent}_{date}.parquet\" for date in relevant_dates]\n",
139+
"trip_speeds_uris = [{\"date\": date, \"uri\": f\"{trip_speeds_parent}_{date}.parquet\"} for date in relevant_dates]\n",
140+
"trip_speeds_uris"
141+
]
142+
},
143+
{
144+
"cell_type": "markdown",
145+
"id": "d5895977-be56-433d-bc88-5e9385c09916",
146+
"metadata": {
147+
"jp-MarkdownHeadingCollapsed": true,
148+
"tags": []
149+
},
150+
"source": [
151+
"### validation\n",
152+
"\n",
153+
"Doesn't exactly match script aggregation but we're comfortable these are the right files. Could be revised.\n"
154+
]
155+
},
156+
{
157+
"cell_type": "code",
158+
"execution_count": null,
159+
"id": "b0ea229d-9d2e-4279-a621-3fe1338b0a3d",
160+
"metadata": {
161+
"tags": []
162+
},
163+
"outputs": [],
164+
"source": [
165+
"\n",
166+
"# # Validate that aggregated speeds match trip speeds in a majority of cases\n",
167+
"# GROUP_COLUMNS = [\"time_of_day\", \"segment_id\", \"shape_id\"]\n",
168+
"# quantile_to_speed_column = {0.2: \"p20_mph\", 0.5: \"p50_mph\", 0.8: \"p80_mph\"}\n",
169+
"# for aggregated_uri, trip_speeds_uri in zip(aggregated_uris, trip_speeds_uris):\n",
170+
"# # read aggregated and precursor data\n",
171+
"# aggregated = g.read_parquet(aggregated_uri)\n",
172+
"# trip_speeds = pd.read_parquet(trip_speeds_uri)\n",
173+
"# # reindex aggregated_speeds and group trip_speeds_grouped so they theoretically have the same data\n",
174+
"# aggregated_speeds = aggregated.set_index(GROUP_COLUMNS)[\n",
175+
"# list(quantile_to_speed_column.values())\n",
176+
"# ]\n",
177+
"# trip_speeds_grouped = trip_speeds.groupby(GROUP_COLUMNS)\n",
178+
"# trip_speeds_percentiles = pd.concat(\n",
179+
"# [\n",
180+
"# trip_speeds_grouped[\"speed_mph\"].quantile(q).rename(name).round(2)\n",
181+
"# for q, name in quantile_to_speed_column.items()\n",
182+
"# ],\n",
183+
"# axis=1,\n",
184+
"# )\n",
185+
"# # the aggregated data appears to contain a subset of the trip speeds data, so only look at that subset\n",
186+
"# trip_speeds_percentiles_subset = trip_speeds_percentiles.loc[\n",
187+
"# aggregated_speeds.index\n",
188+
"# ]\n",
189+
"# # find the rows where at least one speed doesn't match\n",
190+
"# non_equal = (trip_speeds_percentiles_subset != aggregated_speeds).any(axis=1)\n",
191+
" \n",
192+
"# print(\n",
193+
"# f\"for {aggregated_uri}, {non_equal.sum()} out of {len(aggregated_speeds.index)} speeds did not match\"\n",
194+
"# )"
195+
]
196+
},
197+
{
198+
"cell_type": "markdown",
199+
"id": "dc750c2a-d8b4-4c22-9b10-5d4a3619d8f3",
200+
"metadata": {},
201+
"source": [
202+
"## export"
203+
]
204+
},
205+
{
206+
"cell_type": "code",
207+
"execution_count": 7,
208+
"id": "9abf5609-17c4-42e0-a24d-3b10b1fbee64",
209+
"metadata": {
210+
"tags": []
211+
},
212+
"outputs": [],
213+
"source": [
214+
"# Map source uris to output uris\n",
215+
"output_aggregated_uris = {\n",
216+
" old_uri: f\"gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/{old_uri.split('/')[-1]}\"\n",
217+
" for old_uri in aggregated_uris\n",
218+
"}\n",
219+
"# output_trip_speeds_uris = {\n",
220+
"# old_uri['uri']: f\"gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/{old_uri.split('/')[-1]}\"\n",
221+
"# for old_uri in trip_speeds_uris\n",
222+
"# }"
223+
]
224+
},
225+
{
226+
"cell_type": "code",
227+
"execution_count": 13,
228+
"id": "0bbbbfcc-5e9b-4c11-8ecd-03dcea935fdb",
229+
"metadata": {
230+
"tags": []
231+
},
232+
"outputs": [],
233+
"source": [
234+
"for entry in trip_speeds_uris: entry['output_uri'] = f\"gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/{entry['uri'].split('/')[-1]}\""
235+
]
236+
},
237+
{
238+
"cell_type": "code",
239+
"execution_count": 14,
240+
"id": "5f5d3f1b-ab6f-437b-9877-dda155d50330",
241+
"metadata": {
242+
"tags": []
243+
},
244+
"outputs": [
245+
{
246+
"data": {
247+
"text/plain": [
248+
"[{'date': '2025-09-23',\n",
249+
" 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-09-23.parquet',\n",
250+
" 'output_uri': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-09-23.parquet'},\n",
251+
" {'date': '2025-09-24',\n",
252+
" 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-09-24.parquet',\n",
253+
" 'output_uri': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-09-24.parquet'},\n",
254+
" {'date': '2025-09-25',\n",
255+
" 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-09-25.parquet',\n",
256+
" 'output_uri': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-09-25.parquet'},\n",
257+
" {'date': '2025-10-14',\n",
258+
" 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-10-14.parquet',\n",
259+
" 'output_uri': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-10-14.parquet'},\n",
260+
" {'date': '2025-10-15',\n",
261+
" 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-10-15.parquet',\n",
262+
" 'output_uri': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-10-15.parquet'},\n",
263+
" {'date': '2025-10-16',\n",
264+
" 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-10-16.parquet',\n",
265+
" 'output_uri': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-10-16.parquet'}]"
266+
]
267+
},
268+
"execution_count": 14,
269+
"metadata": {},
270+
"output_type": "execute_result"
271+
}
272+
],
273+
"source": [
274+
"trip_speeds_uris"
275+
]
276+
},
277+
{
278+
"cell_type": "code",
279+
"execution_count": 15,
280+
"id": "423a4975-227d-4426-a6bf-5d57540183fd",
281+
"metadata": {
282+
"tags": []
283+
},
284+
"outputs": [],
285+
"source": [
286+
"def add_speeds_identifiers(speeds_df, analysis_date):\n",
287+
" trips = helpers.import_scheduled_trips(analysis_date, columns=['trip_instance_key', 'trip_id', 'gtfs_dataset_key'])\n",
288+
" speeds_df = (speeds_df.merge(trips, on=['trip_instance_key', 'schedule_gtfs_dataset_key'])\n",
289+
" .pipe(portfolio_utils.standardize_operator_info_for_exports, date=analysis_date)\n",
290+
" )\n",
291+
" return speeds_df"
292+
]
293+
},
294+
{
295+
"cell_type": "code",
296+
"execution_count": 16,
297+
"id": "eca3638b-eac0-47ff-a457-d6564cad7943",
298+
"metadata": {
299+
"tags": []
300+
},
301+
"outputs": [
302+
{
303+
"name": "stdout",
304+
"output_type": "stream",
305+
"text": [
306+
"gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-09-23.parquet\n",
307+
"gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-09-24.parquet\n",
308+
"gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-09-25.parquet\n",
309+
"gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-10-14.parquet\n",
310+
"gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-10-15.parquet\n",
311+
"gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-10-16.parquet\n"
312+
]
313+
}
314+
],
315+
"source": [
316+
"# Copy files from the private bucket to the public / mtc-visible bucket\n",
317+
"fs = g.gcs_filesystem\n",
318+
"# for old_uri, new_uri in output_aggregated_uris.items():\n",
319+
"# fs.copy(old_uri, new_uri)\n",
320+
"for uri_group in trip_speeds_uris:\n",
321+
" speeds_df = pd.read_parquet(uri_group['uri'])\n",
322+
" speeds_df = add_speeds_identifiers(speeds_df, analysis_date=uri_group['date'])\n",
323+
" speeds_df.to_parquet(uri_group['output_uri'])\n",
324+
" print(uri_group['output_uri'])"
325+
]
326+
}
327+
],
328+
"metadata": {
329+
"kernelspec": {
330+
"display_name": "Python 3 (ipykernel)",
331+
"language": "python",
332+
"name": "python3"
333+
},
334+
"language_info": {
335+
"codemirror_mode": {
336+
"name": "ipython",
337+
"version": 3
338+
},
339+
"file_extension": ".py",
340+
"mimetype": "text/x-python",
341+
"name": "python",
342+
"nbconvert_exporter": "python",
343+
"pygments_lexer": "ipython3",
344+
"version": "3.11.10"
345+
},
346+
"widgets": {
347+
"application/vnd.jupyter.widget-state+json": {
348+
"state": {},
349+
"version_major": 2,
350+
"version_minor": 0
351+
}
352+
}
353+
},
354+
"nbformat": 4,
355+
"nbformat_minor": 5
356+
}

0 commit comments

Comments
 (0)