|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "code", |
| 5 | + "execution_count": 2, |
| 6 | + "id": "701f2fa1-cdec-4027-a35c-22160c82869d", |
| 7 | + "metadata": { |
| 8 | + "tags": [] |
| 9 | + }, |
| 10 | + "outputs": [], |
| 11 | + "source": [ |
| 12 | + "import geopandas as gpd\n", |
| 13 | + "import pandas as pd\n", |
| 14 | + "from calitp_data_analysis.gcs_geopandas import GCSGeoPandas\n", |
| 15 | + "from shared_utils import catalog_utils, rt_dates, portfolio_utils\n", |
| 16 | + "from segment_speed_utils import helpers\n", |
| 17 | + "g = GCSGeoPandas()" |
| 18 | + ] |
| 19 | + }, |
| 20 | + { |
| 21 | + "cell_type": "markdown", |
| 22 | + "id": "45583f17-cf3c-4580-9231-9e3b410d0c75", |
| 23 | + "metadata": {}, |
| 24 | + "source": [ |
| 25 | + "# Copy speedmaps data into public bucket to support MTC" |
| 26 | + ] |
| 27 | + }, |
| 28 | + { |
| 29 | + "cell_type": "code", |
| 30 | + "execution_count": 3, |
| 31 | + "id": "58fe3aeb-5490-4744-a5b5-b315166c9c92", |
| 32 | + "metadata": { |
| 33 | + "tags": [] |
| 34 | + }, |
| 35 | + "outputs": [ |
| 36 | + { |
| 37 | + "data": { |
| 38 | + "text/plain": [ |
| 39 | + "['2025-09-23',\n", |
| 40 | + " '2025-09-24',\n", |
| 41 | + " '2025-09-25',\n", |
| 42 | + " '2025-10-14',\n", |
| 43 | + " '2025-10-15',\n", |
| 44 | + " '2025-10-16']" |
| 45 | + ] |
| 46 | + }, |
| 47 | + "execution_count": 3, |
| 48 | + "metadata": {}, |
| 49 | + "output_type": "execute_result" |
| 50 | + } |
| 51 | + ], |
| 52 | + "source": [ |
| 53 | + "relevant_dates = [\n", |
| 54 | + " rt_dates.DATES[i]\n", |
| 55 | + " for i in [\"sep2025a\", \"sep2025\", \"sep2025b\", \"oct2025a\", \"oct2025\", \"oct2025b\"]\n", |
| 56 | + "]\n", |
| 57 | + "relevant_dates" |
| 58 | + ] |
| 59 | + }, |
| 60 | + { |
| 61 | + "cell_type": "code", |
| 62 | + "execution_count": 4, |
| 63 | + "id": "698f3c19-afc0-4d32-bdfb-e934a1aebd4d", |
| 64 | + "metadata": { |
| 65 | + "tags": [] |
| 66 | + }, |
| 67 | + "outputs": [], |
| 68 | + "source": [ |
| 69 | + "gtfs_yml = catalog_utils.get_catalog(\"gtfs_analytics_data\")\n", |
| 70 | + "aggregated_parent = (\n", |
| 71 | + " f\"{gtfs_yml.speedmap_segments.dir}{gtfs_yml.speedmap_segments.segment_timeofday}\"\n", |
| 72 | + ")\n", |
| 73 | + "trip_speeds_parent = (\n", |
| 74 | + " f\"{gtfs_yml.speedmap_segments.dir}{gtfs_yml.speedmap_segments.stage4}\"\n", |
| 75 | + ")" |
| 76 | + ] |
| 77 | + }, |
| 78 | + { |
| 79 | + "cell_type": "code", |
| 80 | + "execution_count": 5, |
| 81 | + "id": "5b2dfe84-3231-41e0-8a4c-38d89c339f98", |
| 82 | + "metadata": { |
| 83 | + "tags": [] |
| 84 | + }, |
| 85 | + "outputs": [ |
| 86 | + { |
| 87 | + "data": { |
| 88 | + "text/plain": [ |
| 89 | + "['gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-09-23.parquet',\n", |
| 90 | + " 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-09-24.parquet',\n", |
| 91 | + " 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-09-25.parquet',\n", |
| 92 | + " 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-10-14.parquet',\n", |
| 93 | + " 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-10-15.parquet',\n", |
| 94 | + " 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/rollup_singleday/speeds_shape_timeofday_speedmap_segments_2025-10-16.parquet']" |
| 95 | + ] |
| 96 | + }, |
| 97 | + "execution_count": 5, |
| 98 | + "metadata": {}, |
| 99 | + "output_type": "execute_result" |
| 100 | + } |
| 101 | + ], |
| 102 | + "source": [ |
| 103 | + "# Get aggregated speedmaps segments paths\n", |
| 104 | + "aggregated_uris = [f\"{aggregated_parent}_{date}.parquet\" for date in relevant_dates]\n", |
| 105 | + "aggregated_uris" |
| 106 | + ] |
| 107 | + }, |
| 108 | + { |
| 109 | + "cell_type": "code", |
| 110 | + "execution_count": 6, |
| 111 | + "id": "66efcd91-9d18-455a-82c7-0124b17ad675", |
| 112 | + "metadata": {}, |
| 113 | + "outputs": [ |
| 114 | + { |
| 115 | + "data": { |
| 116 | + "text/plain": [ |
| 117 | + "[{'date': '2025-09-23',\n", |
| 118 | + " 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-09-23.parquet'},\n", |
| 119 | + " {'date': '2025-09-24',\n", |
| 120 | + " 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-09-24.parquet'},\n", |
| 121 | + " {'date': '2025-09-25',\n", |
| 122 | + " 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-09-25.parquet'},\n", |
| 123 | + " {'date': '2025-10-14',\n", |
| 124 | + " 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-10-14.parquet'},\n", |
| 125 | + " {'date': '2025-10-15',\n", |
| 126 | + " 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-10-15.parquet'},\n", |
| 127 | + " {'date': '2025-10-16',\n", |
| 128 | + " 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-10-16.parquet'}]" |
| 129 | + ] |
| 130 | + }, |
| 131 | + "execution_count": 6, |
| 132 | + "metadata": {}, |
| 133 | + "output_type": "execute_result" |
| 134 | + } |
| 135 | + ], |
| 136 | + "source": [ |
| 137 | + "# Get trip level speeds paths\n", |
| 138 | + "# trip_speeds_uris = [f\"{trip_speeds_parent}_{date}.parquet\" for date in relevant_dates]\n", |
| 139 | + "trip_speeds_uris = [{\"date\": date, \"uri\": f\"{trip_speeds_parent}_{date}.parquet\"} for date in relevant_dates]\n", |
| 140 | + "trip_speeds_uris" |
| 141 | + ] |
| 142 | + }, |
| 143 | + { |
| 144 | + "cell_type": "markdown", |
| 145 | + "id": "d5895977-be56-433d-bc88-5e9385c09916", |
| 146 | + "metadata": { |
| 147 | + "jp-MarkdownHeadingCollapsed": true, |
| 148 | + "tags": [] |
| 149 | + }, |
| 150 | + "source": [ |
| 151 | + "### validation\n", |
| 152 | + "\n", |
| 153 | + "Percentiles recomputed here from the trip-level speeds don't exactly match the script's aggregated values, but we're comfortable these are the right files. The check below could be revised.\n" |
| 154 | + ] |
| 155 | + }, |
| 156 | + { |
| 157 | + "cell_type": "code", |
| 158 | + "execution_count": null, |
| 159 | + "id": "b0ea229d-9d2e-4279-a621-3fe1338b0a3d", |
| 160 | + "metadata": { |
| 161 | + "tags": [] |
| 162 | + }, |
| 163 | + "outputs": [], |
| 164 | + "source": [ |
| 165 | + "\n", |
| 166 | + "# # Validate that aggregated speeds match trip speeds in a majority of cases\n", |
| 167 | + "# GROUP_COLUMNS = [\"time_of_day\", \"segment_id\", \"shape_id\"]\n", |
| 168 | + "# quantile_to_speed_column = {0.2: \"p20_mph\", 0.5: \"p50_mph\", 0.8: \"p80_mph\"}\n", |
| 169 | + "# for aggregated_uri, trip_speeds_uri in zip(aggregated_uris, trip_speeds_uris):\n", |
| 170 | + "# # read aggregated and precursor data\n", |
| 171 | + "# aggregated = g.read_parquet(aggregated_uri)\n", |
| 172 | + "# trip_speeds = pd.read_parquet(trip_speeds_uri)\n", |
| 173 | + "# # reindex aggregated_speeds and group trip_speeds_grouped so they theoretically have the same data\n", |
| 174 | + "# aggregated_speeds = aggregated.set_index(GROUP_COLUMNS)[\n", |
| 175 | + "# list(quantile_to_speed_column.values())\n", |
| 176 | + "# ]\n", |
| 177 | + "# trip_speeds_grouped = trip_speeds.groupby(GROUP_COLUMNS)\n", |
| 178 | + "# trip_speeds_percentiles = pd.concat(\n", |
| 179 | + "# [\n", |
| 180 | + "# trip_speeds_grouped[\"speed_mph\"].quantile(q).rename(name).round(2)\n", |
| 181 | + "# for q, name in quantile_to_speed_column.items()\n", |
| 182 | + "# ],\n", |
| 183 | + "# axis=1,\n", |
| 184 | + "# )\n", |
| 185 | + "# # the aggregated data appears to contain a subset of the trip speeds data, so only look at that subset\n", |
| 186 | + "# trip_speeds_percentiles_subset = trip_speeds_percentiles.loc[\n", |
| 187 | + "# aggregated_speeds.index\n", |
| 188 | + "# ]\n", |
| 189 | + "# # find the rows where at least one speed doesn't match\n", |
| 190 | + "# non_equal = (trip_speeds_percentiles_subset != aggregated_speeds).any(axis=1)\n", |
| 191 | + " \n", |
| 192 | + "# print(\n", |
| 193 | + "# f\"for {aggregated_uri}, {non_equal.sum()} out of {len(aggregated_speeds.index)} speeds did not match\"\n", |
| 194 | + "# )" |
| 195 | + ] |
| 196 | + }, |
| 197 | + { |
| 198 | + "cell_type": "markdown", |
| 199 | + "id": "dc750c2a-d8b4-4c22-9b10-5d4a3619d8f3", |
| 200 | + "metadata": {}, |
| 201 | + "source": [ |
| 202 | + "## export" |
| 203 | + ] |
| 204 | + }, |
| 205 | + { |
| 206 | + "cell_type": "code", |
| 207 | + "execution_count": 7, |
| 208 | + "id": "9abf5609-17c4-42e0-a24d-3b10b1fbee64", |
| 209 | + "metadata": { |
| 210 | + "tags": [] |
| 211 | + }, |
| 212 | + "outputs": [], |
| 213 | + "source": [ |
| 214 | + "# Map source uris to output uris\n", |
| 215 | + "output_aggregated_uris = {\n", |
| 216 | + " old_uri: f\"gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/{old_uri.split('/')[-1]}\"\n", |
| 217 | + " for old_uri in aggregated_uris\n", |
| 218 | + "}\n", |
| 219 | + "# output_trip_speeds_uris = {\n", |
| 220 | + "# old_uri['uri']: f\"gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/{old_uri.split('/')[-1]}\"\n", |
| 221 | + "# for old_uri in trip_speeds_uris\n", |
| 222 | + "# }" |
| 223 | + ] |
| 224 | + }, |
| 225 | + { |
| 226 | + "cell_type": "code", |
| 227 | + "execution_count": 13, |
| 228 | + "id": "0bbbbfcc-5e9b-4c11-8ecd-03dcea935fdb", |
| 229 | + "metadata": { |
| 230 | + "tags": [] |
| 231 | + }, |
| 232 | + "outputs": [], |
| 233 | + "source": [ |
| 234 | + "for entry in trip_speeds_uris: entry['output_uri'] = f\"gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/{entry['uri'].split('/')[-1]}\"" |
| 235 | + ] |
| 236 | + }, |
| 237 | + { |
| 238 | + "cell_type": "code", |
| 239 | + "execution_count": 14, |
| 240 | + "id": "5f5d3f1b-ab6f-437b-9877-dda155d50330", |
| 241 | + "metadata": { |
| 242 | + "tags": [] |
| 243 | + }, |
| 244 | + "outputs": [ |
| 245 | + { |
| 246 | + "data": { |
| 247 | + "text/plain": [ |
| 248 | + "[{'date': '2025-09-23',\n", |
| 249 | + " 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-09-23.parquet',\n", |
| 250 | + " 'output_uri': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-09-23.parquet'},\n", |
| 251 | + " {'date': '2025-09-24',\n", |
| 252 | + " 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-09-24.parquet',\n", |
| 253 | + " 'output_uri': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-09-24.parquet'},\n", |
| 254 | + " {'date': '2025-09-25',\n", |
| 255 | + " 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-09-25.parquet',\n", |
| 256 | + " 'output_uri': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-09-25.parquet'},\n", |
| 257 | + " {'date': '2025-10-14',\n", |
| 258 | + " 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-10-14.parquet',\n", |
| 259 | + " 'output_uri': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-10-14.parquet'},\n", |
| 260 | + " {'date': '2025-10-15',\n", |
| 261 | + " 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-10-15.parquet',\n", |
| 262 | + " 'output_uri': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-10-15.parquet'},\n", |
| 263 | + " {'date': '2025-10-16',\n", |
| 264 | + " 'uri': 'gs://calitp-analytics-data/data-analyses/rt_segment_speeds/speedmap/speeds_2025-10-16.parquet',\n", |
| 265 | + " 'output_uri': 'gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-10-16.parquet'}]" |
| 266 | + ] |
| 267 | + }, |
| 268 | + "execution_count": 14, |
| 269 | + "metadata": {}, |
| 270 | + "output_type": "execute_result" |
| 271 | + } |
| 272 | + ], |
| 273 | + "source": [ |
| 274 | + "trip_speeds_uris" |
| 275 | + ] |
| 276 | + }, |
| 277 | + { |
| 278 | + "cell_type": "code", |
| 279 | + "execution_count": 15, |
| 280 | + "id": "423a4975-227d-4426-a6bf-5d57540183fd", |
| 281 | + "metadata": { |
| 282 | + "tags": [] |
| 283 | + }, |
| 284 | + "outputs": [], |
| 285 | + "source": [ |
| 286 | + "def add_speeds_identifiers(speeds_df, analysis_date):\n", |
| 287 | + " trips = helpers.import_scheduled_trips(analysis_date, columns=['trip_instance_key', 'trip_id', 'gtfs_dataset_key'])\n", |
| 288 | + " speeds_df = (speeds_df.merge(trips, on=['trip_instance_key', 'schedule_gtfs_dataset_key'])\n", |
| 289 | + " .pipe(portfolio_utils.standardize_operator_info_for_exports, date=analysis_date)\n", |
| 290 | + " )\n", |
| 291 | + " return speeds_df" |
| 292 | + ] |
| 293 | + }, |
| 294 | + { |
| 295 | + "cell_type": "code", |
| 296 | + "execution_count": 16, |
| 297 | + "id": "eca3638b-eac0-47ff-a457-d6564cad7943", |
| 298 | + "metadata": { |
| 299 | + "tags": [] |
| 300 | + }, |
| 301 | + "outputs": [ |
| 302 | + { |
| 303 | + "name": "stdout", |
| 304 | + "output_type": "stream", |
| 305 | + "text": [ |
| 306 | + "gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-09-23.parquet\n", |
| 307 | + "gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-09-24.parquet\n", |
| 308 | + "gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-09-25.parquet\n", |
| 309 | + "gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-10-14.parquet\n", |
| 310 | + "gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-10-15.parquet\n", |
| 311 | + "gs://calitp-publish-data-analysis/mtc_collab_2025/speeds/speeds_2025-10-16.parquet\n" |
| 312 | + ] |
| 313 | + } |
| 314 | + ], |
| 315 | + "source": [ |
| 316 | + "# Read trip-level speeds from the private bucket, add trip and operator identifiers, and write them to the public / MTC-visible bucket\n", |
| 317 | + "fs = g.gcs_filesystem\n", |
| 318 | + "# for old_uri, new_uri in output_aggregated_uris.items():\n", |
| 319 | + "# fs.copy(old_uri, new_uri)\n", |
| 320 | + "for uri_group in trip_speeds_uris:\n", |
| 321 | + " speeds_df = pd.read_parquet(uri_group['uri'])\n", |
| 322 | + " speeds_df = add_speeds_identifiers(speeds_df, analysis_date=uri_group['date'])\n", |
| 323 | + " speeds_df.to_parquet(uri_group['output_uri'])\n", |
| 324 | + " print(uri_group['output_uri'])" |
| 325 | + ] |
| 326 | + } |
| 327 | + ], |
| 328 | + "metadata": { |
| 329 | + "kernelspec": { |
| 330 | + "display_name": "Python 3 (ipykernel)", |
| 331 | + "language": "python", |
| 332 | + "name": "python3" |
| 333 | + }, |
| 334 | + "language_info": { |
| 335 | + "codemirror_mode": { |
| 336 | + "name": "ipython", |
| 337 | + "version": 3 |
| 338 | + }, |
| 339 | + "file_extension": ".py", |
| 340 | + "mimetype": "text/x-python", |
| 341 | + "name": "python", |
| 342 | + "nbconvert_exporter": "python", |
| 343 | + "pygments_lexer": "ipython3", |
| 344 | + "version": "3.11.10" |
| 345 | + }, |
| 346 | + "widgets": { |
| 347 | + "application/vnd.jupyter.widget-state+json": { |
| 348 | + "state": {}, |
| 349 | + "version_major": 2, |
| 350 | + "version_minor": 0 |
| 351 | + } |
| 352 | + } |
| 353 | + }, |
| 354 | + "nbformat": 4, |
| 355 | + "nbformat_minor": 5 |
| 356 | +} |
0 commit comments