Skip to content

Commit

Permalink
feat: initial batching of travel times
Browse files Browse the repository at this point in the history
  • Loading branch information
ethan-moss committed Oct 19, 2023
1 parent 6f9ed81 commit 79d39c1
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 0 deletions.
78 changes: 78 additions & 0 deletions notebooks/metrics/metrics_experiments.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
# %% [markdown] noqa: D212, D400, D415
"""
# Metrics Experiments
A notebook developed for experimenting with different approaches to calculating
transport performance metrics.
## Preamble
Call in script wide imports and the configuration information.
"""

# %%
import os
import pandas as pd

from pyprojroot import here
from tqdm import tqdm

from transport_performance.utils.defence import (
_check_parent_dir_exists,
)

# %%
# name of area and source of metrics inputs
AREA_NAME = "newport"  # NOTE(review): hard-coded to the Newport bus example
# resolve the experiment input directory relative to the project root
metrics_input_dir = here(
    f"data/processed/analyse_network/newport_e2e/experiments/{AREA_NAME}"
)

# %% [markdown] noqa: D212, D400, D415
"""
## Preprocessing Inputs
This section looks to preprocess the inputs needed of a `metrics` module. It
takes an OD `r5py` result (in this case the Newport bus example), and converts
it to a collection of parquet files (as per the output `analyse_network`).
These files can then be used to experiment with different python modules when
calculating the transport performance.
> Note: this section only needs to be run as a 'one-off'.
"""

# %%
# outputs from the analyse_network stage, to use during the experiment
# (pickled OD travel-time DataFrame produced by the e2e Newport run)
ANALYSE_NETWORK_OUTPUTS = here(
    "data/processed/analyse_network/newport_e2e/travel_times.pkl"
)
# column whose unique values define the batches (one parquet file per value)
BATCH_BY_COL = "from_id"

# %%
# read in the travel times
# NOTE(review): pd.read_pickle can execute arbitrary code from the file —
# acceptable here only because the pickle is produced by our own pipeline
travel_times = pd.read_pickle(ANALYSE_NETWORK_OUTPUTS)
travel_times.head()  # notebook cell: display a preview of the data

# %%
# batch travel_times into individual parquet files, one per unique value of
# BATCH_BY_COL (e.g. one file per origin "from_id")
ids = travel_times[BATCH_BY_COL].unique()

# create the parent dir if it doesn't exist - dummy needed to create parent dir
_check_parent_dir_exists(
    os.path.join(metrics_input_dir, "dummy.txt"),
    "metrics_input_dir",
    create=True,
)

# group once instead of re-filtering the whole frame for every id — the
# original boolean-mask-per-id approach scans all rows for each unique id
# (O(n_rows * n_ids)); groupby makes a single pass. Loop variable renamed
# from `id`, which shadowed the builtin.
for batch_id, batch_df in tqdm(
    travel_times.groupby(BATCH_BY_COL), total=len(ids)
):

    # create the output filepath for this batch
    batch_filepath = os.path.join(
        metrics_input_dir, f"{AREA_NAME}_id{batch_id}.parquet"
    )

    # create batched parquet file
    batch_df.to_parquet(batch_filepath)

# %%
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,6 @@ kaleido
numpy>=1.25.0 # test suite will fail if user installed lower than this
sphinx
sphinx-rtd-theme
pyarrow
tqdm
-e .

0 comments on commit 79d39c1

Please sign in to comment.