Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: support for any EMC2 HTS #292

Open
wants to merge 2 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,11 @@ These steps are described in the following documents:
- [How to run a MATSim simulation of Île-de-France](docs/simulation.md)

Furthermore, we provide documentation on how to make use of the code to create
popuations and run simulations of other places in France. While these are
examples, the code can be adapted to any other scenarios as well:
popuations and run simulations of other places in France.

Documentation for any region: [Generic regions](docs/cases/generic.md)

Documentation for specific regions:
- [Toulouse and Occitanie region](docs/cases/toulouse.md)
- [Lyon and neighboring departments](docs/cases/lyon.md)
- [Nantes and Loire Atlantique](docs/cases/nantes.md)
Expand Down
224 changes: 224 additions & 0 deletions data/hts/emc2/cleaned.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
import pandas as pd
import numpy as np
import data.hts.hts as hts

"""
This stage cleans the generic EMC2.
"""


def configure(context):
context.stage("data.hts.emc2.raw")


PURPOSE_MAP = {
"home": [1, 2],
"work": [11, 12, 13, 14, 81],
"education": [21, 22, 23, 24, 25, 26, 27, 28, 29, 96, 97],
"shop": [30, 31, 32, 33, 34, 35, 82, 98],
"leisure": [51, 52, 53, 54],
"other": [41, 42, 43, 61, 62, 63, 64, 71, 72, 73, 74, 91],
}

MODES_MAP = {
"car": [13, 15, 19, 21, 81],
"car_passenger": [14, 16, 20, 22, 82],
"pt": [31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 51, 52, 53, 54, 61, 71, 91, 92, 94, 95],
"bike": [10, 11, 17, 12, 18, 93, 96, 97],
"walk": [1, 2], # Actually, 2 is not really explained, but we assume it is walk
}


def execute(context):
df_households, df_persons, df_trips, df_spatial = context.stage("data.hts.emc2.raw")

# Merge departement into households
df_spatial["departement_id"] = df_spatial["INSEE"].str[:2]
df_spatial = df_spatial[["ZF", "departement_id"]]

df_households = pd.merge(df_households, df_spatial, left_on="ZFM", right_on="ZF", how="left")
df_households["departement_id"] = df_households["departement_id"].fillna("unknown")

# Transform original IDs to integer (they are hierarchichal)
df_households["edgt_household_id"] = (
df_households["ZFM"].astype(str) + df_households["ECH"] + df_households["METH"]
).astype(int)
df_persons["edgt_person_id"] = df_persons["PER"].astype(int)
df_persons["edgt_household_id"] = (
df_persons["ZFP"].astype(str) + df_persons["ECH"] + df_persons["PMET"]
).astype(int)
df_trips["edgt_person_id"] = df_trips["PER"].astype(int)
df_trips["edgt_household_id"] = (
df_trips["ZFD"].astype(str) + df_trips["ECH"] + df_trips["DMET"]
).astype(int)
df_trips["edgt_trip_id"] = df_trips["NDEP"].astype(int)

# Construct new IDs for households, persons and trips (which are unique globally)
df_households["household_id"] = np.arange(len(df_households))

df_persons = pd.merge(
df_persons,
df_households[["edgt_household_id", "household_id", "departement_id"]],
on="edgt_household_id",
).sort_values(by=["household_id", "edgt_person_id"])
df_persons["person_id"] = np.arange(len(df_persons))

df_trips = pd.merge(
df_trips,
df_persons[["edgt_person_id", "edgt_household_id", "person_id", "household_id"]],
on=["edgt_person_id", "edgt_household_id"],
).sort_values(by=["household_id", "person_id", "edgt_trip_id"])
df_trips["trip_id"] = np.arange(len(df_trips))

# Trip flags
df_trips = hts.compute_first_last(df_trips)

# Weight
df_persons["person_weight"] = df_persons["COEP"].astype(float)
df_households["household_weight"] = df_households["COE0"].astype(float)

# Clean age
df_persons["age"] = df_persons["P4"].astype(int)

# Clean sex
df_persons.loc[df_persons["P2"] == 1, "sex"] = "male"
df_persons.loc[df_persons["P2"] == 2, "sex"] = "female"
df_persons["sex"] = df_persons["sex"].astype("category")

# Household size
df_size = df_persons.groupby("household_id").size().reset_index(name="household_size")
df_households = pd.merge(df_households, df_size, on="household_id")

# Clean departement
df_trips = pd.merge(
df_trips,
df_spatial.rename(columns={"ZF": "D3", "departement_id": "origin_departement_id"}),
on="D3",
how="left",
)

df_trips = pd.merge(
df_trips,
df_spatial.rename(columns={"ZF": "D7", "departement_id": "destination_departement_id"}),
on="D7",
how="left",
)

df_trips["origin_departement_id"] = df_trips["origin_departement_id"].fillna("unknown")
df_trips["destination_departement_id"] = df_trips["destination_departement_id"].fillna(
"unknown"
)

df_households["departement_id"] = df_households["departement_id"].astype("category")
df_persons["departement_id"] = df_persons["departement_id"].astype("category")
df_trips["origin_departement_id"] = df_trips["origin_departement_id"].astype("category")
df_trips["destination_departement_id"] = df_trips["destination_departement_id"].astype(
"category"
)

# Clean employment
df_persons["employed"] = df_persons["P9"].isin(["1", "2"])

# Studies
df_persons["studies"] = df_persons["P9"].isin(["3", "4", "5"])

# Number of vehicles
df_households["number_of_vehicles"] = df_households["M6"] + df_households["M14"]
df_households["number_of_vehicles"] = df_households["number_of_vehicles"].astype(int)
df_households["number_of_bikes"] = df_households["M21"].astype(int)

# License
df_persons["has_license"] = df_persons["P7"] == "1"

# Has subscription
df_persons["has_pt_subscription"] = df_persons["P12"].isin(["1", "2", "3", "5", "6"])

# Survey respondents
# PENQ 1 : fully awnsered the travel questionary section, having a chain or non-movers
# PENQ 2 : nonrespondent of travel questionary section
df_persons["PENQ"] = df_persons["PENQ"].fillna("2").astype("int")
df_persons.loc[df_persons["PENQ"] == 1, "travel_respondent"] = True
df_persons.loc[df_persons["PENQ"] == 2, "travel_respondent"] = False

assert np.count_nonzero(df_persons["travel_respondent"].isna()) == 0

# Trip purpose
df_trips["following_purpose"] = "invalid"
df_trips["preceding_purpose"] = "invalid"

for purpose, values in PURPOSE_MAP.items():
df_trips.loc[df_trips["D5A"].isin(values), "following_purpose"] = purpose
df_trips.loc[df_trips["D2A"].isin(values), "preceding_purpose"] = purpose

assert np.count_nonzero(df_trips["following_purpose"] == "invalid") == 0
assert np.count_nonzero(df_trips["preceding_purpose"] == "invalid") == 0

df_trips["following_purpose"] = df_trips["following_purpose"].astype("category")
df_trips["preceding_purpose"] = df_trips["preceding_purpose"].astype("category")

# Trip mode
df_trips["mode"] = "invalid"
for mode, values in MODES_MAP.items():
df_trips.loc[df_trips["MODP"].isin(values), "mode"] = mode

assert np.count_nonzero(df_trips["mode"] == "invalid") == 0
df_trips["mode"] = df_trips["mode"].astype("category")

# Further trip attributes
df_trips["euclidean_distance"] = df_trips["D11"]
df_trips["routed_distance"] = df_trips["D12"]

# Trip times
df_trips["departure_time"] = 3600.0 * (df_trips["D4"] // 100) # hour
df_trips["departure_time"] += 60.0 * (df_trips["D4"] % 100) # minute

df_trips["arrival_time"] = 3600.0 * (df_trips["D8"] // 100) # hour
df_trips["arrival_time"] += 60.0 * (df_trips["D8"] % 100) # minute

df_trips = df_trips.sort_values(by=["household_id", "person_id", "trip_id"])
df_trips = hts.fix_trip_times(df_trips)

# Durations
df_trips["trip_duration"] = df_trips["arrival_time"] - df_trips["departure_time"]
hts.compute_activity_duration(df_trips)

# Add weight to trips
df_trips = pd.merge(
df_trips, df_persons[["person_id", "COE1"]], on="person_id", how="left"
).rename(columns={"COE1": "trip_weight"})
df_persons["trip_weight"] = df_persons["COE1"]

# Chain length
df_count = df_trips.groupby("person_id").size().reset_index(name="number_of_trips")
# People with at least one trip (number_of_trips > 0)
df_persons = pd.merge(df_persons, df_count, on="person_id", how="left")
# People that answered the travel questionary section but stayed at home (number_of_trips = 0)
df_persons.loc[
(df_persons["travel_respondent"] == True) & (df_persons["number_of_trips"].isna()),
"number_of_trips",
] = 0
# Nonrespondent of travel questionary section (number_of_trips = -1)
df_persons["number_of_trips"] = df_persons["number_of_trips"].fillna(-1).astype(int)

# Passenger attribute
df_persons["is_passenger"] = df_persons["person_id"].isin(
df_trips[df_trips["mode"] == "car_passenger"]["person_id"].unique()
)

# Calculate consumption units
hts.check_household_size(df_households, df_persons)
df_households = pd.merge(
df_households, hts.calculate_consumption_units(df_persons), on="household_id"
)

# Socioprofessional class
df_persons["socioprofessional_class"] = df_persons["PCSC"].fillna(8).astype(int)

# Check departure and arrival times
assert np.count_nonzero(df_trips["departure_time"].isna()) == 0
assert np.count_nonzero(df_trips["arrival_time"].isna()) == 0

# Fix activity types (because of inconsistent EGT data and removing in the timing fixing step)
hts.fix_activity_types(df_trips)

return df_households, df_persons, df_trips
49 changes: 49 additions & 0 deletions data/hts/emc2/filtered.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import data.hts.hts as hts

"""
This stage filters out observations which live or work outside of the area.
"""


def configure(context):
context.stage("data.hts.emc2.cleaned")

context.stage("data.spatial.codes")

context.config("filter_hts", True)


def execute(context):
filter_edgt = context.config("filter_hts")
df_codes = context.stage("data.spatial.codes")
df_households, df_persons, df_trips = context.stage("data.hts.emc2.cleaned")

if filter_edgt:
# Filter for non-residents
requested_departments = df_codes["departement_id"].unique()
f = df_persons["departement_id"].astype(str).isin(requested_departments)
df_persons = df_persons[f]

# Filter for people going outside of the area
remove_ids = set(
df_trips[
~df_trips["origin_departement_id"].astype(str).isin(requested_departments)
| ~df_trips["destination_departement_id"].astype(str).isin(requested_departments)
]["person_id"].unique()
)

df_persons = df_persons[~df_persons["person_id"].isin(remove_ids)]

# Only keep trips and households that still have a person
df_trips = df_trips[df_trips["person_id"].isin(df_persons["person_id"].unique())]
df_households = df_households[
df_households["household_id"].isin(df_persons["household_id"])
]

# Finish up
df_households = df_households[hts.HOUSEHOLD_COLUMNS]
df_persons = df_persons[hts.PERSON_COLUMNS]
df_trips = df_trips[hts.TRIP_COLUMNS + ["routed_distance", "euclidean_distance"]]

hts.check(df_households, df_persons, df_trips)
return df_households, df_persons, df_trips
Loading