Updated Random Forest model code and results
1 parent 65e8796 · commit 6ae28d2
Showing 30 changed files with 525,163 additions and 349,344 deletions.
@@ -0,0 +1,124 @@
# import packages
import pandas as pd
import os
import numpy as np
import datetime
import csv

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Path to the folder containing the chunked data
folder_path = "/gcs/gnn_chapter/GeoDS_data"

# List of dataframes to store the data from each file
chunks = []
# Loop through each file in the folder
for filename in os.listdir(folder_path):
    # Check if the file is a CSV file
    if filename.endswith(".csv"):
        # Load the file into a dataframe
        filepath = os.path.join(folder_path, filename)
        df = pd.read_csv(filepath)
        # Append the dataframe to the list
        chunks.append(df)

data_out = pd.DataFrame()
best_params = []

# Create lagged versions of a variable within each origin-destination pair
def generate_lagged_columns(df, variable, lags):
    for i in range(1, lags + 1):
        lagged_var_name = f'lag{i}_{variable}'
        df[lagged_var_name] = df.groupby(['origin', 'destination'])[variable].shift(i)
    return df

for chunk in chunks:

    regr = RandomForestRegressor(random_state=4000, n_jobs=-1, verbose=0)

    tune_grid = {"max_depth": [14], "max_features": [30]}  # full grid: {"max_depth": [10, 12, 14, 18, 22, 26, 30], "max_features": [1, 2, 4, 8, 12, 16, 25, 30]}

    rf_random = GridSearchCV(
        regr,
        tune_grid,
        cv=5,
        scoring="neg_root_mean_squared_error",
        verbose=3,
    )

    data = chunk.copy()

    # Remove the comma separators from the area columns and convert to numeric
    data['Total_Area_o'] = data['Total_Area_o'].str.replace(r',', r'', regex=True)
    data['Total_Area_o'] = pd.to_numeric(data['Total_Area_o'])
    data['Total_Area_d'] = data['Total_Area_d'].str.replace(r',', r'', regex=True)
    data['Total_Area_d'] = pd.to_numeric(data['Total_Area_d'])

    # Drop date
    data = data.drop('start_date', axis=1)

    # Lag target
    lags = 3
    data = generate_lagged_columns(data, 'pop_flows', lags)

    # Shift target forward so each row predicts the next period's flows
    data['pop_flows_target'] = data.groupby(['origin', 'destination'])['pop_flows'].shift(-1)

    # Keep the last year for out-of-sample prediction
    X_predic = data[data['Timeline'] == max(data['Timeline'])].drop(['pop_flows_target'], axis=1)

    # Drop rows with NaNs introduced by the lags and the target shift
    data.dropna(inplace=True)

    # Run RF with 5-fold CV + grid search
    X = data.drop(['pop_flows_target'], axis=1)
    y = data['pop_flows_target']

    # Identifier columns excluded from the feature matrix
    columns_to_exclude = ['origin', 'destination', 'Timeline']

    rf_random.fit(X.drop(columns=columns_to_exclude), y)
    y_predic = rf_random.predict(X_predic.drop(columns=columns_to_exclude))

    X_predic['prediction'] = y_predic
    data_out = pd.concat([data_out, X_predic])

    best_params.append(rf_random.best_params_)

# Get the current date and time
current_time = datetime.datetime.now()

# Format the timestamp as desired
timestamp = current_time.strftime("%Y-%m-%d_%H-%M")

# Use the timestamp when saving the DataFrame
filename = f"RF_GeoDS_{timestamp}.csv"

# Save results
results = pd.DataFrame({
    'year': data_out['Timeline'] + 1,
    'origin': data_out['origin'],
    'destination': data_out['destination'],
    'prediction': data_out['prediction']
})

results.to_csv('/gcs/gnn_chapter/GeoDS_results/' + filename, index=False)

field_names = best_params[0].keys()
best_param_filename = f"RF_GeoDS_best_params_{timestamp}.csv"

# Open the CSV file in write mode
with open('/gcs/gnn_chapter/GeoDS_results/' + best_param_filename, 'w', newline='') as file:
    # Create a CSV writer
    writer = csv.DictWriter(file, fieldnames=field_names)

    # Write the header
    writer.writeheader()

    # Write the data from the list of dictionaries
    for row in best_params:
        writer.writerow(row)
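A quick illustration of what generate_lagged_columns produces (not part of the committed script; the values below are made up): within each origin-destination pair, lag1_pop_flows holds the previous period's pop_flows, so the earliest periods get NaN and are later removed by data.dropna().

import pandas as pd

# Toy flows for one origin-destination pair in each direction (illustrative values only)
toy = pd.DataFrame({
    'origin':      ['A', 'A', 'A', 'B', 'B', 'B'],
    'destination': ['B', 'B', 'B', 'A', 'A', 'A'],
    'Timeline':    [2018, 2019, 2020, 2018, 2019, 2020],
    'pop_flows':   [100, 120, 130, 80, 90, 95],
})

# Same groupby-shift logic as generate_lagged_columns above, with a single lag
toy['lag1_pop_flows'] = toy.groupby(['origin', 'destination'])['pop_flows'].shift(1)
print(toy)
# lag1_pop_flows is NaN, 100, 120 for A->B and NaN, 80, 90 for B->A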
Modelling_and_results/Random_forest/RF_GeoDS_limited.py
114 changes: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
# import packages
import pandas as pd
import os
import numpy as np
import datetime

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Path to the folder containing the chunked data
folder_path = "/gcs/gnn_chapter/GeoDS_data"

# List of dataframes to store the data from each file
chunks = []
# Loop through each file in the folder
for filename in os.listdir(folder_path):
    # Check if the file is a CSV file
    if filename.endswith(".csv"):
        # Load the file into a dataframe
        filepath = os.path.join(folder_path, filename)
        df = pd.read_csv(filepath)
        # Append the dataframe to the list
        chunks.append(df)

data_out = pd.DataFrame()
best_params = []

# Create lagged versions of a variable within each origin-destination pair
def generate_lagged_columns(df, variable, lags):
    for i in range(1, lags + 1):
        lagged_var_name = f'lag{i}_{variable}'
        df[lagged_var_name] = df.groupby(['origin', 'destination'])[variable].shift(i)
    return df

for chunk in chunks:

    regr = RandomForestRegressor(random_state=4000, n_jobs=-1, verbose=0)

    tune_grid = {"max_depth": [14], "max_features": [30]}

    rf_random = GridSearchCV(
        regr,
        tune_grid,
        cv=5,
        scoring="neg_root_mean_squared_error",
        verbose=3,
    )

    data = chunk.copy()

    # Remove the comma separators from the area columns and convert to numeric
    data['Total_Area_o'] = data['Total_Area_o'].str.replace(r',', r'', regex=True)
    data['Total_Area_o'] = pd.to_numeric(data['Total_Area_o'])
    data['Total_Area_d'] = data['Total_Area_d'].str.replace(r',', r'', regex=True)
    data['Total_Area_d'] = pd.to_numeric(data['Total_Area_d'])

    # Drop date
    data = data.drop('start_date', axis=1)

    # Lag target
    lags = 3
    data = generate_lagged_columns(data, 'pop_flows', lags)

    # Shift target forward so each row predicts the next period's flows
    data['pop_flows_target'] = data.groupby(['origin', 'destination'])['pop_flows'].shift(-1)

    # Keep the last year for out-of-sample prediction
    X_predic = data[data['Timeline'] == max(data['Timeline'])].drop(['pop_flows_target'], axis=1)

    # Drop rows with NaNs introduced by the lags and the target shift
    data.dropna(inplace=True)

    # Run RF with 5-fold CV + grid search
    X = data.drop(['pop_flows_target'], axis=1)
    y = data['pop_flows_target']

    # Use the specification from gravity
    X = X[['origin', 'destination', 'Timeline', 'population_2019_o', 'population_2019_d', 'pop_flows', 'distances', 'neighbouring', 'all_pop_flows_to_d', 'o_pop_flows_to_all']]
    X_predic = X_predic[['origin', 'destination', 'Timeline', 'population_2019_o', 'population_2019_d', 'pop_flows', 'distances', 'neighbouring', 'all_pop_flows_to_d', 'o_pop_flows_to_all']]

    # Identifier columns excluded from the feature matrix
    columns_to_exclude = ['origin', 'destination', 'Timeline']

    rf_random.fit(X.drop(columns=columns_to_exclude), y)
    y_predic = rf_random.predict(X_predic.drop(columns=columns_to_exclude))

    X_predic['prediction'] = y_predic
    data_out = pd.concat([data_out, X_predic])

    best_params.append(rf_random.best_params_)

# Get the current date and time
current_time = datetime.datetime.now()

# Format the timestamp as desired
timestamp = current_time.strftime("%Y-%m-%d_%H-%M")

# Use the timestamp when saving the DataFrame
filename = f"RF_GeoDS_limited_{timestamp}.csv"

# Save results
results = pd.DataFrame({
    'year': data_out['Timeline'] + 1,
    'origin': data_out['origin'],
    'destination': data_out['destination'],
    'prediction': data_out['prediction']
})

results.to_csv('/gcs/gnn_chapter/GeoDS_results/' + filename, index=False)
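One thing the limited script does not write out is the cross-validated error itself. If you want to compare this gravity-style specification against the full feature set, the fitted GridSearchCV object already carries that score; a minimal sketch, assuming rf_random has just been fitted inside the loop as above:

# best_score_ is the mean cross-validated score of the best parameter combination;
# it is negative RMSE here because scoring="neg_root_mean_squared_error"
cv_rmse = -rf_random.best_score_
print(f"best params: {rf_random.best_params_}, CV RMSE: {cv_rmse:.2f}")

# cv_results_ holds one row per parameter combination when the grid is larger
import pandas as pd
cv_table = pd.DataFrame(rf_random.cv_results_)[['params', 'mean_test_score', 'std_test_score']]
print(cv_table)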
Modelling_and_results/Random_forest/RF_GeoDS_limited_lag.py
112 changes: 112 additions & 0 deletions
@@ -0,0 +1,112 @@
# import packages
import pandas as pd
import os
import numpy as np
import datetime

from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Path to the folder containing the chunked data
folder_path = "/gcs/gnn_chapter/GeoDS_data"

# List of dataframes to store the data from each file
chunks = []
# Loop through each file in the folder
for filename in os.listdir(folder_path):
    # Check if the file is a CSV file
    if filename.endswith(".csv"):
        # Load the file into a dataframe
        filepath = os.path.join(folder_path, filename)
        df = pd.read_csv(filepath)
        # Append the dataframe to the list
        chunks.append(df)

data_out = pd.DataFrame()
best_params = []

# Create lagged versions of a variable within each origin-destination pair
def generate_lagged_columns(df, variable, lags):
    for i in range(1, lags + 1):
        lagged_var_name = f'lag{i}_{variable}'
        df[lagged_var_name] = df.groupby(['origin', 'destination'])[variable].shift(i)
    return df

for chunk in chunks:

    regr = RandomForestRegressor(random_state=4000, n_jobs=-1, verbose=0)

    tune_grid = {"max_depth": [14], "max_features": [30]}

    rf_random = GridSearchCV(
        regr,
        tune_grid,
        cv=5,
        scoring="neg_root_mean_squared_error",
        verbose=3,
    )

    data = chunk.copy()

    # Remove the comma separators from the area columns and convert to numeric
    data['Total_Area_o'] = data['Total_Area_o'].str.replace(r',', r'', regex=True)
    data['Total_Area_o'] = pd.to_numeric(data['Total_Area_o'])
    data['Total_Area_d'] = data['Total_Area_d'].str.replace(r',', r'', regex=True)
    data['Total_Area_d'] = pd.to_numeric(data['Total_Area_d'])

    # Drop date
    data = data.drop('start_date', axis=1)

    # Lag target
    lags = 3
    data = generate_lagged_columns(data, 'pop_flows', lags)

    # Shift target forward so each row predicts the next period's flows
    data['pop_flows_target'] = data.groupby(['origin', 'destination'])['pop_flows'].shift(-1)

    # Keep the last year for out-of-sample prediction
    X_predic = data[data['Timeline'] == max(data['Timeline'])].drop(['pop_flows_target'], axis=1)

    # Drop rows with NaNs introduced by the lags and the target shift
    data.dropna(inplace=True)

    # Run RF with 5-fold CV + grid search
    X = data.drop(['pop_flows_target'], axis=1)
    y = data['pop_flows_target']

    # Use the specification from gravity, plus the lagged flows
    X = X[['origin', 'destination', 'Timeline', 'population_2019_o', 'population_2019_d', 'pop_flows', 'distances', 'neighbouring', 'all_pop_flows_to_d', 'o_pop_flows_to_all', 'lag1_pop_flows', 'lag2_pop_flows', 'lag3_pop_flows']]
    X_predic = X_predic[['origin', 'destination', 'Timeline', 'population_2019_o', 'population_2019_d', 'pop_flows', 'distances', 'neighbouring', 'all_pop_flows_to_d', 'o_pop_flows_to_all', 'lag1_pop_flows', 'lag2_pop_flows', 'lag3_pop_flows']]

    # Identifier columns excluded from the feature matrix
    columns_to_exclude = ['origin', 'destination', 'Timeline']

    rf_random.fit(X.drop(columns=columns_to_exclude), y)
    y_predic = rf_random.predict(X_predic.drop(columns=columns_to_exclude))

    X_predic['prediction'] = y_predic
    data_out = pd.concat([data_out, X_predic])

    best_params.append(rf_random.best_params_)

# Get the current date and time
current_time = datetime.datetime.now()

# Format the timestamp as desired
timestamp = current_time.strftime("%Y-%m-%d_%H-%M")

# Use the timestamp when saving the DataFrame
filename = f"RF_GeoDS_limited_lags_{timestamp}.csv"

# Save results
results = pd.DataFrame({
    'year': data_out['Timeline'] + 1,
    'origin': data_out['origin'],
    'destination': data_out['destination'],
    'prediction': data_out['prediction']
})

results.to_csv('/gcs/gnn_chapter/GeoDS_results/' + filename, index=False)
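Not part of the committed scripts, but a natural follow-up to the lagged specification is to check how much weight the forest puts on the lagged flows versus the gravity covariates. A minimal sketch using scikit-learn's impurity-based feature importances, assuming rf_random, X and columns_to_exclude are still in scope from the last chunk of the loop:

# best_estimator_ is the RandomForestRegressor refit on the full training data (refit=True by default)
importances = rf_random.best_estimator_.feature_importances_
feature_names = X.drop(columns=columns_to_exclude).columns

# Print features from most to least important
for name, imp in sorted(zip(feature_names, importances), key=lambda pair: -pair[1]):
    print(f"{name}: {imp:.3f}")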