Model training inference v2 #146

Open · wants to merge 2 commits into main
Changes from 1 commit
changes for model training and inference
orianac committed Feb 8, 2022

Verified: this commit was created on GitHub.com and signed with GitHub's verified signature.
commit ce8902d9004bb1bcf891bd19405d6602b548b44e
4 changes: 2 additions & 2 deletions carbonplan_trace/v1/load.py
@@ -229,13 +229,13 @@ def biomass(tiles, year):


def training(realm, y0=2003, y1=2010, reload=False, access_key_id=None, secret_access_key=None):
output_filename = f's3://carbonplan-climatetrace/v1/training/{realm}/all_data.parquet'
output_filename = f's3://carbonplan-climatetrace/v2/training/{realm}/all_data.parquet'
if fs.exists(output_filename) and not reload:
return pd.read_parquet(output_filename)
else:
output = []
for yr in range(y0, y1):
folder_name = f's3://carbonplan-climatetrace/v1/training/{realm}/{yr}/'
folder_name = f's3://carbonplan-climatetrace/v2/training/{realm}/{yr}/'
files = fs.ls(folder_name)
for f in files:
output.append(pd.read_parquet(f's3://{f}'))
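For context (not part of this diff), a minimal sketch of how the updated loader might be called; it assumes AWS credentials are available to s3fs and that the v2 training files have already been written:

```python
# Hypothetical usage of the loader shown above (assumes s3fs/AWS credentials
# are configured and that the v2 training parquet files exist).
from carbonplan_trace.v1.load import training

# Reads s3://carbonplan-climatetrace/v2/training/australia/all_data.parquet if it
# already exists; otherwise it concatenates the per-year files under
# .../v2/training/australia/{year}/. Note that range(y0, y1) excludes y1, so the
# defaults cover 2003 through 2009.
df = training("australia", y0=2003, y1=2010, reload=False)
print(df.shape)
```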
221 changes: 188 additions & 33 deletions notebooks/processing/inference.ipynb
@@ -50,6 +50,17 @@
"from carbonplan_trace.v1 import utils\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pyproj\n",
"\n",
"pyproj.__version__"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -88,7 +99,8 @@
" # spin up local cluster. must be on big enough machine\n",
" from dask.distributed import Client\n",
"\n",
" client = Client(n_workers=2, threads_per_worker=15, resources={\"workertoken\": 1})\n",
" # when very very huge use 8,8\n",
" client = Client(n_workers=8, threads_per_worker=8, resources={\"workertoken\": 1})\n",
" client\n",
"else:\n",
" gateway = Gateway()\n",
@@ -107,15 +119,6 @@
"# cluster.scale(100)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"cluster"
]
},
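As an aside on the local-cluster setup above: the "workertoken" resource is what limits each worker to one prediction task at a time, regardless of thread count. A standalone sketch of the pattern (illustrative only, not from this PR):

```python
# Illustrative sketch of the dask "resources" throttling pattern used above:
# each worker advertises workertoken=1 and each task requests workertoken=1,
# so at most n_workers tasks run concurrently even with many threads per worker.
import dask
from dask.distributed import Client

client = Client(n_workers=2, threads_per_worker=4, resources={"workertoken": 1})

@dask.delayed
def work(i):
    return i * i

futures = [client.compute(work(i), resources={"workertoken": 1}) for i in range(8)]
print(client.gather(futures))  # at most two of these ran at any one time
```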
{
"cell_type": "code",
"execution_count": null,
@@ -145,6 +148,17 @@
" cluster.shutdown()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"shutdown_cluster(\"local\")"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -171,6 +185,20 @@
"tiles and write it out to a mapper with those specifications.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ul_lats = [\"10S\", \"20S\", \"30S\"]\n",
"ul_lons = [f\"{lon}E\" for lon in np.arange(110, 151, 10)]\n",
"lat_lon_tags = []\n",
"for ul_lat in ul_lats:\n",
" for ul_lon in ul_lons:\n",
" lat_lon_tags.append((ul_lat, ul_lon))"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -182,11 +210,12 @@
" \"palladium/production/s3fs-public/atoms/files/\"\n",
" \"WRS2_descending_0.zip\"\n",
")\n",
"bucket = \"s3://carbonplan-climatetrace/v1\"\n",
"bucket = \"s3://carbonplan-climatetrace/v2.1\"\n",
"\n",
"biomass_folder = \"s3://carbonplan-climatetrace/intermediate/ecoregions_mask/\"\n",
"biomass_files = fs.ls(biomass_folder)\n",
"lat_lon_tags = [utils.get_lat_lon_tags_from_tile_path(fp) for fp in biomass_files]\n",
"# biomass_folder = \"s3://carbonplan-climatetrace/intermediate/ecoregions_mask/\"\n",
"# biomass_files = fs.ls(biomass_folder) # just to get list of lat_lon tiles we want\n",
"# lat_lon_tags = [utils.get_lat_lon_tags_from_tile_path(fp) for fp in biomass_files]\n",
"# lat_lon_tags = [('60N', '130W')]#, ('40N', '130W')]#, ('00N', '060W')] #('50N', '130W'),\n",
"bounding_boxes = [utils.parse_bounding_box_from_lat_lon_tags(lat, lon) for lat, lon in lat_lon_tags]"
]
},
@@ -199,10 +228,11 @@
"from carbonplan_trace.v1.glas_allometric_eq import REALM_GROUPINGS\n",
"\n",
"processed_scenes = []\n",
"for year in np.arange(2014, 2021):\n",
" processed_scenes.extend(fs.ls(f\"{bucket}/inference/rf/{year}\", recursive=True))\n",
"for year in np.arange(2011, 2022):\n",
" processed_scenes.extend(fs.ls(f\"{bucket}/inference/xg/{year}\", recursive=True))\n",
"\n",
"processed_scenes = [scene[-19:-8] for scene in processed_scenes]"
"processed_scenes = [scene[-19:-8] for scene in processed_scenes]\n",
"len(processed_scenes)"
]
},
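A quick worked example of the `scene[-19:-8]` slice above, since the magic numbers are easy to misread (the path string is just an example of what `fs.ls` returns):

```python
# The slice strips the trailing ".parquet" (8 characters) and keeps the last 11
# characters before it, i.e. the "{year}/{path}{row}" key used to skip
# already-processed scenes.
scene = "carbonplan-climatetrace/v2.1/inference/xg/2014/054018.parquet"
print(scene[-19:-8])  # "2014/054018"
```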
{
@@ -211,7 +241,15 @@
"metadata": {},
"outputs": [],
"source": [
"len(processed_scenes)"
"import carbonplan_trace"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We'll loop through every scene and every year and calculate biomass for that scene. Will produce\n",
"table of values [x, y, (both specific to utm projection), lat, lon, biomass].\n"
]
},
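To make that expected output concrete, here is a hedged sketch of inspecting one per-scene result; the path mirrors the example scenes used later in this notebook, and the exact column set is whatever `predict()` writes, so it may differ:

```python
# Sketch only: inspect one per-scene inference output to confirm the schema
# described above (x, y in the scene's UTM projection, lat, lon, biomass).
# Requires s3fs and AWS credentials to read from the bucket.
import pandas as pd

scene_path = "s3://carbonplan-climatetrace/v2/inference/rf/2014/054018.parquet"
df = pd.read_parquet(scene_path)
print(df.columns.tolist())
print(df.head())
```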
{
@@ -220,7 +258,9 @@
"metadata": {},
"outputs": [],
"source": [
"len(processed_scenes) - 57875"
"for bounding_box in bounding_boxes:\n",
" min_lat, max_lat, min_lon, max_lon = bounding_box\n",
" valid_scenes = gdf.cx[min_lon:max_lon, min_lat:max_lat][[\"PATH\", \"ROW\"]].values"
]
},
{
@@ -229,15 +269,54 @@
"metadata": {},
"outputs": [],
"source": [
"len(bounding_boxes)"
"file_lengths = pd.DataFrame(\n",
" columns=[\"v1-rf\", \"v2-rf\", \"v2-xg\"],\n",
" index=[\"_\".join([str(path), str(row)]) for (path, row) in valid_scenes],\n",
")"
]
},
{
"cell_type": "markdown",
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# rerun_scenes = {'2010':[], '2014':[]}\n",
"# setups = [('v2', 'rf')]#, ('v2', 'xg')] #('v1', 'rf'),\n",
"# for year in ['2010', '2014']:\n",
"# for (version, model) in setups:\n",
"# for [path, row] in valid_scenes:\n",
"# output_name = f\"{year}/{path:03d}{row:03d}.parquet\"\n",
"# print(f's3://carbonplan-climatetrace/{version}/inference/{model}/{output_name}')\n",
"# if len(fs.ls(f's3://carbonplan-climatetrace/{version}/inference/{model}/{output_name}')) == 0:\n",
"# if [path, row] not in rerun_scenes[year]:\n",
"# rerun_scenes[year].append([path, row])\n",
"# i+=1\n",
"# file_length = len(pd.read_parquet(f's3://carbonplan-climatetrace/{version}/inference/{model}/{output_name}'))\n",
"# except FileNotFoundError:\n",
"# file_length = np.nan\n",
"\n",
"# file_lengths.loc[f'{path}_{row}', f'{version}-{model}'] = file_length"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"We'll loop through every scene and every year and calculate biomass for that scene. Will produce\n",
"table of values [x, y, (both specific to utm projection), lat, lon, biomass].\n"
"# file_lengths.to_csv('files_to_repeat.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# remove each entry in index"
]
},
{
@@ -249,25 +328,26 @@
"outputs": [],
"source": [
"landsat_bucket = \"s3://usgs-landsat/collection02/level-2/standard/etm/{}/{:03d}/{:03d}/\"\n",
"\n",
"with rio.Env(aws_session):\n",
" # tasks = []\n",
" tasks = []\n",
" task_ids = []\n",
" for bounding_box in bounding_boxes:\n",
" print(bounding_box)\n",
" min_lat, max_lat, min_lon, max_lon = bounding_box\n",
" scenes_in_tile = gdf.cx[min_lon:max_lon, min_lat:max_lat][[\"PATH\", \"ROW\"]].values\n",
" for year in np.arange(2014, 2021):\n",
" for year in np.arange(2011, 2022):\n",
" for [path, row] in scenes_in_tile:\n",
" scene_stores = fs.ls(landsat_bucket.format(year, path, row))\n",
" output_name = f\"{year}/{path:03d}{row:03d}\"\n",
" if len(scene_stores) == 0:\n",
" continue\n",
" elif output_name in processed_scenes:\n",
" continue\n",
" elif output_name in task_id:\n",
" continue\n",
" else:\n",
" tasks.append(\n",
" # predict(\n",
" # predict(\n",
" client.compute(\n",
" predict_delayed(\n",
" model_folder=f\"{bucket}/models/\",\n",
@@ -281,7 +361,7 @@
" resources={\"workertoken\": 1},\n",
" )\n",
" )\n",
" task_ids.append([path, row, year, max_lat, min_lon])"
" task_id.append(output_name)"
]
},
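Once the loop above has submitted everything, the futures collected in `tasks` can be monitored; a minimal sketch, assuming `tasks` holds the futures returned by `client.compute`:

```python
# Sketch: block until the submitted prediction tasks finish and count failures.
from dask.distributed import wait

done, not_done = wait(tasks)
errored = [future for future in tasks if future.status == "error"]
print(f"{len(done)} finished, {len(errored)} errored")
```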
{
@@ -292,7 +372,7 @@
},
"outputs": [],
"source": [
"len(tasks)"
"len(rerun_scenes[\"2014\"])"
]
},
{
@@ -307,6 +387,15 @@
"results"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"results"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -320,8 +409,8 @@
"# row = task_id[i][1]\n",
"# year = task_id[i][2]\n",
"\n",
"path = 93\n",
"row = 11\n",
"path = 48\n",
"row = 22\n",
"year = 2014\n",
"\n",
"print(path, row, year)\n",
@@ -337,6 +426,72 @@
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fs.ls(\"s3://carbonplan-climatetrace/v2/inference/rf/2014/054018.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fs.ls(\"s3://carbonplan-climatetrace/v2/inference/xg/2014/054018.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"fs.ls(\"s3://carbonplan-climatetrace/v2/inference/rf/2014/054018.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# i = 0\n",
"# path = task_id[i][0]\n",
"# row = task_id[i][1]\n",
"# year = task_id[i][2]\n",
"\n",
"path = 54\n",
"row = 18\n",
"year = 2010\n",
"\n",
"print(path, row, year)\n",
"\n",
"predict(\n",
" model_folder=f\"{bucket}/models/\",\n",
" path=path,\n",
" row=row,\n",
" year=year,\n",
" access_key_id=access_key_id,\n",
" secret_access_key=secret_access_key,\n",
" output_write_bucket=f\"{bucket}/inference\",\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pd.read_parquet(\"s3://carbonplan-climatetrace/v2/inference/rf/2010/054018.parquet\")"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -404,9 +559,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "Python [conda env:notebook] *",
"language": "python",
"name": "python3"
"name": "conda-env-notebook-py"
},
"language_info": {
"codemirror_mode": {
297 changes: 98 additions & 199 deletions notebooks/processing/model.ipynb
@@ -45,7 +45,11 @@
"metadata": {},
"outputs": [],
"source": [
"realms = list(REALM_GROUPINGS.keys())"
"# we train one model per realm\n",
"\n",
"# realms = list(REALM_GROUPINGS.keys())\n",
"# only use australia for example, but we would want all when rerunning this\n",
"realms = [\"australia\"]"
]
},
{
@@ -55,43 +59,44 @@
"metadata": {},
"outputs": [],
"source": [
"# HPO\n",
"import itertools\n",
"\n",
"\n",
"def product_dict(**kwargs):\n",
" keys = kwargs.keys()\n",
" vals = kwargs.values()\n",
" for instance in itertools.product(*vals):\n",
" yield dict(zip(keys, instance))\n",
"\n",
"\n",
"param_set = {\n",
" \"learning_rate\": [0.07, 0.05, 0.03],\n",
" \"max_depth\": [10, 12, 14],\n",
" \"colsample_bytree\": [0.5, 0.7, 0.9],\n",
" \"subsample\": [0.5, 0.7, 0.9],\n",
" \"min_child_weight\": [2, 4, 6],\n",
" \"lambda\": [1, 1.5, 2],\n",
" \"alpha\": [0, 0.5, 1],\n",
" \"gamma\": [0, 0.5, 1],\n",
"}\n",
"\n",
"groupings = [\n",
" [\"learning_rate\"],\n",
" [\"max_depth\"],\n",
" [\"colsample_bytree\", \"subsample\", \"min_child_weight\"],\n",
" [\"lambda\", \"alpha\", \"gamma\"],\n",
"]\n",
"\n",
"dims = [list(range(len(param_set[g[0]]))) for g in groupings]\n",
"param_set_list = []\n",
"for orders in list(itertools.product(*dims)):\n",
" d = {}\n",
" for o, g in zip(orders, groupings):\n",
" for k in g:\n",
" d[k] = param_set[k][o]\n",
" param_set_list.append(d)"
"# This block of code is used for generating difference parameter sets for hyperparameter optimization (HPO) of the model\n",
"# the params here are for the xgboost model\n",
"\n",
"# import itertools\n",
"\n",
"# def product_dict(**kwargs):\n",
"# keys = kwargs.keys()\n",
"# vals = kwargs.values()\n",
"# for instance in itertools.product(*vals):\n",
"# yield dict(zip(keys, instance))\n",
"\n",
"\n",
"# param_set = {\n",
"# \"learning_rate\": [0.07, 0.05, 0.03],\n",
"# \"max_depth\": [10, 12, 14],\n",
"# \"colsample_bytree\": [0.5, 0.7, 0.9],\n",
"# \"subsample\": [0.5, 0.7, 0.9],\n",
"# \"min_child_weight\": [2, 4, 6],\n",
"# \"lambda\": [1, 1.5, 2],\n",
"# \"alpha\": [0, 0.5, 1],\n",
"# \"gamma\": [0, 0.5, 1],\n",
"# }\n",
"\n",
"# groupings = [\n",
"# [\"learning_rate\"],\n",
"# [\"max_depth\"],\n",
"# [\"colsample_bytree\", \"subsample\", \"min_child_weight\"],\n",
"# [\"lambda\", \"alpha\", \"gamma\"],\n",
"# ]\n",
"\n",
"# dims = [list(range(len(param_set[g[0]]))) for g in groupings]\n",
"# param_set_list = []\n",
"# for orders in list(itertools.product(*dims)):\n",
"# d = {}\n",
"# for o, g in zip(orders, groupings):\n",
"# for k in g:\n",
"# d[k] = param_set[k][o]\n",
"# param_set_list.append(d)"
]
},
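Because the HPO block above is kept only as commented-out reference, here is a runnable version of the same grouped-grid idea: parameters inside a group share one index, so the search space is 3**4 = 81 combinations rather than the full 3**8 Cartesian product.

```python
# Runnable sketch of the grouped hyperparameter grid kept (commented out) above.
import itertools

param_set = {
    "learning_rate": [0.07, 0.05, 0.03],
    "max_depth": [10, 12, 14],
    "colsample_bytree": [0.5, 0.7, 0.9],
    "subsample": [0.5, 0.7, 0.9],
    "min_child_weight": [2, 4, 6],
    "lambda": [1, 1.5, 2],
    "alpha": [0, 0.5, 1],
    "gamma": [0, 0.5, 1],
}

groupings = [
    ["learning_rate"],
    ["max_depth"],
    ["colsample_bytree", "subsample", "min_child_weight"],
    ["lambda", "alpha", "gamma"],
]

# one index per group; every parameter in a group moves together
dims = [range(len(param_set[group[0]])) for group in groupings]
param_set_list = []
for orders in itertools.product(*dims):
    combo = {}
    for index, group in zip(orders, groupings):
        for name in group:
            combo[name] = param_set[name][index]
    param_set_list.append(combo)

print(len(param_set_list))  # 81
```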
{
@@ -101,6 +106,9 @@
"metadata": {},
"outputs": [],
"source": [
"# helper functions for assessing model performances\n",
"\n",
"\n",
"def get_all_prediction_result(model, df_train, df_test, df_val):\n",
"\n",
" df_train[\"biomass_pred\"] = model._predict(df_train)\n",
@@ -124,7 +132,21 @@
" mae = (merged.biomass_year2 - merged.biomass_year1).abs().mean()\n",
" me = (merged.biomass_year2 - merged.biomass_year1).mean()\n",
"\n",
" return {\"mae\": mae, \"me\": me}"
" return {\"mae\": mae, \"me\": me}\n",
"\n",
"\n",
"def plot_scatter(sub, title, n=500000):\n",
" xmin = -10\n",
" size = min(len(sub), n)\n",
" toplot = sub.sample(n=size)\n",
" xmax = toplot.biomass.quantile(0.95)\n",
" plt.scatter(toplot.biomass, toplot.biomass_pred, s=1, alpha=0.03)\n",
" plt.plot([xmin, xmax], [xmin, xmax], \"k\")\n",
" plt.xlabel(\"True Biomass (Mg/ha)\")\n",
" plt.ylabel(\"Predicted Biomass (Mg/ha)\")\n",
" plt.xlim(xmin, xmax)\n",
" plt.ylim(xmin, xmax)\n",
" plt.title(title)"
]
},
{
@@ -137,11 +159,16 @@
"outputs": [],
"source": [
"scores = []\n",
"random_split = False\n",
"# whether to randomly split the train/test data or to split train/test based on year\n",
"# doesn't seem to make too big of a difference on validation performance\n",
"random_split = True\n",
"# whether to reload the training data from individual years, or use the compiled data directly\n",
"# only needs to be True when the training data is re-generated\n",
"reload = False\n",
"# whether to overwrite the models already trained\n",
"overwrite = False\n",
"\n",
"for model_class in [m.random_forest_model]: # m.xgb_model\n",
"for model_class in [m.random_forest_model, m.xgb_model]:\n",
" for realm in realms:\n",
" print(f\"Building model for {realm} realm\")\n",
"\n",
@@ -154,32 +181,35 @@
" )\n",
" print(f\" size of entire df is {round(df.size / 1e9, 2)}Gb\")\n",
"\n",
" for strategy in [\"last\"]: # [\"first\", \"last\", \"no\"]:\n",
" # split into train/test based on year\n",
" for strategy in [\"none\"]: # [\"first\", \"last\", \"none\"]:\n",
" # strategy = \"first\" means that the first year is used for validation, and \"last\" means the last year is used for validation\n",
" # strategy = none means that no data is reserved for validation => used for training the final production model,\n",
" # whereas first/last allow us to assess model performance during the model design and tuning phases\n",
" df_train, df_test, df_val = m.train_test_split_based_on_year(\n",
" df, val_strategy=strategy, random_train_test=random_split\n",
" )\n",
" print(f\" training sample size = {len(df_train)}\")\n",
" print(f\" testing sample size = {len(df_test)}\")\n",
" print(f\" eval sample size = {len(df_val)}\")\n",
"\n",
" # build 2 models: 1) baseline/mean, 2) xgboost\n",
" # TODO: build linear model as another baseline model\n",
" # m.baseline_model, m.gradient_boost_model, m.random_forest_model\n",
"\n",
" # this for loop is for running different parameter sets in HPO\n",
" for params in [{}]:\n",
"\n",
" # instantiating the model also does .fit\n",
" # this will load the model if it already exist and overwrite=False, and fit the model if overwrite=True or the model does not exist\n",
" model = model_class(\n",
" realm=realm,\n",
" df_train=df_train,\n",
" df_test=df_test,\n",
" output_folder=\"s3://carbonplan-climatetrace/v1/models/\",\n",
" output_folder=\"s3://carbonplan-climatetrace/v2.1/models/\", # v1 or v2\n",
" overwrite=overwrite,\n",
" validation_year=\"none\",\n",
" validation_year=strategy,\n",
" params=params,\n",
" )\n",
"\n",
" # do model evaluation on each split of the data: train, test, and validation\n",
" for split, sub in zip((\"train\", \"test\", \"val\"), (df_train, df_test, df_val)):\n",
" # validation data can be empty if val strategy = 'none'\n",
" if len(sub) > 0:\n",
" model_score = model.evaluate(sub)\n",
" model_score[\"model_name\"] = model.name\n",
@@ -194,6 +224,7 @@
" df_train[\"biomass_pred\"] = model.predict(df_train)\n",
" df_test[\"biomass_pred\"] = model.predict(df_test)\n",
"\n",
" # plot the prediction result\n",
" plt.figure(figsize=(10, 4.5))\n",
" plt.subplot(1, 2, 1)\n",
" plot_scatter(df_train, title=f\"{realm} train samples\")\n",
@@ -203,57 +234,35 @@
" plt.show()\n",
" plt.close()\n",
"\n",
" plt.figure(figsize=(10, 4))\n",
" plt.title(f\"{realm} feature importance\")\n",
" xticks = np.arange(len(m.features)) * 2\n",
" plt.bar(xticks, model.model.feature_importances_)\n",
" plt.xticks(ticks=xticks, labels=m.features, rotation=\"vertical\")\n",
" plt.savefig(f\"{realm}_feature_imp.png\")\n",
" plt.show()\n",
" plt.close()\n",
" # plotting feature importance if the model being trained is random forest\n",
" if \"rf\" in model.name:\n",
" plt.figure(figsize=(10, 4))\n",
" plt.title(f\"{realm} feature importance\")\n",
" xticks = np.arange(len(m.features)) * 2\n",
" plt.bar(xticks, model.model.feature_importances_)\n",
" plt.xticks(ticks=xticks, labels=m.features, rotation=\"vertical\")\n",
" plt.savefig(f\"{realm}_feature_imp.png\")\n",
" plt.show()\n",
" plt.close()\n",
" # TODO: plot something else if we're training the xgboost model\n",
"\n",
"scores = pd.DataFrame(scores)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e6519687-11f4-41e0-b8fe-0191fecc98ea",
"metadata": {},
"outputs": [],
"source": [
"def plot_scatter(sub, title, n=500000):\n",
" xmin = -10\n",
" size = min(len(sub), n)\n",
" toplot = sub.sample(n=size)\n",
" xmax = toplot.biomass.quantile(0.95)\n",
" plt.scatter(toplot.biomass, toplot.biomass_pred, s=1, alpha=0.03)\n",
" plt.plot([xmin, xmax], [xmin, xmax], \"k\")\n",
" plt.xlabel(\"True Biomass (Mg/ha)\")\n",
" plt.ylabel(\"Predicted Biomass (Mg/ha)\")\n",
" plt.xlim(xmin, xmax)\n",
" plt.ylim(xmin, xmax)\n",
" plt.title(title)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "527a24c3-477f-4023-8816-3f2cb8d91ba3",
"metadata": {},
"outputs": [],
"source": [
"df_train.year.unique()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8716960a-ba23-42d3-abe9-03fb633dad2e",
"metadata": {},
"outputs": [],
"source": [
"scores"
"scores\n",
"\n",
"# only selecting everything that's test or val split\n",
"# scores.loc[scores.split == 'val]\n",
"\n",
"# doing weighted average of the scores\n",
"# (scores.loc[scores.split == 'test'].r2 * scores.loc[scores.split == 'test'].sample_size).sum() / scores.loc[scores.split == 'test'].sample_size.sum()"
]
},
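The commented lines above hint at how `scores` gets summarized; a self-contained toy version of the sample-size-weighted average (in the notebook, `scores` is the dataframe built by the training loop, with split, r2, and sample_size columns):

```python
# Toy example of the weighted-average pattern sketched in the comments above.
import pandas as pd

scores = pd.DataFrame(
    {"split": ["test", "test", "val"], "r2": [0.70, 0.80, 0.65], "sample_size": [100, 300, 50]}
)
test = scores.loc[scores.split == "test"]
print((test.r2 * test.sample_size).sum() / test.sample_size.sum())  # 0.775
```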
{
@@ -291,116 +300,6 @@
" sub = scores.loc[(scores.split == \"train\") & (scores.validation_year == validation_year)]\n",
" print(f\"training score = {(sub.r2 * sub.sample_size).sum() / sub.sample_size.sum()}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bea208ad-daa1-4811-bbe0-cbc6e1ff75dc",
"metadata": {},
"outputs": [],
"source": [
"temporal_variability = pd.read_csv(\"temporal_variability.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cd8e436c-3a67-4541-9772-3ca633323102",
"metadata": {},
"outputs": [],
"source": [
"temporal_variability[\"realm\"] = temporal_variability.model_name.apply(lambda x: x.split(\"_\")[1])\n",
"temporal_variability[\"model_type\"] = temporal_variability.model_name.apply(\n",
" lambda x: x.split(\"_\")[0]\n",
")\n",
"\n",
"sample_size = (\n",
" scores.loc[(scores.random_split == True) & (scores.model_name.str.startswith(\"xgb\"))]\n",
" .groupby(\"realm\")\n",
" .sample_size.sum()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "716e1e97-4f6c-4609-8f2f-bfd7bcd6bdba",
"metadata": {},
"outputs": [],
"source": [
"# weighted average\n",
"merged = temporal_variability.loc[temporal_variability.random_split != True].merge(\n",
" sample_size, how=\"left\", on=\"realm\"\n",
")\n",
"name_dict = {\n",
" \"gb\": \"gradient boosting\",\n",
" \"ground\": \"lidar derived\",\n",
" \"rf\": \"random forest\",\n",
" \"xgb\": \"xgboost\",\n",
"}\n",
"merged[\"model_type\"] = merged.model_type.apply(lambda x: name_dict[x])\n",
"\n",
"print(\n",
" \"Biomass MAE between years 2007 and 2008 of the same location using different model architecture\"\n",
")\n",
"print(\"\")\n",
"for model, g in merged.groupby(\"model_type\"):\n",
" print(\n",
" model.ljust(20),\n",
" np.round((g.mae * g.sample_size).sum() / g.sample_size.sum(), 4),\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "90400beb-c2bc-443f-9aa8-a2e76a57a9b8",
"metadata": {},
"outputs": [],
"source": [
"# simple average\n",
"temporal_variability.loc[temporal_variability.random_split != True].merge(\n",
" sample_size, how=\"left\", on=\"realm\"\n",
").groupby(\"model_type\").mae.mean()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f3498408-2ad8-4aed-a8bb-f6d722ba025a",
"metadata": {},
"outputs": [],
"source": [
"scores = pd.read_csv(\"HPO_1.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6e5d4987-11af-4f8b-bc48-965bb7838310",
"metadata": {},
"outputs": [],
"source": [
"df.loc[df.split == \"test\"].groupby(\n",
" [\"learning_rate\", \"max_depth\", \"colsample_bytree\", \"lambda\"]\n",
").mean().sort_values(by=\"r2\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ea82ac5c-5cbb-4b87-b739-f98249a73b02",
"metadata": {},
"outputs": [],
"source": [
"# from sklearn.preprocessing import OneHotEncoder\n",
"# igbp_encoder = OneHotEncoder(sparse=False, categories='auto', handle_unknown='ignore').fit(df_train[['igbp']])\n",
"# # one hot encoding for igbp\n",
"# encoded_igbp = igbp_encoder.transform(X[['igbp']])\n",
"# X = X.drop(['igbp'], axis=1)\n",
"# for i in range(encoded_igbp.shape[1]):\n",
"# X[f'igbp_cat_{str(i+1)}'] = encoded_igbp[:, i]"
]
}
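The last commented-out cell sketches one-hot encoding of the igbp land-cover class; a runnable toy version of that idea follows (note that scikit-learn >= 1.2 uses sparse_output where the comment above uses sparse):

```python
# Runnable sketch of the one-hot-encoding idea kept (commented out) above,
# using a toy dataframe; in the notebook, X would be the model feature table.
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

X = pd.DataFrame({"igbp": [1, 5, 5, 8], "ndvi": [0.2, 0.6, 0.7, 0.4]})

igbp_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore").fit(X[["igbp"]])
encoded_igbp = igbp_encoder.transform(X[["igbp"]])

X = X.drop(["igbp"], axis=1)
for i in range(encoded_igbp.shape[1]):
    X[f"igbp_cat_{i + 1}"] = encoded_igbp[:, i]

print(X)
```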
],
"metadata": {