Merge pull request #3 from godatadriven/dev
Dev
JasperHG90 authored Jan 5, 2021
2 parents 3cc0840 + a71bbf7 commit 9d50b19
Showing 5 changed files with 415 additions and 1,094 deletions.
21 changes: 21 additions & 0 deletions README.md
@@ -148,3 +148,24 @@ Note that the inputs to `MyPivenModel` must match the inputs to the `piven_model`
You can now call all methods defined as in the PivenBaseModel class. Check the
[PivenMlpModel class](https://gitlab.com/jasperginn/piven.py/-/blob/dev/src/piven/Models/mlp_regressor.py)
for a more detailed example.

## Details: loss function

The piven loss function is more complex than a regular loss function because it combines three objectives:

1. The coverage (the proportion of observations that fall within the lower and upper PI bounds) should be
approximately 1 - *alpha*, where *alpha* is the desired significance level.
2. The PI should be as narrow as possible.
3. The point prediction should be as accurate as possible.

The piven loss function combines these objectives into a single loss. It takes three arguments:

1. *alpha*: the desired significance level. Given this value, we aim for PIs such that, if we re-ran our experiment
many times, the PIs would contain the true outcome values a fraction (1 - *alpha*) of the time.
2. *lambda*: a hyperparameter controlling the relative importance of PI width versus PI coverage. As *lambda*
shrinks to 0, you will observe narrower PIs at the cost of lower coverage.
3. *soften*: a technical parameter that smooths the coverage term of the loss so that it can be optimized with a
gradient-based solver.

The default settings are those used by the paper's authors. You should probably leave them as they are unless you
know what you are doing. For further details, see pp. 4-5 of the paper cited above and the illustrative sketch below.
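
For intuition, here is a minimal NumPy sketch of how these three terms can be combined. It uses a hard coverage indicator and made-up default values; it is an illustration of the structure only, not the exact implementation used by this package or the paper:

```python
import numpy as np

def piven_loss_sketch(y_true, y_pred, pi_low, pi_high, alpha=0.05, lambda_=15.0):
    """Illustrative only: combines coverage, PI width, and point accuracy."""
    # Objective 1: coverage. Fraction of observations inside their PI.
    captured = (y_true >= pi_low) & (y_true <= pi_high)
    picp = captured.mean()
    # Penalize coverage that falls below the target 1 - alpha.
    coverage_penalty = max(0.0, (1.0 - alpha) - picp) ** 2
    # Objective 2: width. Mean width of the intervals that capture y_true.
    mpiw = (pi_high - pi_low)[captured].mean() if captured.any() else 0.0
    # Objective 3: accuracy of the point prediction.
    mse = np.mean((y_true - y_pred) ** 2)
    # lambda_ trades PI width off against coverage.
    return mpiw + lambda_ * coverage_penalty + mse
```

In the real loss, the hard indicator `captured` is replaced by a smooth approximation, roughly `sigmoid(soften * (y - lower)) * sigmoid(soften * (upper - y))`, which is what makes the coverage term differentiable and optimizable by a gradient-based solver.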
175 changes: 175 additions & 0 deletions notebooks/ablation_analysis.ipynb
@@ -0,0 +1,175 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Using an ensemble of piven regressors"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"n_regressors = 10\n",
"n = test_y.shape[0]\n",
"y_pred_out = np.zeros((n, n_regressors))\n",
"y_pred_pi_low = np.zeros((n, n_regressors))\n",
"y_pred_pi_high = np.zeros((n, n_regressors))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Convenience function\n",
"def make_model():\n",
" # Put model in pipeline\n",
" model = PivenKerasRegressor(build_fn=piven_model, \n",
" input_dim=train_x.shape[-1], \n",
" dense_units=(32,16), \n",
" dropout_rate=(0.0,0.09),\n",
" lambda_=22.28,\n",
" lr= 0.000428)\n",
" pipeline = Pipeline([\n",
" (\"preprocess\", StandardScaler()),\n",
" (\"model\", model)\n",
" ])\n",
" # Finally, normalize the output target\n",
" model_ttr = PivenTransformedTargetRegressor(\n",
" regressor=pipeline,\n",
" transformer=StandardScaler()\n",
" )\n",
" return model_ttr\n",
"\n",
"# Back-transform the predictions and PIs\n",
"def back_transform(x, max_train):\n",
" xexp = np.exp(x)\n",
" return max_train + 1 - xexp"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Callbacks\n",
"early_stop = tf.keras.callbacks.EarlyStopping(\n",
" monitor=\"val_loss\",\n",
" min_delta=0,\n",
" patience=5,\n",
" verbose=0,\n",
" mode=\"auto\",\n",
" baseline=None,\n",
" restore_best_weights=True,\n",
")\n",
"\n",
"\n",
"reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(\n",
" monitor='val_loss', factor=0.1, patience=2, verbose=1,\n",
" mode='auto', min_delta=0.0001, cooldown=0, min_lr=0.00001\n",
")\n",
"# Fit\n",
"for i in range(n_regressors):\n",
" print(f\"Fitting model {i+1}\") \n",
" model_ttr = make_model()\n",
" h = model_ttr.fit(train_x, train_y_transformed, model__epochs=25, \n",
" model__validation_split=0.2, model__batch_size=64,\n",
" model__callbacks=[early_stop, reduce_lr])\n",
" ypred, y_pi_low, y_pi_high = model_ttr.predict(test_x, \n",
" return_prediction_intervals=True)\n",
"\n",
" y_pred_out[:,i] = back_transform(ypred, max_out_train)\n",
" # Need to reverse the bounds\n",
" y_pred_pi_low[:,i] = back_transform(y_pi_high, max_out_train)\n",
" y_pred_pi_high[:,i] = back_transform(y_pi_low, max_out_train)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"np.save(\"y_pred.npy\", y_pred_out)\n",
"np.save(\"y_pred_pi_low.npy\", y_pred_pi_low)\n",
"np.save(\"y_pred_pi_high.npy\", y_pred_pi_high)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y_pred = np.mean(y_pred_out, axis=1)\n",
"y_pi_low = np.mean(y_pred_pi_low, axis=1)\n",
"y_pi_high = np.mean(y_pred_pi_high, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"idx_sort = np.argsort(test_y)\n",
"plt.fill_betweenx(\n",
" test_y[idx_sort],\n",
" y_pi_low[idx_sort],\n",
" y_pi_high[idx_sort],\n",
" alpha=0.5\n",
")\n",
"plt.scatter(y_pred[idx_sort], test_y[idx_sort], c=\"r\", alpha=0.2)\n",
"plt.xlim(1900, 2015)\n",
"plt.title(\"Predicted versus true release years\")\n",
"plt.xlabel(\"Predicted release year\")\n",
"plt.ylabel(\"True release year\")\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"idx_sort = np.argsort(test_y)\n",
"for idx in range(y_pred_out.shape[1]):\n",
" plt.plot(y_pred_pi_low[idx_sort, idx], test_y[idx_sort], alpha=0.6)\n",
" plt.plot(y_pred_pi_high[idx_sort, idx], test_y[idx_sort], alpha=0.6)\n",
"plt.xlim(1860, 2015)\n",
"plt.title(\"Predicted versus true release years\")\n",
"plt.xlabel(\"Predicted release year\")\n",
"plt.ylabel(\"True release year\")\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (pivenregressor)",
"language": "python",
"name": "pivenregressor"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
139 changes: 139 additions & 0 deletions notebooks/hyperparameter_optimization.ipynb
@@ -0,0 +1,139 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Hyperparameter optimization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import optuna\n",
"\n",
"\n",
"def back_transform(x, max_train):\n",
" xexp = np.exp(x)\n",
" return max_train + 1 - xexp\n",
"\n",
"\n",
"early_stop = tf.keras.callbacks.EarlyStopping(\n",
" monitor=\"val_loss\",\n",
" min_delta=0,\n",
" patience=5,\n",
" verbose=0,\n",
" mode=\"auto\",\n",
" baseline=None,\n",
" restore_best_weights=True,\n",
")\n",
"\n",
"\n",
"reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(\n",
" monitor='val_loss', factor=0.1, patience=2, verbose=0,\n",
" mode='auto', min_delta=0.0001, cooldown=0, min_lr=0.00001\n",
")\n",
"\n",
"\n",
"def objective(trial: optuna.trial.Trial):\n",
" lambda_ = trial.suggest_float(\"lambda_\", 22.0, 28.0)\n",
" lr = trial.suggest_loguniform(\"lr\", 1e-5, 0.001)\n",
" dropout_rate_l1 = trial.suggest_uniform(\"dropout_rate_l1\", 0.0, 0.5)\n",
" dropout_rate_l2 = trial.suggest_uniform(\"dropout_rate_l2\", 0.0, 0.5)\n",
" layer_size = trial.suggest_categorical(\"layer_size\", \n",
" [\"32+16\", \"64+32\", \n",
" \"128+64\", \"128+128\"],\n",
" )\n",
" layer_size = tuple([int(ls) for ls in layer_size.split(\"+\")])\n",
" seed = trial.suggest_int(\"seed\", 162, 9999999)\n",
" np.random.seed(seed)\n",
" # Split train from test data\n",
" x_ind = [*range(train_x.shape[0])]\n",
" np.random.shuffle(x_ind)\n",
" train_x_trial, val_x_trial = train_x[:420000, :].copy(), train_x[420000:, :].copy()\n",
" train_y_trial, val_y_trial = train_y[:420000].copy(), train_y[420000:].copy()\n",
" max_train_out = np.max(train_y_trial)\n",
" # Transform\n",
" train_y_trial_transformed = reflect_and_log(train_y_trial, max_train_out)\n",
" val_y_trial_transformed = reflect_and_log(val_y_trial, max_train_out)\n",
" # Instantiate model\n",
" model = PivenKerasRegressor(build_fn=piven_model, \n",
" input_dim=train_x_trial.shape[-1], \n",
" dense_units=layer_size, \n",
" dropout_rate=(dropout_rate_l1, dropout_rate_l2),\n",
" lambda_=lambda_,\n",
" lr=lr)\n",
" # Put model in pipeline\n",
" pipeline = Pipeline([\n",
" (\"preprocess\", StandardScaler()),\n",
" (\"model\", model)\n",
" ])\n",
" model_ttr = PivenTransformedTargetRegressor(regressor=pipeline,\n",
" transformer=StandardScaler())\n",
" # Fit\n",
" model_ttr.fit(train_x_trial, train_y_trial_transformed, model__epochs=25, \n",
" model__validation_split=0.1, model__batch_size=64,\n",
" model__verbose=False, model__callbacks=[early_stop, reduce_lr])\n",
" y_pred, y_pred_pi_low, y_pred_pi_high = model_ttr.predict(val_x_trial, return_prediction_intervals=True)\n",
" # Back-transform the predictions and PIs\n",
" y_pred = back_transform(y_pred, max_train_out)\n",
" y_pi_low = back_transform(y_pred_pi_high, max_train_out)\n",
" y_pi_high = back_transform(y_pred_pi_low, max_train_out)\n",
" # Compute metrics\n",
" cov_trial = coverage(val_y_trial, y_pi_low, y_pi_high)\n",
" piw_trial = pi_width(y_pi_low, y_pi_high)\n",
" loss_trial = piven_loss_numpy(val_y_trial, y_pred, y_pi_low, y_pi_high, lambda_, 160.0, 0.05)\n",
" # Set metrics\n",
" trial.set_user_attr(\"coverage\", float(cov_trial))\n",
" trial.set_user_attr(\"pi_width\", float(piw_trial))\n",
" # Return model loss\n",
" return loss_trial"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"experiment = optuna.create_study(direction=\"minimize\",\n",
" study_name=\"piven_year_msd\",\n",
" storage='sqlite:///data/experiment_results.db',\n",
" load_if_exists=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"experiment.optimize(objective, n_trials=50)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "PyCharm (pivenregressor)",
"language": "python",
"name": "pycharm-b9238b0d"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.9"
}
},
"nbformat": 4,
"nbformat_minor": 4
}