diff --git a/02_activities/assignments/assignment_3.ipynb b/02_activities/assignments/assignment_3.ipynb index 060c21671..cdf5a222f 100644 --- a/02_activities/assignments/assignment_3.ipynb +++ b/02_activities/assignments/assignment_3.ipynb @@ -20,6 +20,37 @@ "+ Please note: the instructions are not meant to be 100% prescriptive, but instead they are a set of minimum requirements. If you find predictive performance gains by applying additional steps, by all means show them. " ] }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " X Y month day FFMC DMC DC ISI temp RH wind rain area\n", + "0 7 5 mar fri 86.2 26.2 94.3 5.1 8.2 51 6.7 0.0 0.0\n", + "1 7 4 oct tue 90.6 35.4 669.1 6.7 18.0 33 0.9 0.0 0.0\n", + "2 7 4 oct sat 90.6 43.7 686.9 6.7 14.6 33 1.3 0.0 0.0\n", + "3 8 6 mar fri 91.7 33.3 77.5 9.0 8.3 97 4.0 0.2 0.0\n", + "4 8 6 mar sun 89.3 51.3 102.2 9.6 11.4 99 1.8 0.0 0.0\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import os\n", + "\n", + "# Path to the data, relative to this notebook, so the load works on any machine (not only the author's)\n", + "data = \"../../05_src/data/fires/forestfires.csv\"\n", + "\n", + "# Load the dataset\n", + "df = pd.read_csv(data)\n", + "\n", + "print(df.head())\n" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -97,25 +128,66 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['X', 'Y', 'month', 'day', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH',\n", + " 'wind', 'rain', 'area'],\n", + " dtype='object')\n" + ] + } + ], "source": [ - "# Load the libraries as required." 
+ "import pandas as pd\n", + "import os\n", + "\n", + "# Path to the data, relative to this notebook, so the load works on any machine (not only the author's)\n", + "data = \"../../05_src/data/fires/forestfires.csv\"\n", + "\n", + "# Load the dataset\n", + "fires_dt = pd.read_csv(data)\n", + "\n", + "print(fires_dt.columns)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 517 entries, 0 to 516\n", + "Data columns (total 13 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 X 517 non-null int64 \n", + " 1 Y 517 non-null int64 \n", + " 2 month 517 non-null object \n", + " 3 day 517 non-null object \n", + " 4 FFMC 517 non-null float64\n", + " 5 DMC 517 non-null float64\n", + " 6 DC 517 non-null float64\n", + " 7 ISI 517 non-null float64\n", + " 8 temp 517 non-null float64\n", + " 9 RH 517 non-null int64 \n", + " 10 wind 517 non-null float64\n", + " 11 rain 517 non-null float64\n", + " 12 area 517 non-null float64\n", + "dtypes: float64(8), int64(3), object(2)\n", + "memory usage: 52.6+ KB\n" + ] + } + ], "source": [ - "# Load data\n", - "columns = [\n", - " 'coord_x', 'coord_y', 'month', 'day', 'ffmc', 'dmc', 'dc', 'isi', 'temp', 'rh', 'wind', 'rain', 'area' \n", - "]\n", - "fires_dt = (pd.read_csv('../../05_src/data/fires/forestfires.csv', header = 0, names = columns))\n", - "fires_dt.info()\n" + "fires_dt.info()" ] }, { @@ -129,10 +201,42 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Features shape (X): (517, 12)\n", + "Target shape (y): (517,)\n", + " X Y month day FFMC DMC DC ISI temp RH wind rain\n", + "0 7 5 mar fri 86.2 26.2 94.3 5.1 8.2 51 6.7 0.0\n", + "1 7 4 oct tue 90.6 35.4 669.1 6.7 18.0 33 0.9 0.0\n", + "2 7 4 oct sat 90.6 43.7 686.9 6.7 
14.6 33 1.3 0.0\n", + "3 8 6 mar fri 91.7 33.3 77.5 9.0 8.3 97 4.0 0.2\n", + "4 8 6 mar sun 89.3 51.3 102.2 9.6 11.4 99 1.8 0.0\n", + "0 0.0\n", + "1 0.0\n", + "2 0.0\n", + "3 0.0\n", + "4 0.0\n", + "Name: area, dtype: float64\n" + ] + } + ], + "source": [ + "X = fires_dt.drop('area', axis=1) # Features: all columns except 'area'\n", + "y = fires_dt['area'] # Target: the 'area' column\n", + "\n", + "# Display the shapes of X and y to confirm\n", + "print(\"Features shape (X):\", X.shape)\n", + "print(\"Target shape (y):\", y.shape)\n", + "\n", + "# Optionally display the first few rows of X and y\n", + "print(X.head())\n", + "print(y.head())" + ] }, { "cell_type": "code", @@ -180,10 +284,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import pandas as pd\n", + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer\n", + "from sklearn.pipeline import Pipeline\n" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Numerical Columns: ['X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain']\n", + "Categorical Columns: ['month', 'day']\n" + ] + } + ], + "source": [ + "\n", + "numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()\n", + "categorical_cols = X.select_dtypes(include=['object']).columns.tolist()\n", + "\n", + "print(\"Numerical Columns:\", numerical_cols)\n", + "print(\"Categorical Columns:\", categorical_cols)\n" + ] }, { "cell_type": "markdown", @@ -202,7 +334,43 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "import numpy as np\n", + "numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()\n", + "categorical_cols = 
X.select_dtypes(include=['object']).columns.tolist()\n", + "\n", + "\n", + "num_transformer = Pipeline(steps=[\n", + " ('scaler', StandardScaler())\n", + "])\n", + "\n", + "cat_transformer = OneHotEncoder(handle_unknown='ignore')\n", + "\n", + "# preproc1: scale the numeric features and one-hot encode the categorical ones\n", + "preproc1 = ColumnTransformer(\n", + "    transformers=[\n", + "        ('num', num_transformer, numerical_cols),\n", + "        ('cat', cat_transformer, categorical_cols)\n", + "    ]\n", + ")\n", + "# Keep the earlier name bound to the same transformer\n", + "preprocessor = preproc1\n", + "\n", + "# preproc2: same as preproc1, with a non-linear log1p step applied to the numeric features before scaling\n", + "# NOTE(review): log1p needs numeric values > -1; the rows displayed above are all non-negative - confirm for the full data\n", + "num_transformer_nonlinear = Pipeline(steps=[\n", + "    ('log1p', FunctionTransformer(np.log1p, feature_names_out='one-to-one')),\n", + "    ('scaler', StandardScaler())\n", + "])\n", + "\n", + "preproc2 = ColumnTransformer(\n", + "    transformers=[\n", + "        ('num', num_transformer_nonlinear, numerical_cols),\n", + "        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)\n", + "    ]\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "# Transform the data using preproc1\n", + "X_preproc1 = preproc1.fit_transform(X)\n", + "\n", + "# Transform the data using preproc2\n", + "X_preproc2 = preproc2.fit_transform(X)\n", + "\n", + "X_preproc1_df = pd.DataFrame(X_preproc1)\n", + "X_preproc2_df = pd.DataFrame(X_preproc2)\n" + ] }, { "cell_type": "markdown", "metadata": {}, @@ -227,40 +395,76 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 51, "metadata": {}, "outputs": [], "source": [ - "# Pipeline A = preproc1 + baseline\n" + "from sklearn.ensemble import RandomForestRegressor\n", + "from sklearn.model_selection import GridSearchCV, cross_val_score\n", + "from sklearn.metrics import mean_squared_error\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ - "# Pipeline B = preproc2 + baseline\n" + "from sklearn.compose import ColumnTransformer\n", + "from sklearn.pipeline import Pipeline\n", + "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.ensemble import RandomForestRegressor" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Pipelines created successfully.\n" + ] + } + ], "source": [ - "# Pipeline C = preproc1 + advanced model\n" - ] - 
}, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Pipeline D = preproc2 + advanced model\n", "\n", - " " + "numerical_cols = X.select_dtypes(include=['float64', 'int64']).columns.tolist()\n", + "categorical_cols = X.select_dtypes(include=['object']).columns.tolist()\n", + "\n", + "# Preprocessing for numerical features\n", + "num_transformer = Pipeline(steps=[\n", + " ('scaler', StandardScaler())\n", + "])\n", + "\n", + "# Preprocessing for categorical features\n", + "cat_transformer = OneHotEncoder(handle_unknown='ignore')\n", + "\n", + "# Create the Column Transformer\n", + "preprocessor = ColumnTransformer(\n", + " transformers=[\n", + " ('num', num_transformer, numerical_cols),\n", + " ('cat', cat_transformer, categorical_cols)\n", + " ]\n", + ")\n", + "\n", + "# Create the pipeline for the baseline regressor (Linear Regression)\n", + "pipeline_lr = Pipeline(steps=[\n", + " ('preprocessing', preprocessor),\n", + " ('regressor', LinearRegression())\n", + "])\n", + "\n", + "# Create the pipeline for the advanced regressor (Random Forest Regressor)\n", + "# A fixed random_state keeps the forest, and therefore the model comparison, reproducible across runs\n", + "pipeline_rf = Pipeline(steps=[\n", + " ('preprocessing', preprocessor),\n", + " ('regressor', RandomForestRegressor(random_state=42))\n", + "])\n", + "\n", + "# Now, the pipelines are correctly set up with the preprocessor defined.\n", + "print(\"Pipelines created successfully.\")" ] }, { @@ -278,44 +482,643 @@ "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
GridSearchCV(cv=5,\n",
+       "             estimator=Pipeline(steps=[('preprocessing',\n",
+       "                                        ColumnTransformer(transformers=[('num',\n",
+       "                                                                         Pipeline(steps=[('scaler',\n",
+       "                                                                                          StandardScaler())]),\n",
+       "                                                                         ['X',\n",
+       "                                                                          'Y',\n",
+       "                                                                          'FFMC',\n",
+       "                                                                          'DMC',\n",
+       "                                                                          'DC',\n",
+       "                                                                          'ISI',\n",
+       "                                                                          'temp',\n",
+       "                                                                          'RH',\n",
+       "                                                                          'wind',\n",
+       "                                                                          'rain']),\n",
+       "                                                                        ('cat',\n",
+       "                                                                         OneHotEncoder(handle_unknown='ignore'),\n",
+       "                                                                         ['month',\n",
+       "                                                                          'day'])])),\n",
+       "                                       ('regressor', RandomForestRegressor())]),\n",
+       "             n_jobs=-1,\n",
+       "             param_grid={'regressor__max_depth': [None, 10, 20],\n",
+       "                         'regressor__min_samples_split': [2, 5],\n",
+       "                         'regressor__n_estimators': [50, 100]},\n",
+       "             scoring='neg_mean_squared_error')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "GridSearchCV(cv=5,\n", + " estimator=Pipeline(steps=[('preprocessing',\n", + " ColumnTransformer(transformers=[('num',\n", + " Pipeline(steps=[('scaler',\n", + " StandardScaler())]),\n", + " ['X',\n", + " 'Y',\n", + " 'FFMC',\n", + " 'DMC',\n", + " 'DC',\n", + " 'ISI',\n", + " 'temp',\n", + " 'RH',\n", + " 'wind',\n", + " 'rain']),\n", + " ('cat',\n", + " OneHotEncoder(handle_unknown='ignore'),\n", + " ['month',\n", + " 'day'])])),\n", + " ('regressor', RandomForestRegressor())]),\n", + " n_jobs=-1,\n", + " param_grid={'regressor__max_depth': [None, 10, 20],\n", + " 'regressor__min_samples_split': [2, 5],\n", + " 'regressor__n_estimators': [50, 100]},\n", + " scoring='neg_mean_squared_error')" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "param_grid_lr = {\n", + " 'regressor__fit_intercept': [True, False]\n", + "}\n", + "\n", + "# Define parameter \n", + "param_grid_rf = {\n", + " 'regressor__n_estimators': [50, 100],\n", + " 'regressor__max_depth': [None, 10, 20],\n", + " 'regressor__min_samples_split': [2, 5]\n", + "}\n", + "\n", + "# GridSearchCV for LR\n", + "grid_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)\n", + "grid_lr.fit(X, y)\n", + "\n", + "#GridSearchCV for Random Forest Regressor\n", + "grid_rf = GridSearchCV(pipeline_rf, param_grid_rf, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)\n", + "grid_rf.fit(X, y)\n" + ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best Linear Regression Model: Pipeline(steps=[('preprocessing',\n", + " ColumnTransformer(transformers=[('num',\n", + " Pipeline(steps=[('scaler',\n", + " StandardScaler())]),\n", + " ['X', 'Y', 'FFMC', 'DMC',\n", + " 'DC', 'ISI', 'temp', 'RH',\n", + " 'wind', 'rain']),\n", + " ('cat',\n", + " 
OneHotEncoder(handle_unknown='ignore'),\n", + " ['month', 'day'])])),\n", + " ('regressor', LinearRegression())])\n", + "Best Linear Regression MSE: 4429.800262484062\n", + "Best Random Forest Model: Pipeline(steps=[('preprocessing',\n", + " ColumnTransformer(transformers=[('num',\n", + " Pipeline(steps=[('scaler',\n", + " StandardScaler())]),\n", + " ['X', 'Y', 'FFMC', 'DMC',\n", + " 'DC', 'ISI', 'temp', 'RH',\n", + " 'wind', 'rain']),\n", + " ('cat',\n", + " OneHotEncoder(handle_unknown='ignore'),\n", + " ['month', 'day'])])),\n", + " ('regressor',\n", + " RandomForestRegressor(max_depth=10, min_samples_split=5))])\n", + "Best Random Forest MSE: 4678.288935400789\n" + ] + } + ], + "source": [ + "# Best score for Linear Regression\n", + "best_lr = grid_lr.best_estimator_\n", + "best_lr_score = -grid_lr.best_score_ \n", + "\n", + "# Best estimator and the best score for Random Forest Regressor\n", + "best_rf = grid_rf.best_estimator_\n", + "best_rf_score = -grid_rf.best_score_ \n", + "# Convert from negative MSE\n", + "\n", + "print(\"Best Linear Regression Model:\", best_lr)\n", + "print(\"Best Linear Regression MSE:\", best_lr_score)\n", + "\n", + "print(\"Best Random Forest Model:\", best_rf)\n", + "print(\"Best Random Forest MSE:\", best_rf_score)\n" + ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], - "source": [] + "source": [ + "# Evaluate\n", + "\n", + "+ Which model has the best performance?" 
+ ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Linear Regression RMSE: 66.55674468064122\n", + "Random Forest RMSE: 69.6765818398016\n", + "Linear Regression has the best performance.\n" + ] + } + ], + "source": [ + "\n", + "# Linear Regression pipeline with cross-validation\n", + "scores_lr = cross_val_score(pipeline_lr, X, y, cv=5, scoring='neg_mean_squared_error')\n", + "mse_lr = -scores_lr.mean()\n", + "rmse_lr = np.sqrt(mse_lr)\n", + "\n", + "# Random Forest pipeline with cross-validation\n", + "scores_rf = cross_val_score(pipeline_rf, X, y, cv=5, scoring='neg_mean_squared_error')\n", + "mse_rf = -scores_rf.mean()\n", + "rmse_rf = np.sqrt(mse_rf)\n", + "\n", + "# Print RMSE for comparison\n", + "print(\"Linear Regression RMSE:\", rmse_lr)\n", + "print(\"Random Forest RMSE:\", rmse_rf)\n", + "\n", + "# RMSE\n", + "if rmse_lr < rmse_rf:\n", + " print(\"Linear Regression has the best performance.\")\n", + "else:\n", + " print(\"Random Forest has the best performance.\")\n" + ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Evaluate\n", - "\n", - "+ Which model has the best performance?" + "cross_val_score is used with neg_mean_squared_error to perform cross-validation, where the negative sign is reversed to get a positive MSE.\n", + "The square root of MSE (rmse_lr and rmse_rf) gives the RMSE, making it easier to interpret in the same units as the target variable.\n", + "The model with the lower RMSE value has the better performance." 
] }, { @@ -329,10 +1132,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 57, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best performing model saved to 'best_model.pkl'.\n" + ] + } + ], + "source": [ + "import pickle\n", + "\n", + "# Pick the tuned pipeline with the lower cross-validated MSE from the grid search instead of hard-coding one\n", + "best_model = best_lr if best_lr_score <= best_rf_score else best_rf\n", + "\n", + "# Fit the best model to the entire dataset\n", + "best_model.fit(X, y)\n", + "\n", + "# Save the model to a pickle file\n", + "with open('best_model.pkl', 'wb') as file:\n", + " pickle.dump(best_model, file)\n", + "\n", + "print(\"Best performing model saved to 'best_model.pkl'.\")\n" + ] }, { "cell_type": "code", @@ -358,17 +1183,645 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting shap\n", + " Downloading shap-0.46.0-cp310-cp310-win_amd64.whl.metadata (25 kB)\n", + "Requirement already satisfied: numpy in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from shap) (2.1.2)\n", + "Requirement already satisfied: scipy in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from shap) (1.14.1)\n", + "Requirement already satisfied: scikit-learn in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from shap) (1.5.2)\n", + "Requirement already satisfied: pandas in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from shap) (2.2.3)\n", + "Collecting tqdm>=4.27.0 (from shap)\n", + " Downloading tqdm-4.66.6-py3-none-any.whl.metadata (57 kB)\n", + "Requirement already satisfied: packaging>20.9 in 
c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from shap) (24.1)\n", + "Collecting slicer==0.0.8 (from shap)\n", + " Downloading slicer-0.0.8-py3-none-any.whl.metadata (4.0 kB)\n", + "Collecting numba (from shap)\n", + " Downloading numba-0.60.0-cp310-cp310-win_amd64.whl.metadata (2.8 kB)\n", + "Requirement already satisfied: cloudpickle in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from shap) (3.1.0)\n", + "Requirement already satisfied: colorama in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from tqdm>=4.27.0->shap) (0.4.6)\n", + "Collecting llvmlite<0.44,>=0.43.0dev0 (from numba->shap)\n", + " Downloading llvmlite-0.43.0-cp310-cp310-win_amd64.whl.metadata (4.9 kB)\n", + "Collecting numpy (from shap)\n", + " Downloading numpy-2.0.2-cp310-cp310-win_amd64.whl.metadata (59 kB)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from pandas->shap) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from pandas->shap) (2024.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from pandas->shap) (2024.2)\n", + "Requirement already satisfied: joblib>=1.2.0 in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from scikit-learn->shap) (1.4.2)\n", + "Requirement already satisfied: threadpoolctl>=3.1.0 in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from scikit-learn->shap) (3.5.0)\n", + "Requirement already satisfied: six>=1.5 in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from python-dateutil>=2.8.2->pandas->shap) (1.16.0)\n", + "Downloading 
shap-0.46.0-cp310-cp310-win_amd64.whl (456 kB)\n", + "Downloading slicer-0.0.8-py3-none-any.whl (15 kB)\n", + "Downloading tqdm-4.66.6-py3-none-any.whl (78 kB)\n", + "Downloading numba-0.60.0-cp310-cp310-win_amd64.whl (2.7 MB)\n", + " ---------------------------------------- 2.7/2.7 MB 22.2 MB/s eta 0:00:00\n", + "Downloading numpy-2.0.2-cp310-cp310-win_amd64.whl (15.9 MB)\n", + " ---------------------------------------- 15.9/15.9 MB 33.4 MB/s eta 0:00:00\n", + "Downloading llvmlite-0.43.0-cp310-cp310-win_amd64.whl (28.1 MB)\n", + " ---------------------------------------- 28.1/28.1 MB 34.3 MB/s eta 0:00:00\n", + "Installing collected packages: tqdm, slicer, numpy, llvmlite, numba, shap\n", + " Attempting uninstall: numpy\n", + " Found existing installation: numpy 2.1.2\n", + " Uninstalling numpy-2.1.2:\n", + " Successfully uninstalled numpy-2.1.2\n", + "Successfully installed llvmlite-0.43.0 numba-0.60.0 numpy-2.0.2 shap-0.46.0 slicer-0.0.8 tqdm-4.66.6\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: Ignoring invalid distribution -andas (c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages)\n", + "WARNING: Error parsing dependencies of bleach: Expected matching RIGHT_PARENTHESIS for LEFT_PARENTHESIS, after version specifier\n", + " tinycss2 (>=1.1.0<1.2) ; extra == 'css'\n", + " ~~~~~~~~^\n", + "WARNING: Ignoring invalid distribution -andas (c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages)\n", + " WARNING: Failed to remove contents in a temporary directory 'C:\\Users\\ibast\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\~~mpy.libs'.\n", + " You can safely remove it manually.\n", + " WARNING: Failed to remove contents in a temporary directory 'C:\\Users\\ibast\\AppData\\Local\\Programs\\Python\\Python310\\Lib\\site-packages\\~.mpy'.\n", + " You can safely remove it manually.\n", + "WARNING: Ignoring invalid distribution -andas 
(c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages)\n", + "\n", + "[notice] A new release of pip is available: 24.2 -> 24.3.1\n", + "[notice] To update, run: python.exe -m pip install --upgrade pip\n" + ] + } + ], + "source": [ + "!pip install shap" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 58, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
Pipeline(steps=[('preprocessing',\n",
+       "                 ColumnTransformer(transformers=[('num',\n",
+       "                                                  Pipeline(steps=[('scaler',\n",
+       "                                                                   StandardScaler())]),\n",
+       "                                                  ['X', 'Y', 'FFMC', 'DMC',\n",
+       "                                                   'DC', 'ISI', 'temp', 'RH',\n",
+       "                                                   'wind', 'rain']),\n",
+       "                                                 ('cat',\n",
+       "                                                  OneHotEncoder(handle_unknown='ignore'),\n",
+       "                                                  ['month', 'day'])])),\n",
+       "                ('regressor', RandomForestRegressor())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "Pipeline(steps=[('preprocessing',\n", + " ColumnTransformer(transformers=[('num',\n", + " Pipeline(steps=[('scaler',\n", + " StandardScaler())]),\n", + " ['X', 'Y', 'FFMC', 'DMC',\n", + " 'DC', 'ISI', 'temp', 'RH',\n", + " 'wind', 'rain']),\n", + " ('cat',\n", + " OneHotEncoder(handle_unknown='ignore'),\n", + " ['month', 'day'])])),\n", + " ('regressor', RandomForestRegressor())])" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "# Split the data\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n", + "\n", + "# Re-fit the best model on the training data\n", + "best_model.fit(X_train, y_train)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "^C\n" + ] + } + ], + "source": [ + "!pip uninstall numpy\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: numpy==2.0.0 in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (2.0.0)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: Ignoring invalid distribution -andas (c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages)\n", + "WARNING: Error parsing dependencies of bleach: Expected matching RIGHT_PARENTHESIS for LEFT_PARENTHESIS, after version specifier\n", + " tinycss2 (>=1.1.0<1.2) ; extra == 'css'\n", + " ~~~~~~~~^\n", + "WARNING: Ignoring invalid distribution -andas (c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages)\n", + "WARNING: Ignoring invalid distribution -andas 
(c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages)\n", + "\n", + "[notice] A new release of pip is available: 24.2 -> 24.3.1\n", + "[notice] To update, run: python.exe -m pip install --upgrade pip\n" + ] + } + ], + "source": [ + "!pip install numpy==2.0.0\n" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: shap in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (0.46.0)\n", + "Requirement already satisfied: numpy in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from shap) (2.0.0)\n", + "Requirement already satisfied: scipy in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from shap) (1.14.1)\n", + "Requirement already satisfied: scikit-learn in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from shap) (1.5.2)\n", + "Requirement already satisfied: pandas in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from shap) (2.2.3)\n", + "Requirement already satisfied: tqdm>=4.27.0 in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from shap) (4.66.6)\n", + "Requirement already satisfied: packaging>20.9 in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from shap) (24.1)\n", + "Requirement already satisfied: slicer==0.0.8 in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from shap) (0.0.8)\n", + "Requirement already satisfied: numba in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from shap) (0.60.0)\n", + "Requirement already satisfied: cloudpickle in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from shap) (3.1.0)\n", + "Requirement already 
satisfied: colorama in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from tqdm>=4.27.0->shap) (0.4.6)\n", + "Requirement already satisfied: llvmlite<0.44,>=0.43.0dev0 in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from numba->shap) (0.43.0)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from pandas->shap) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from pandas->shap) (2024.2)\n", + "Requirement already satisfied: tzdata>=2022.7 in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from pandas->shap) (2024.2)\n", + "Requirement already satisfied: joblib>=1.2.0 in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from scikit-learn->shap) (1.4.2)\n", + "Requirement already satisfied: threadpoolctl>=3.1.0 in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from scikit-learn->shap) (3.5.0)\n", + "Requirement already satisfied: six>=1.5 in c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages (from python-dateutil>=2.8.2->pandas->shap) (1.16.0)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: Ignoring invalid distribution -andas (c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages)\n", + "WARNING: Error parsing dependencies of bleach: Expected matching RIGHT_PARENTHESIS for LEFT_PARENTHESIS, after version specifier\n", + " tinycss2 (>=1.1.0<1.2) ; extra == 'css'\n", + " ~~~~~~~~^\n", + "WARNING: Ignoring invalid distribution -andas (c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages)\n", + "WARNING: Ignoring invalid distribution -andas 
(c:\\users\\ibast\\appdata\\local\\programs\\python\\python310\\lib\\site-packages)\n", + "\n", + "[notice] A new release of pip is available: 24.2 -> 24.3.1\n", + "[notice] To update, run: python.exe -m pip install --upgrade pip\n" + ] + } + ], + "source": [ + "!pip install --upgrade shap\n" + ] }, { "cell_type": "markdown", @@ -377,6 +1830,21 @@ "*(Answer here.)*" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import shap\n", + "\n", + "# The regressor was trained on the output of the preprocessing step, so SHAP must be given\n", + "# the same encoded features rather than the raw frame (which still holds string columns)\n", + "preprocessing = best_model['preprocessing']\n", + "feature_names = preprocessing.get_feature_names_out()\n", + "\n", + "def encode_features(frame):\n", + "    # Apply the fitted preprocessing step and return a dense, labelled DataFrame\n", + "    encoded = preprocessing.transform(frame)\n", + "    if hasattr(encoded, 'toarray'):  # densify if the transformer returned a sparse matrix\n", + "        encoded = encoded.toarray()\n", + "    return pd.DataFrame(encoded, columns=feature_names, index=frame.index)\n", + "\n", + "X_train_encoded = encode_features(X_train)\n", + "X_test_encoded = encode_features(X_test)\n", + "\n", + "# Create a SHAP explainer based on the best model, using the encoded training data as background\n", + "explainer = shap.Explainer(best_model['regressor'], X_train_encoded)\n", + "\n", + "# Calculate SHAP values for every observation in X_test\n", + "shap_values = explainer(X_test_encoded)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -423,7 +1891,7 @@ ], "metadata": { "kernelspec": { - "display_name": "env", + "display_name": "Python 3", "language": "python", "name": "python3" }, @@ -437,7 +1905,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.3" + "version": "3.10.8" } }, "nbformat": 4, diff --git a/02_activities/assignments/best_model.pkl b/02_activities/assignments/best_model.pkl new file mode 100644 index 000000000..2b667ce86 Binary files /dev/null and b/02_activities/assignments/best_model.pkl differ