Skip to content

Commit

Permalink
Perform 5-fold cross validation
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffey97 committed Nov 10, 2024
1 parent 0633db5 commit 7900d4b
Showing 1 changed file with 105 additions and 13 deletions.
118 changes: 105 additions & 13 deletions 02_activities/assignments/assignment_2.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -57,7 +57,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -84,14 +84,14 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[92mThe random state is used to control the randomness of the data splitting process.\u001b[0m\n",
"\u001b[92mRandom state is used to control the randomness of the data splitting process.\u001b[0m\n",
"\u001b[92mBy specifying a random state, we are setting a seed for the random number generator, ensuring that the train-test split is reproducible every time we run our code.\n",
"\u001b[0m\n",
"\u001b[1m\u001b[92mWhy is it Useful?\u001b[0m\n",
Expand Down Expand Up @@ -151,7 +151,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -214,7 +214,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -223,7 +223,7 @@
"# Creating the model pipeline\n",
"model_pipeline = Pipeline(steps=[\n",
" ('preprocessing', preprocessor), # The ColumnTransformer from the previous step\n",
" ('classifier', RandomForestClassifier()) # The classifier\n",
" ('classifier', RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)) # The classifier\n",
"])"
]
},
Expand All @@ -242,10 +242,64 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\JEFFEY MARKUS\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [7] during transform. These unknown categories will be encoded as all zeros\n",
" warnings.warn(\n",
"c:\\Users\\JEFFEY MARKUS\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [7] during transform. These unknown categories will be encoded as all zeros\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Cross-Validation Results:\n",
"neg_log_loss - Training: -0.3118 (+/- 0.0017), Validation: -0.3281 (+/- 0.0030)\n",
"roc_auc - Training: 0.9254 (+/- 0.0008), Validation: 0.9106 (+/- 0.0034)\n",
"accuracy - Training: 0.8653 (+/- 0.0012), Validation: 0.8568 (+/- 0.0043)\n",
"balanced_accuracy - Training: 0.7630 (+/- 0.0023), Validation: 0.7507 (+/- 0.0076)\n"
]
}
],
"source": [
"from sklearn.model_selection import cross_validate\n",
"import numpy as np\n",
"\n",
"# Defining the performance metrics.\n",
"# Scorers are referenced by sklearn's built-in string names, so no\n",
"# make_scorer / metric-function imports are needed here.\n",
"scoring = {\n",
"    'neg_log_loss': 'neg_log_loss',\n",
"    'roc_auc': 'roc_auc',\n",
"    'accuracy': 'accuracy',\n",
"    'balanced_accuracy': 'balanced_accuracy'\n",
"}\n",
"\n",
"# Evaluating the model pipeline using 5-fold cross-validation.\n",
"# return_train_score=True so we can compare train vs. validation\n",
"# performance and spot overfitting.\n",
"# NOTE(review): this runs CV on X/Y as provided; if a held-out test\n",
"# split exists, CV should use only the training portion -- confirm.\n",
"cv_results = cross_validate(\n",
"    model_pipeline, \n",
"    X, \n",
"    Y, \n",
"    cv=5, \n",
"    scoring=scoring, \n",
"    return_train_score=True\n",
")\n",
"\n",
"# Reporting mean +/- std of each metric across the 5 folds, for both\n",
"# the training folds and the held-out validation folds.\n",
"print(\"\\nCross-Validation Results:\")\n",
"for metric in scoring.keys():\n",
"    train_mean = np.mean(cv_results[f'train_{metric}'])\n",
"    train_std = np.std(cv_results[f'train_{metric}'])\n",
"    test_mean = np.mean(cv_results[f'test_{metric}'])\n",
"    test_std = np.std(cv_results[f'test_{metric}'])\n",
"    print(f\"{metric} - Training: {train_mean:.4f} (+/- {train_std:.4f}), Validation: {test_mean:.4f} (+/- {test_std:.4f})\")\n"
]
},
{
"cell_type": "markdown",
Expand All @@ -256,10 +310,48 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sorted Cross-Validation Results by Test Negative Log Loss:\n",
"\n",
" fit_time score_time test_neg_log_loss train_neg_log_loss test_roc_auc \\\n",
"1 3.270346 0.146423 -0.331249 -0.309713 0.906679 \n",
"0 3.128752 0.165946 -0.330795 -0.311253 0.907006 \n",
"2 3.231312 0.153061 -0.329205 -0.313765 0.910617 \n",
"4 2.981757 0.148714 -0.325861 -0.313799 0.914390 \n",
"3 2.921376 0.148241 -0.323481 -0.310539 0.914494 \n",
"\n",
" train_roc_auc test_accuracy train_accuracy test_balanced_accuracy \\\n",
"1 0.926736 0.855651 0.865331 0.747288 \n",
"0 0.925341 0.848918 0.865095 0.737764 \n",
"2 0.924791 0.858722 0.866790 0.755843 \n",
"4 0.924358 0.859951 0.866060 0.759700 \n",
"3 0.925640 0.860872 0.863334 0.752904 \n",
"\n",
" train_balanced_accuracy \n",
"1 0.764309 \n",
"0 0.763306 \n",
"2 0.764725 \n",
"4 0.764245 \n",
"3 0.758586 \n"
]
}
],
"source": [
"# Collect the per-fold cross-validation results into a DataFrame\n",
"# (one row per fold; columns include fit/score times and each metric\n",
"# for both the training and validation folds).\n",
"cv_results_df = pd.DataFrame(cv_results)\n",
"\n",
"# Sorting the DataFrame by 'test_neg_log_loss'\n",
"# NOTE: neg_log_loss is higher-is-better (closer to 0), so the default\n",
"# ascending sort lists the worst-performing fold first.\n",
"sorted_cv_results = cv_results_df.sort_values(by='test_neg_log_loss')\n",
"\n",
"# Displaying the sorted DataFrame\n",
"print(\"Sorted Cross-Validation Results by Test Negative Log Loss:\\n\")\n",
"print(sorted_cv_results)\n"
]
},
{
"cell_type": "markdown",
Expand Down

0 comments on commit 7900d4b

Please sign in to comment.