Skip to content

Commit

Permalink
Perform 5-fold cross validation
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffey97 committed Nov 10, 2024
1 parent 0633db5 commit 7900d4b
Showing 1 changed file with 105 additions and 13 deletions.
118 changes: 105 additions & 13 deletions 02_activities/assignments/assignment_2.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -57,7 +57,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 42,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -84,14 +84,14 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 43,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[92mThe random state is used to control the randomness of the data splitting process.\u001b[0m\n",
"\u001b[92mRandom state is used to control the randomness of the data splitting process.\u001b[0m\n",
"\u001b[92mBy specifying a random state, we are setting a seed for the random number generator, ensuring that the train-test split is reproducible every time we run our code.\n",
"\u001b[0m\n",
"\u001b[1m\u001b[92mWhy is it Useful?\u001b[0m\n",
Expand Down Expand Up @@ -151,7 +151,7 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 44,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -214,7 +214,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -223,7 +223,7 @@
"# Creating the model pipeline\n",
"model_pipeline = Pipeline(steps=[\n",
" ('preprocessing', preprocessor), # The ColumnTransformer from the previous step\n",
" ('classifier', RandomForestClassifier()) # The classifier\n",
" ('classifier', RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)) # The classifier\n",
"])"
]
},
Expand All @@ -242,10 +242,64 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 47,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"c:\\Users\\JEFFEY MARKUS\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [7] during transform. These unknown categories will be encoded as all zeros\n",
" warnings.warn(\n",
"c:\\Users\\JEFFEY MARKUS\\AppData\\Local\\Programs\\Python\\Python312\\Lib\\site-packages\\sklearn\\preprocessing\\_encoders.py:242: UserWarning: Found unknown categories in columns [7] during transform. These unknown categories will be encoded as all zeros\n",
" warnings.warn(\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"Cross-Validation Results:\n",
"neg_log_loss - Training: -0.3118 (+/- 0.0017), Validation: -0.3281 (+/- 0.0030)\n",
"roc_auc - Training: 0.9254 (+/- 0.0008), Validation: 0.9106 (+/- 0.0034)\n",
"accuracy - Training: 0.8653 (+/- 0.0012), Validation: 0.8568 (+/- 0.0043)\n",
"balanced_accuracy - Training: 0.7630 (+/- 0.0023), Validation: 0.7507 (+/- 0.0076)\n"
]
}
],
"source": [
"from sklearn.model_selection import cross_validate\n",
"import numpy as np\n",
"\n",
"# Defining the performance metrics.\n",
"# Scorers are referenced by sklearn's built-in string names, so no\n",
"# make_scorer / metric-function imports are needed here.\n",
"scoring = {\n",
"    'neg_log_loss': 'neg_log_loss',\n",
"    'roc_auc': 'roc_auc',\n",
"    'accuracy': 'accuracy',\n",
"    'balanced_accuracy': 'balanced_accuracy'\n",
"}\n",
"\n",
"# Evaluating the model pipeline using 5-fold cross-validation.\n",
"# return_train_score=True so we can compare train vs. validation\n",
"# performance and spot overfitting.\n",
"# NOTE(review): this runs CV on X/Y as provided; if a held-out test\n",
"# split exists, CV should use only the training portion -- confirm.\n",
"cv_results = cross_validate(\n",
"    model_pipeline, \n",
"    X, \n",
"    Y, \n",
"    cv=5, \n",
"    scoring=scoring, \n",
"    return_train_score=True\n",
")\n",
"\n",
"# Reporting mean +/- std of each metric across the 5 folds, for both\n",
"# the training folds and the held-out validation folds.\n",
"print(\"\\nCross-Validation Results:\")\n",
"for metric in scoring.keys():\n",
"    train_mean = np.mean(cv_results[f'train_{metric}'])\n",
"    train_std = np.std(cv_results[f'train_{metric}'])\n",
"    test_mean = np.mean(cv_results[f'test_{metric}'])\n",
"    test_std = np.std(cv_results[f'test_{metric}'])\n",
"    print(f\"{metric} - Training: {train_mean:.4f} (+/- {train_std:.4f}), Validation: {test_mean:.4f} (+/- {test_std:.4f})\")\n"
]
},
{
"cell_type": "markdown",
Expand All @@ -256,10 +310,48 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Sorted Cross-Validation Results by Test Negative Log Loss:\n",
"\n",
" fit_time score_time test_neg_log_loss train_neg_log_loss test_roc_auc \\\n",
"1 3.270346 0.146423 -0.331249 -0.309713 0.906679 \n",
"0 3.128752 0.165946 -0.330795 -0.311253 0.907006 \n",
"2 3.231312 0.153061 -0.329205 -0.313765 0.910617 \n",
"4 2.981757 0.148714 -0.325861 -0.313799 0.914390 \n",
"3 2.921376 0.148241 -0.323481 -0.310539 0.914494 \n",
"\n",
" train_roc_auc test_accuracy train_accuracy test_balanced_accuracy \\\n",
"1 0.926736 0.855651 0.865331 0.747288 \n",
"0 0.925341 0.848918 0.865095 0.737764 \n",
"2 0.924791 0.858722 0.866790 0.755843 \n",
"4 0.924358 0.859951 0.866060 0.759700 \n",
"3 0.925640 0.860872 0.863334 0.752904 \n",
"\n",
" train_balanced_accuracy \n",
"1 0.764309 \n",
"0 0.763306 \n",
"2 0.764725 \n",
"4 0.764245 \n",
"3 0.758586 \n"
]
}
],
"source": [
"# Collect the per-fold cross-validation results into a DataFrame\n",
"# (one row per fold; columns include fit/score times and each metric\n",
"# for both the training and validation folds).\n",
"cv_results_df = pd.DataFrame(cv_results)\n",
"\n",
"# Sorting the DataFrame by 'test_neg_log_loss'\n",
"# NOTE: neg_log_loss is higher-is-better (closer to 0), so the default\n",
"# ascending sort lists the worst-performing fold first.\n",
"sorted_cv_results = cv_results_df.sort_values(by='test_neg_log_loss')\n",
"\n",
"# Displaying the sorted DataFrame\n",
"print(\"Sorted Cross-Validation Results by Test Negative Log Loss:\\n\")\n",
"print(sorted_cv_results)\n"
]
},
{
"cell_type": "markdown",
Expand Down

0 comments on commit 7900d4b

Please sign in to comment.