From 31c9b93a7467b786ab2570bb18f401af0b63b457 Mon Sep 17 00:00:00 2001
From: juliagallucci <78810440+juliagallucci@users.noreply.github.com>
Date: Thu, 26 Sep 2024 10:51:51 -0400
Subject: [PATCH 1/5] Add files via upload
assignment 1 and 2 answer keys
---
.../assignment_1_answers.ipynb | 802 +++++++++++++++++
.../assignment_2_answers.ipynb | 815 ++++++++++++++++++
2 files changed, 1617 insertions(+)
create mode 100644 03_instructional_team/assignment_1_answers.ipynb
create mode 100644 03_instructional_team/assignment_2_answers.ipynb
diff --git a/03_instructional_team/assignment_1_answers.ipynb b/03_instructional_team/assignment_1_answers.ipynb
new file mode 100644
index 00000000..9c3cdc21
--- /dev/null
+++ b/03_instructional_team/assignment_1_answers.ipynb
@@ -0,0 +1,802 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "7b0bcac6-5086-4f4e-928a-570a9ff7ae58",
+ "metadata": {},
+ "source": [
+ "# Assignment 1"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5fce0350-2a17-4e93-8d4c-0b8748fdfc32",
+ "metadata": {},
+ "source": [
+ "You only need to write one line of code for each question. When answering questions that ask you to identify or interpret something, the length of your response doesn’t matter. For example, if the answer is just ‘yes,’ ‘no,’ or a number, you can just give that answer without adding anything else.\n",
+ "\n",
+ "We will go through comparable code and concepts in the live learning session. If you run into trouble, start by using the `help()` function in Python to get information about the datasets and functions in question. The internet is also a great resource when coding (though note that **no outside searches are required by the assignment!**). If you do incorporate code from the internet, please cite the source within your code (providing a URL is sufficient).\n",
+ "\n",
+ "Please bring questions that you cannot work out on your own to office hours, work periods or share with your peers on Slack. We will work with you through the issue."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5fc5001c-7715-4ebe-b0f7-e4bd04349629",
+ "metadata": {},
+ "source": [
+ "### Classification using KNN\n",
+ "\n",
+ "Let's set up our workspace and use the **Wine dataset** from `scikit-learn`. This dataset contains 178 wine samples with 13 chemical features, used to classify wines into different classes based on their origin.\n",
+ "\n",
+ "The **response variable** is `class`, which indicates the type of wine. We'll use all of the chemical features to predict this response variable."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "4a3485d6-ba58-4660-a983-5680821c5719",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Import standard libraries\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import random\n",
+ "import matplotlib.pyplot as plt\n",
+ "import matplotlib.colors as mcolors\n",
+ "from sklearn.preprocessing import StandardScaler\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.neighbors import KNeighborsClassifier\n",
+ "from sklearn.metrics import recall_score, precision_score\n",
+ "from sklearn.model_selection import cross_validate\n",
+ "from sklearn.model_selection import GridSearchCV\n",
+ "from sklearn.metrics import accuracy_score"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
alcohol
\n",
+ "
malic_acid
\n",
+ "
ash
\n",
+ "
alcalinity_of_ash
\n",
+ "
magnesium
\n",
+ "
total_phenols
\n",
+ "
flavanoids
\n",
+ "
nonflavanoid_phenols
\n",
+ "
proanthocyanins
\n",
+ "
color_intensity
\n",
+ "
hue
\n",
+ "
od280/od315_of_diluted_wines
\n",
+ "
proline
\n",
+ "
class
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
14.23
\n",
+ "
1.71
\n",
+ "
2.43
\n",
+ "
15.6
\n",
+ "
127.0
\n",
+ "
2.80
\n",
+ "
3.06
\n",
+ "
0.28
\n",
+ "
2.29
\n",
+ "
5.64
\n",
+ "
1.04
\n",
+ "
3.92
\n",
+ "
1065.0
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
13.20
\n",
+ "
1.78
\n",
+ "
2.14
\n",
+ "
11.2
\n",
+ "
100.0
\n",
+ "
2.65
\n",
+ "
2.76
\n",
+ "
0.26
\n",
+ "
1.28
\n",
+ "
4.38
\n",
+ "
1.05
\n",
+ "
3.40
\n",
+ "
1050.0
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
13.16
\n",
+ "
2.36
\n",
+ "
2.67
\n",
+ "
18.6
\n",
+ "
101.0
\n",
+ "
2.80
\n",
+ "
3.24
\n",
+ "
0.30
\n",
+ "
2.81
\n",
+ "
5.68
\n",
+ "
1.03
\n",
+ "
3.17
\n",
+ "
1185.0
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
14.37
\n",
+ "
1.95
\n",
+ "
2.50
\n",
+ "
16.8
\n",
+ "
113.0
\n",
+ "
3.85
\n",
+ "
3.49
\n",
+ "
0.24
\n",
+ "
2.18
\n",
+ "
7.80
\n",
+ "
0.86
\n",
+ "
3.45
\n",
+ "
1480.0
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
4
\n",
+ "
13.24
\n",
+ "
2.59
\n",
+ "
2.87
\n",
+ "
21.0
\n",
+ "
118.0
\n",
+ "
2.80
\n",
+ "
2.69
\n",
+ "
0.39
\n",
+ "
1.82
\n",
+ "
4.32
\n",
+ "
1.04
\n",
+ "
2.93
\n",
+ "
735.0
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
\n",
+ "
\n",
+ "
173
\n",
+ "
13.71
\n",
+ "
5.65
\n",
+ "
2.45
\n",
+ "
20.5
\n",
+ "
95.0
\n",
+ "
1.68
\n",
+ "
0.61
\n",
+ "
0.52
\n",
+ "
1.06
\n",
+ "
7.70
\n",
+ "
0.64
\n",
+ "
1.74
\n",
+ "
740.0
\n",
+ "
2
\n",
+ "
\n",
+ "
\n",
+ "
174
\n",
+ "
13.40
\n",
+ "
3.91
\n",
+ "
2.48
\n",
+ "
23.0
\n",
+ "
102.0
\n",
+ "
1.80
\n",
+ "
0.75
\n",
+ "
0.43
\n",
+ "
1.41
\n",
+ "
7.30
\n",
+ "
0.70
\n",
+ "
1.56
\n",
+ "
750.0
\n",
+ "
2
\n",
+ "
\n",
+ "
\n",
+ "
175
\n",
+ "
13.27
\n",
+ "
4.28
\n",
+ "
2.26
\n",
+ "
20.0
\n",
+ "
120.0
\n",
+ "
1.59
\n",
+ "
0.69
\n",
+ "
0.43
\n",
+ "
1.35
\n",
+ "
10.20
\n",
+ "
0.59
\n",
+ "
1.56
\n",
+ "
835.0
\n",
+ "
2
\n",
+ "
\n",
+ "
\n",
+ "
176
\n",
+ "
13.17
\n",
+ "
2.59
\n",
+ "
2.37
\n",
+ "
20.0
\n",
+ "
120.0
\n",
+ "
1.65
\n",
+ "
0.68
\n",
+ "
0.53
\n",
+ "
1.46
\n",
+ "
9.30
\n",
+ "
0.60
\n",
+ "
1.62
\n",
+ "
840.0
\n",
+ "
2
\n",
+ "
\n",
+ "
\n",
+ "
177
\n",
+ "
14.13
\n",
+ "
4.10
\n",
+ "
2.74
\n",
+ "
24.5
\n",
+ "
96.0
\n",
+ "
2.05
\n",
+ "
0.76
\n",
+ "
0.56
\n",
+ "
1.35
\n",
+ "
9.20
\n",
+ "
0.61
\n",
+ "
1.60
\n",
+ "
560.0
\n",
+ "
2
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
178 rows × 14 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n",
+ "0 14.23 1.71 2.43 15.6 127.0 2.80 \n",
+ "1 13.20 1.78 2.14 11.2 100.0 2.65 \n",
+ "2 13.16 2.36 2.67 18.6 101.0 2.80 \n",
+ "3 14.37 1.95 2.50 16.8 113.0 3.85 \n",
+ "4 13.24 2.59 2.87 21.0 118.0 2.80 \n",
+ ".. ... ... ... ... ... ... \n",
+ "173 13.71 5.65 2.45 20.5 95.0 1.68 \n",
+ "174 13.40 3.91 2.48 23.0 102.0 1.80 \n",
+ "175 13.27 4.28 2.26 20.0 120.0 1.59 \n",
+ "176 13.17 2.59 2.37 20.0 120.0 1.65 \n",
+ "177 14.13 4.10 2.74 24.5 96.0 2.05 \n",
+ "\n",
+ " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n",
+ "0 3.06 0.28 2.29 5.64 1.04 \n",
+ "1 2.76 0.26 1.28 4.38 1.05 \n",
+ "2 3.24 0.30 2.81 5.68 1.03 \n",
+ "3 3.49 0.24 2.18 7.80 0.86 \n",
+ "4 2.69 0.39 1.82 4.32 1.04 \n",
+ ".. ... ... ... ... ... \n",
+ "173 0.61 0.52 1.06 7.70 0.64 \n",
+ "174 0.75 0.43 1.41 7.30 0.70 \n",
+ "175 0.69 0.43 1.35 10.20 0.59 \n",
+ "176 0.68 0.53 1.46 9.30 0.60 \n",
+ "177 0.76 0.56 1.35 9.20 0.61 \n",
+ "\n",
+ " od280/od315_of_diluted_wines proline class \n",
+ "0 3.92 1065.0 0 \n",
+ "1 3.40 1050.0 0 \n",
+ "2 3.17 1185.0 0 \n",
+ "3 3.45 1480.0 0 \n",
+ "4 2.93 735.0 0 \n",
+ ".. ... ... ... \n",
+ "173 1.74 740.0 2 \n",
+ "174 1.56 750.0 2 \n",
+ "175 1.56 835.0 2 \n",
+ "176 1.62 840.0 2 \n",
+ "177 1.60 560.0 2 \n",
+ "\n",
+ "[178 rows x 14 columns]"
+ ]
+ },
+ "execution_count": 2,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.datasets import load_wine\n",
+ "\n",
+ "# Load the Wine dataset\n",
+ "wine_data = load_wine()\n",
+ "\n",
+ "# Convert to DataFrame\n",
+ "wine_df = pd.DataFrame(wine_data.data, columns=wine_data.feature_names)\n",
+ "\n",
+ "# Bind the 'class' (wine target) to the DataFrame\n",
+ "wine_df['class'] = wine_data.target\n",
+ "\n",
+ "# Display the DataFrame\n",
+ "wine_df\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "721b2b17",
+ "metadata": {},
+ "source": [
+ "#### **Question 1:** \n",
+ "#### Data inspection\n",
+ "\n",
+ "Before fitting any model, it is essential to understand our data. **Use Python code** to answer the following questions about the **Wine dataset**:\n",
+ "\n",
+ "_(i)_ How many observations (rows) does the dataset contain?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "56916892",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of observations: 178\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Number of observations (rows)\n",
+ "num_observations = wine_df.shape[0]\n",
+ "print(f\"Number of observations: {num_observations}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f7573b59",
+ "metadata": {},
+ "source": [
+ "_(ii)_ How many variables (columns) does the dataset contain?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "df0ef103",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of variables: 14\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Number of variables (columns)\n",
+ "num_variables = wine_df.shape[1]\n",
+ "print(f\"Number of variables: {num_variables}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cb5180c7",
+ "metadata": {},
+ "source": [
+ "_(iii)_ What is the 'variable type' of the response variable `class` (e.g., 'integer', 'category', etc.)? What are the 'levels' (unique values) of the variable?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "47989426",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Variable type of 'class': int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Variable type of the response variable 'class'\n",
+ "response_variable_type = wine_df['class'].dtype\n",
+ "print(f\"Variable type of 'class': {response_variable_type}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a25f5e1b",
+ "metadata": {},
+ "source": [
+ "\n",
+ "_(iv)_ How many predictor variables do we have (Hint: all variables other than `class`)? "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "bd7b0910",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Levels of 'class': [0 1 2]\n",
+ "Number of predictor variables: 13\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Levels (unique values) of the response variable 'class' (second half of part iii)\n",
+ "response_variable_levels = wine_df['class'].unique()\n",
+ "print(f\"Levels of 'class': {response_variable_levels}\")\n",
+ "\n",
+ "# Part (iv): every column other than the response 'class' is a predictor\n",
+ "num_predictors = wine_df.drop(columns=['class']).shape[1]\n",
+ "print(f\"Number of predictor variables: {num_predictors}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d631e8e3",
+ "metadata": {},
+ "source": [
+ "You can use `print()` and `describe()` to help answer these questions."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fa3832d7",
+ "metadata": {},
+ "source": [
+ "#### **Question 2:** \n",
+ "#### Standardization and data-splitting\n",
+ "\n",
+ "Next, we must perform 'pre-processing' or 'data munging' to prepare our data for classification/prediction. For KNN, there are three essential steps. A first essential step is to 'standardize' the predictor variables. We can achieve this using scikit-learn's `StandardScaler`, as shown below:"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "cc899b59",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n",
+ "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n",
+ "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n",
+ "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n",
+ "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n",
+ "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n",
+ "\n",
+ " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n",
+ "0 0.808997 1.034819 -0.659563 1.224884 \n",
+ "1 0.568648 0.733629 -0.820719 -0.544721 \n",
+ "2 0.808997 1.215533 -0.498407 2.135968 \n",
+ "3 2.491446 1.466525 -0.981875 1.032155 \n",
+ "4 0.808997 0.663351 0.226796 0.401404 \n",
+ "\n",
+ " color_intensity hue od280/od315_of_diluted_wines proline \n",
+ "0 0.251717 0.362177 1.847920 1.013009 \n",
+ "1 -0.293321 0.406051 1.113449 0.965242 \n",
+ "2 0.269020 0.318304 0.788587 1.395148 \n",
+ "3 1.186068 -0.427544 1.184071 2.334574 \n",
+ "4 -0.319276 0.362177 0.449601 -0.037874 \n"
+ ]
+ }
+ ],
+ "source": [
+ "# Select predictors (excluding the last column)\n",
+ "predictors = wine_df.iloc[:, :-1]\n",
+ "\n",
+ "# Standardize the predictors\n",
+ "scaler = StandardScaler()\n",
+ "predictors_standardized = pd.DataFrame(scaler.fit_transform(predictors), columns=predictors.columns)\n",
+ "\n",
+ "# Display the head of the standardized predictors\n",
+ "print(predictors_standardized.head())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9981ca48",
+ "metadata": {},
+ "source": [
+ "(i) Why is it important to standardize the predictor variables?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "403ef0bb",
+ "metadata": {},
+ "source": [
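+ "> Standardizing puts every predictor on the same scale (mean 0, standard deviation 1) so that they contribute equally to the model. This matters especially for KNN, which classifies by distance: without standardization, predictors with large numeric ranges (e.g., `proline`, in the hundreds to thousands) would dominate the distance calculation over predictors with small ranges (e.g., `hue`, around 1). "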
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8e2e1bea",
+ "metadata": {},
+ "source": [
+ "(ii) Why did we elect not to standardize our response variable `class`?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fdee5a15",
+ "metadata": {},
+ "source": [
+ "> We did not standardize the response variable `class` because it is a categorical label: the values 0, 1 and 2 are codes for the type of wine, not quantities, so rescaling them is not meaningful and would break their use as class labels."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8077ec21",
+ "metadata": {},
+ "source": [
+ "(iii) A second essential step is to set a random seed. Do so below (Hint: use the random.seed function). Why is setting a seed important? Is the particular seed value important? Why or why not?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f0676c21",
+ "metadata": {},
+ "source": [
+ "> **Setting a seed** is important because it ensures that the random processes (like data splitting or random initialization) produce the same results each time the code is run, which makes experiments reproducible. The **particular seed value** is not important; what matters is that the seed is set consistently to allow for reproducibility. Different seed values will generate different sequences of random numbers, but any specific seed will always produce the same sequence, which is crucial for consistent results in experiments. For example, `random.seed(123)` seeds Python's built-in generator; NumPy keeps its own generator, so the NumPy-based split below is made reproducible with `np.random.seed(123)`."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "36ab9229",
+ "metadata": {},
+ "source": [
+ "(iv) A third essential step is to split our standardized data into separate training and testing sets. We will split into 75% training and 25% testing. The provided code randomly partitions our data, and creates linked training sets for the predictors and response variables. \n",
+ "\n",
+ "Extend the code to create a non-overlapping test set for the predictors and response variables."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "72c101f2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Do not touch\n",
+ "np.random.seed(123)\n",
+ "# Create a random vector of True and False values to split the data\n",
+ "split = np.random.choice([True, False], size=len(predictors_standardized), replace=True, p=[0.75, 0.25])\n",
+ "\n",
+ "# Define the training set for X (predictors)\n",
+ "training_X = predictors_standardized[split]\n",
+ "\n",
+ "# Define the training set for Y (response)\n",
+ "training_Y = wine_df.loc[split, 'class']\n",
+ "\n",
+ "# Define the testing set for X (predictors)\n",
+ "testing_X = predictors_standardized[~split]\n",
+ "\n",
+ "# Define the testing set for Y (response)\n",
+ "testing_Y = wine_df.loc[~split, 'class']"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4604ee03",
+ "metadata": {},
+ "source": [
+ "#### **Question 3:**\n",
+ "#### Model initialization and cross-validation\n",
+ "We are finally set to fit the KNN model. \n",
+ "\n",
+ "\n",
+ "Perform a grid search to tune the `n_neighbors` hyperparameter using 10-fold cross-validation. Follow these steps:\n",
+ "\n",
+ "1. Initialize the KNN classifier using `KNeighborsClassifier()`.\n",
+ "2. Define a parameter grid for `n_neighbors` ranging from 1 to 50.\n",
+ "3. Implement a grid search using `GridSearchCV` with 10-fold cross-validation to find the optimal number of neighbors.\n",
+ "4. After fitting the model on the training data, identify and return the best value for `n_neighbors` based on the grid search results."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "08818c64",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'n_neighbors': 8}"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Initialize the KNN classifier\n",
+ "knn = KNeighborsClassifier()\n",
+ "\n",
+ "# Define the parameter grid for n_neighbors\n",
+ "parameter_grid = {\n",
+ " \"n_neighbors\": range(1, 51),\n",
+ "}\n",
+ "\n",
+ "# Set up GridSearchCV with 10-fold cross-validation\n",
+ "wine_tune_grid = GridSearchCV(\n",
+ " estimator=knn,\n",
+ " param_grid=parameter_grid,\n",
+ " cv=10\n",
+ ")\n",
+ "\n",
+ "# Fit the grid search to the training data\n",
+ "wine_tune_grid.fit(training_X, training_Y)\n",
+ "\n",
+ "# Output the best hyperparameter found by GridSearchCV\n",
+ "wine_tune_grid.best_params_"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3f76bf62",
+ "metadata": {},
+ "source": [
+ "#### **Question 4:**\n",
+ "#### Model evaluation\n",
+ "\n",
+ "Using the best value for `n_neighbors`, fit a KNN model on the training data and evaluate its performance on the test set using `accuracy_score`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "ffefa9f2",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.9473684210526315"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Get the best value for n_neighbors from the grid search\n",
+ "best_n_neighbors = wine_tune_grid.best_params_['n_neighbors']\n",
+ "\n",
+ "# Initialize the KNN model with the best n_neighbors value\n",
+ "best_knn = KNeighborsClassifier(n_neighbors=best_n_neighbors)\n",
+ "\n",
+ "# Fit the model on the training data\n",
+ "best_knn.fit(training_X, training_Y)\n",
+ "\n",
+ "# Predict on the test data\n",
+ "test_predictions = best_knn.predict(testing_X)\n",
+ "\n",
+ "# Evaluate the model's performance on the test set\n",
+ "test_accuracy = accuracy_score(testing_Y, test_predictions)\n",
+ "test_accuracy"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6f8a69db",
+ "metadata": {},
+ "source": [
+ "# Criteria\n",
+ "\n",
+ "\n",
+ "| **Criteria** | **Complete** | **Incomplete** |\n",
+ "|--------------------------------------------------------|---------------------------------------------------|--------------------------------------------------|\n",
+ "| **Data Inspection** | Data is inspected for number of variables, observations and data types. | Data inspection is missing or incomplete. |\n",
+ "| **Data Scaling** | Data scaling or normalization is applied where necessary (e.g., using `StandardScaler`). | Data scaling or normalization is missing or incorrectly applied. |\n",
+ "| **Model Initialization** | The KNN model is correctly initialized and a random seed is set for reproducibility. | The KNN model is not initialized, is incorrect, or lacks a random seed for reproducibility. |\n",
+ "| **Parameter Grid for `n_neighbors`** | The parameter grid for `n_neighbors` is correctly defined. | The parameter grid is missing or incorrectly defined. |\n",
+ "| **Cross-Validation Setup** | Cross-validation is set up correctly with 10 folds. | Cross-validation is missing or incorrectly set up. |\n",
+ "| **Best Hyperparameter (`n_neighbors`) Selection** | The best value for `n_neighbors` is identified using the grid search results. | The best `n_neighbors` is not selected or incorrect. |\n",
+ "| **Model Evaluation on Test Data** | The model is evaluated on the test data using accuracy. | The model evaluation is missing or uses the wrong metric. |\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0b4390cc",
+ "metadata": {},
+ "source": [
+ "## Submission Information\n",
+ "\n",
+ "🚨 **Please review our [Assignment Submission Guide](https://github.com/UofT-DSI/onboarding/blob/main/onboarding_documents/submissions.md)** 🚨 for detailed instructions on how to format, branch, and submit your work. Following these guidelines is crucial for your submissions to be evaluated correctly.\n",
+ "\n",
+ "### Note:\n",
+ "\n",
+ "If you like, you may collaborate with others in the cohort. If you choose to do so, please indicate with whom you have worked in your pull request by tagging their GitHub username. Separate submissions are required.\n",
+ "\n",
+ "### Submission Parameters:\n",
+ "* Submission Due Date: `HH:MM AM/PM - DD/MM/YYYY`\n",
+ "* The branch name for your repo should be: `assignment-1`\n",
+ "* What to submit for this assignment:\n",
+ " * This Jupyter Notebook (assignment_1.ipynb) should be populated and should be the only change in your pull request.\n",
+ "* What the pull request link should look like for this assignment: `https://github.com/<your_github_username>/applying_statistical_concepts/pull/<pr_id>`\n",
+ " * Open a private window in your browser. Copy and paste the link to your pull request into the address bar. Make sure you can see your pull request properly. This helps the technical facilitator and learning support staff review your submission easily.\n",
+ "\n",
+ "Checklist:\n",
+ "- [ ] Created a branch with the correct naming convention.\n",
+ "- [ ] Ensured that the repository is public.\n",
+ "- [ ] Reviewed the PR description guidelines and adhered to them.\n",
+ "- [ ] Verified that the link is accessible in a private browser window.\n",
+ "\n",
+ "If you encounter any difficulties or have questions, please don't hesitate to reach out to our team via our Slack at `#cohort-4-help`. Our Technical Facilitators and Learning Support staff are here to help you navigate any challenges.\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3.10.4",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.10"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "497a84dc8fec8cf8d24e7e87b6d954c9a18a327edc66feb9b9ea7e9e72cc5c7e"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/03_instructional_team/assignment_2_answers.ipynb b/03_instructional_team/assignment_2_answers.ipynb
new file mode 100644
index 00000000..31fdb24b
--- /dev/null
+++ b/03_instructional_team/assignment_2_answers.ipynb
@@ -0,0 +1,815 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "7b0bcac6-5086-4f4e-928a-570a9ff7ae58",
+ "metadata": {},
+ "source": [
+ "# Assignment 2"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5fce0350-2a17-4e93-8d4c-0b8748fdfc32",
+ "metadata": {},
+ "source": [
+ "You only need to write one line of code for each question. When answering questions that ask you to identify or interpret something, the length of your response doesn’t matter. For example, if the answer is just ‘yes,’ ‘no,’ or a number, you can just give that answer without adding anything else.\n",
+ "\n",
+ "We will go through comparable code and concepts in the live learning session. If you run into trouble, start by using the `help()` function in Python to get information about the datasets and functions in question. The internet is also a great resource when coding (though note that **no outside searches are required by the assignment!**). If you do incorporate code from the internet, please cite the source within your code (providing a URL is sufficient).\n",
+ "\n",
+ "Please bring questions that you cannot work out on your own to office hours, work periods or share with your peers on Slack. We will work with you through the issue."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5fc5001c-7715-4ebe-b0f7-e4bd04349629",
+ "metadata": {},
+ "source": [
+ "### Linear Regression\n",
+ "\n",
+ "Let's set up our workspace and use the **Diabetes dataset** from `scikit-learn`. This dataset contains 10 baseline variables (e.g., age, sex, BMI) and a quantitative measure of disease progression one year after baseline.\n",
+ "\n",
+ "Here, we will model diabetes progression (continuous outcome) based on various health-related factors."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "4a3485d6-ba58-4660-a983-5680821c5719",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Import standard libraries\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import random\n",
+ "import matplotlib.pyplot as plt\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.linear_model import LinearRegression\n",
+ "from sklearn.metrics import mean_squared_error"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
age
\n",
+ "
sex
\n",
+ "
bmi
\n",
+ "
bp
\n",
+ "
s1
\n",
+ "
s2
\n",
+ "
s3
\n",
+ "
s4
\n",
+ "
s5
\n",
+ "
s6
\n",
+ "
progression
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
0.038076
\n",
+ "
0.050680
\n",
+ "
0.061696
\n",
+ "
0.021872
\n",
+ "
-0.044223
\n",
+ "
-0.034821
\n",
+ "
-0.043401
\n",
+ "
-0.002592
\n",
+ "
0.019907
\n",
+ "
-0.017646
\n",
+ "
151.0
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
-0.001882
\n",
+ "
-0.044642
\n",
+ "
-0.051474
\n",
+ "
-0.026328
\n",
+ "
-0.008449
\n",
+ "
-0.019163
\n",
+ "
0.074412
\n",
+ "
-0.039493
\n",
+ "
-0.068332
\n",
+ "
-0.092204
\n",
+ "
75.0
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
0.085299
\n",
+ "
0.050680
\n",
+ "
0.044451
\n",
+ "
-0.005670
\n",
+ "
-0.045599
\n",
+ "
-0.034194
\n",
+ "
-0.032356
\n",
+ "
-0.002592
\n",
+ "
0.002861
\n",
+ "
-0.025930
\n",
+ "
141.0
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
-0.089063
\n",
+ "
-0.044642
\n",
+ "
-0.011595
\n",
+ "
-0.036656
\n",
+ "
0.012191
\n",
+ "
0.024991
\n",
+ "
-0.036038
\n",
+ "
0.034309
\n",
+ "
0.022688
\n",
+ "
-0.009362
\n",
+ "
206.0
\n",
+ "
\n",
+ "
\n",
+ "
4
\n",
+ "
0.005383
\n",
+ "
-0.044642
\n",
+ "
-0.036385
\n",
+ "
0.021872
\n",
+ "
0.003935
\n",
+ "
0.015596
\n",
+ "
0.008142
\n",
+ "
-0.002592
\n",
+ "
-0.031988
\n",
+ "
-0.046641
\n",
+ "
135.0
\n",
+ "
\n",
+ "
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
\n",
+ "
\n",
+ "
437
\n",
+ "
0.041708
\n",
+ "
0.050680
\n",
+ "
0.019662
\n",
+ "
0.059744
\n",
+ "
-0.005697
\n",
+ "
-0.002566
\n",
+ "
-0.028674
\n",
+ "
-0.002592
\n",
+ "
0.031193
\n",
+ "
0.007207
\n",
+ "
178.0
\n",
+ "
\n",
+ "
\n",
+ "
438
\n",
+ "
-0.005515
\n",
+ "
0.050680
\n",
+ "
-0.015906
\n",
+ "
-0.067642
\n",
+ "
0.049341
\n",
+ "
0.079165
\n",
+ "
-0.028674
\n",
+ "
0.034309
\n",
+ "
-0.018114
\n",
+ "
0.044485
\n",
+ "
104.0
\n",
+ "
\n",
+ "
\n",
+ "
439
\n",
+ "
0.041708
\n",
+ "
0.050680
\n",
+ "
-0.015906
\n",
+ "
0.017293
\n",
+ "
-0.037344
\n",
+ "
-0.013840
\n",
+ "
-0.024993
\n",
+ "
-0.011080
\n",
+ "
-0.046883
\n",
+ "
0.015491
\n",
+ "
132.0
\n",
+ "
\n",
+ "
\n",
+ "
440
\n",
+ "
-0.045472
\n",
+ "
-0.044642
\n",
+ "
0.039062
\n",
+ "
0.001215
\n",
+ "
0.016318
\n",
+ "
0.015283
\n",
+ "
-0.028674
\n",
+ "
0.026560
\n",
+ "
0.044529
\n",
+ "
-0.025930
\n",
+ "
220.0
\n",
+ "
\n",
+ "
\n",
+ "
441
\n",
+ "
-0.045472
\n",
+ "
-0.044642
\n",
+ "
-0.073030
\n",
+ "
-0.081413
\n",
+ "
0.083740
\n",
+ "
0.027809
\n",
+ "
0.173816
\n",
+ "
-0.039493
\n",
+ "
-0.004222
\n",
+ "
0.003064
\n",
+ "
57.0
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
442 rows × 11 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " age sex bmi bp s1 s2 s3 \\\n",
+ "0 0.038076 0.050680 0.061696 0.021872 -0.044223 -0.034821 -0.043401 \n",
+ "1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163 0.074412 \n",
+ "2 0.085299 0.050680 0.044451 -0.005670 -0.045599 -0.034194 -0.032356 \n",
+ "3 -0.089063 -0.044642 -0.011595 -0.036656 0.012191 0.024991 -0.036038 \n",
+ "4 0.005383 -0.044642 -0.036385 0.021872 0.003935 0.015596 0.008142 \n",
+ ".. ... ... ... ... ... ... ... \n",
+ "437 0.041708 0.050680 0.019662 0.059744 -0.005697 -0.002566 -0.028674 \n",
+ "438 -0.005515 0.050680 -0.015906 -0.067642 0.049341 0.079165 -0.028674 \n",
+ "439 0.041708 0.050680 -0.015906 0.017293 -0.037344 -0.013840 -0.024993 \n",
+ "440 -0.045472 -0.044642 0.039062 0.001215 0.016318 0.015283 -0.028674 \n",
+ "441 -0.045472 -0.044642 -0.073030 -0.081413 0.083740 0.027809 0.173816 \n",
+ "\n",
+ " s4 s5 s6 progression \n",
+ "0 -0.002592 0.019907 -0.017646 151.0 \n",
+ "1 -0.039493 -0.068332 -0.092204 75.0 \n",
+ "2 -0.002592 0.002861 -0.025930 141.0 \n",
+ "3 0.034309 0.022688 -0.009362 206.0 \n",
+ "4 -0.002592 -0.031988 -0.046641 135.0 \n",
+ ".. ... ... ... ... \n",
+ "437 -0.002592 0.031193 0.007207 178.0 \n",
+ "438 0.034309 -0.018114 0.044485 104.0 \n",
+ "439 -0.011080 -0.046883 0.015491 132.0 \n",
+ "440 0.026560 0.044529 -0.025930 220.0 \n",
+ "441 -0.039493 -0.004222 0.003064 57.0 \n",
+ "\n",
+ "[442 rows x 11 columns]"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "from sklearn.datasets import load_diabetes\n",
+ "\n",
+ "# Load the diabetes dataset\n",
+ "diabetes_data = load_diabetes()\n",
+ "\n",
+ "# Convert to DataFrame\n",
+ "diabetes_df = pd.DataFrame(diabetes_data.data, columns=diabetes_data.feature_names)\n",
+ "\n",
+ "# Bind the disease progression (diabetes target) to the DataFrame\n",
+ "diabetes_df['progression'] = diabetes_data.target\n",
+ "\n",
+ "\n",
+ "# Display the DataFrame\n",
+ "diabetes_df\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "721b2b17",
+ "metadata": {},
+ "source": [
+ "#### **Question 1:** \n",
+ "#### Data inspection\n",
+ "\n",
+ "Before fitting any model, it is essential to understand our data. **Use Python code** to answer the following questions about the **Diabetes dataset**:\n",
+ "\n",
+ "_(i)_ How many observations (rows) does the dataset contain?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "05725e2a",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of observations: 442\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Number of observations (rows)\n",
+ "num_observations = diabetes_df.shape[0]\n",
+ "print(f\"Number of observations: {num_observations}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a599c73b",
+ "metadata": {},
+ "source": [
+ "_(ii)_ How many variables (columns) does the dataset contain?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "9c8621b4",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of variables: 11\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Number of variables (columns)\n",
+ "num_variables = diabetes_df.shape[1]\n",
+ "print(f\"Number of variables: {num_variables}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6ad6b6a4",
+ "metadata": {},
+ "source": [
+ "_(iii)_ What is the 'variable type' of the response variable `progression` (e.g., 'integer', 'category', etc.)?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "02157236",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Variable type of 'progression': float64\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Variable type of the response variable 'progression'\n",
+ "response_variable_type = diabetes_df['progression'].dtype\n",
+ "print(f\"Variable type of 'progression': {response_variable_type}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cbc54d2c",
+ "metadata": {},
+ "source": [
+ "_(iv)_ How many predictor variables do we have (Hint: all variables other than `progression`)? "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "44cbb0ce",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of predictor variables: 10\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Count the number of predictor variables (excluding 'progression')\n",
+ "predictor_variables = diabetes_df.drop(columns=['progression']).columns\n",
+ "num_predictors = len(predictor_variables)\n",
+ "\n",
+ "print(f\"Number of predictor variables: {num_predictors}\")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1741cf23",
+ "metadata": {},
+ "source": [
+ "You can use `print()` and `describe()` to help answer these questions."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fa3832d7",
+ "metadata": {},
+ "source": [
+ "#### **Question 2:** \n",
+ "#### Data-visualization\n",
+ "\n",
+ "Before we fit and review model outputs, we should visualize our data. Review the code and plot, shown below. Answer the following questions:\n",
+ "\n",
+ "_(i)_ Describe the associations being plotted ? (i.e., positive association, negative association, no association)\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f67e57ab",
+ "metadata": {},
+ "source": [
+ "> Positive Association: If the points form an upward trend from left to right, meaning that as the feature values increase, the target ('progression') also increases. The slope of the regression line would be positive.\n",
+ "\n",
+ "> Negative Association: If the points form a downward trend, meaning that as the feature values increase, the target decreases. The slope of the regression line would be negative.\n",
+ "\n",
+ "> No Association: If the points are scattered randomly without forming a clear trend or line, there may be little to no relationship between the feature and the target variable. The regression line would be nearly horizontal (slope close to zero)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5325992e",
+ "metadata": {},
+ "source": [
+ "_(ii)_ What concept ‘defines’ the plotted line?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "843f9eef",
+ "metadata": {},
+ "source": [
+ "> The plotted line in your scatter plots represents the line of best fit, which is defined by the concept of linear regression."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "732784d8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "
"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Exclude the 'sex' column from the feature names since it's categorical and we are plotting continuous relationships\n",
+ "feature_names = diabetes_df.columns.difference(['progression', 'sex'])\n",
+ "\n",
+ "# Loop through each feature (column) in diabetes_df\n",
+ "for feature in feature_names:\n",
+ " # Extract the feature column and target variable (progression)\n",
+ " X_feature = diabetes_df[[feature]].values # Extract as a 2D array\n",
+ " y = diabetes_df['progression'].values # Target variable\n",
+ " \n",
+ " # Create a scatter plot for the feature against the target (progression)\n",
+ " plt.figure(figsize=(6, 4))\n",
+ " plt.scatter(X_feature, y, label='Data', color='blue')\n",
+ "\n",
+ " # Fit a linear regression model\n",
+ " lm = LinearRegression()\n",
+ " lm.fit(X_feature, y)\n",
+ "\n",
+ " # Plot the regression line\n",
+ " plt.plot(X_feature, lm.predict(X_feature), color='red', label='Regression Line')\n",
+ "\n",
+ " # Add labels and title\n",
+ " plt.xlabel(feature)\n",
+ " plt.ylabel('progression')\n",
+ " plt.title(f'{feature} vs progression')\n",
+ "\n",
+ " # Add a legend\n",
+ " plt.legend()\n",
+ "\n",
+ " # Show the plot\n",
+ " plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2b1ff3cf",
+ "metadata": {},
+ "source": [
+ "Consider the variables plotted above. In the context of the `Diabetes` dataset:\n",
+ "\n",
+ "(iii) What is the (implied) null hypothesis? What is the (implied) alternative hypothesis?\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2ea782fc",
+ "metadata": {},
+ "source": [
+ "> Null: The predictor variable has no association with the response variable (progression). In other words, the slope of the regression line for this predictor is zero.\n",
+ "\n",
+ "> Alternative: The predictor variable is associated with the response variable (progression). In other words, the slope of the regression line for this predictor is not zero"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4604ee03",
+ "metadata": {},
+ "source": [
+ "#### **Question 3:** \n",
+ "#### Model fit\n",
+ "Now, let’s fit a multivariable linear regression model using the `LinearRegression()` class from scikit-learn. As above, use `progression` as the response variable Y, and all other variables as the predictors.\n",
+ "\n",
+ "**Step 1: Split the dataset into test and train.**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "399225f4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Select predictors (excluding the last column)\n",
+ "predictors = diabetes_df.iloc[:, :-1]\n",
+ "\n",
+ "# Create a random vector of True and False values to split the data\n",
+ "split = np.random.choice([True, False], size=len(predictors), replace=True, p=[0.75, 0.25])\n",
+ "\n",
+ "# Split the diabetes dataset into 75% training data and 25% test data\n",
+ "diabetes_train, diabetes_test = train_test_split(diabetes_df, train_size=0.75)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f76b8f5c",
+ "metadata": {},
+ "source": [
+ "**Step 2: Fit the linear regression model.**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "ac1e1117",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " predictor slope intercept\n",
+ "0 age -37.455806 153.706643\n",
+ "1 sex -212.936457 153.706643\n",
+ "2 bmi 553.747247 153.706643\n",
+ "3 bp 298.132752 153.706643\n",
+ "4 s1 -826.173236 153.706643\n",
+ "5 s2 522.515299 153.706643\n",
+ "6 s3 125.497614 153.706643\n",
+ "7 s4 186.286860 153.706643\n",
+ "8 s5 725.999011 153.706643\n",
+ "9 s6 66.457698 153.706643\n"
+ ]
+ }
+ ],
+ "source": [
+ "# fit the linear regression model\n",
+ "lm = LinearRegression()\n",
+ "\n",
+ "lm.fit(\n",
+ " diabetes_train[predictor_variables], # Predictors: every column except 'progression'\n",
+ " diabetes_train[\"progression\"] # Target variable: diabetes disease progression\n",
+ ")\n",
+ "\n",
+ "# Create a DataFrame containing the slope (coefficients) and intercept\n",
+ "coefficients_df = pd.DataFrame({\"predictor\": predictor_variables, \"slope\": lm.coef_, \"intercept\": [lm.intercept_] * len(lm.coef_)})\n",
+ "\n",
+ "# Display the coefficients DataFrame\n",
+ "print(coefficients_df)\n",
+ "\n",
+ "# lm.coef_ gives the coefficients for each predictor (change in diabetes progression per unit change in each predictor variable)\n",
+ "# lm.intercept_ gives the intercept b_0 (the predicted diabetes progression when all predictors are set to 0)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3f76bf62",
+ "metadata": {},
+ "source": [
+ "#### **Question 4:** \n",
+ "#### Model evaluation\n",
+ "**Step 3. Finally, we predict on the test data set to assess how well our model does.** \n",
+ "\n",
+ "We will evaluate our final model's test error measured by RMSPE (root mean squared prediction error)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "ffefa9f2",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "RMSPE for the multivariable model: 56.69876420327839\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Predict the progression for the test set\n",
+ "diabetes_test[\"predicted\"] = lm.predict(diabetes_test[predictor_variables])\n",
+ "\n",
+ "# Calculate RMSPE for the multivariable model\n",
+ "lm_test_RMSPE = mean_squared_error(\n",
+ " y_true=diabetes_test[\"progression\"],\n",
+ " y_pred=diabetes_test[\"predicted\"]\n",
+ ")**(1/2)\n",
+ "\n",
+ "# Output the RMSPE\n",
+ "print(f\"RMSPE for the multivariable model: {lm_test_RMSPE}\")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6f8a69db",
+ "metadata": {},
+ "source": [
+ "# Criteria\n",
+ "\n",
+ "| **Criteria** | **Complete** | **Incomplete** |\n",
+ "|--------------------------------------------------------|---------------------------------------------------|--------------------------------------------------|\n",
+ "| **Data Inspection** | Data is inspected for the number of variables, observations, and data types. | Data inspection is missing or incomplete. |\n",
+ "| **Data Visualization** | Visualizations (e.g., scatter plots, histograms) are properly interpreted to explore the relationships between variables. | Data visualizations were not correctly interpreted. |\n",
+ "| **Model Initialization** | The linear regression model is correctly initialized. | The linear regression model is not initialized or is incorrect. |\n",
+ "| **Model Evaluation on Test Data** | The model is evaluated on the test data using appropriate metrics (e.g., RMSE). | The model evaluation is missing or uses the wrong metric. |\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0b4390cc",
+ "metadata": {},
+ "source": [
+ "## Submission Information\n",
+ "\n",
+ "🚨 **Please review our [Assignment Submission Guide](https://github.com/UofT-DSI/onboarding/blob/main/onboarding_documents/submissions.md)** 🚨 for detailed instructions on how to format, branch, and submit your work. Following these guidelines is crucial for your submissions to be evaluated correctly.\n",
+ "\n",
+ "### Note:\n",
+ "\n",
+ "If you like, you may collaborate with others in the cohort. If you choose to do so, please indicate with whom you have worked in your pull request by tagging their GitHub username. Separate submissions are required.\n",
+ "\n",
+ "### Submission Parameters:\n",
+ "* Submission Due Date: `HH:MM AM/PM - DD/MM/YYYY`\n",
+ "* The branch name for your repo should be: `assignment-1`\n",
+ "* What to submit for this assignment:\n",
+ " * This Jupyter Notebook (assignment_1.ipynb) should be populated and should be the only change in your pull request.\n",
+ "* What the pull request link should look like for this assignment: `https://github.com/<your_github_username>/applying_statistical_concepts/pull/<pr_id>`\n",
+ " * Open a private window in your browser. Copy and paste the link to your pull request into the address bar. Make sure you can see your pull request properly. This helps the technical facilitator and learning support staff review your submission easily.\n",
+ "\n",
+ "Checklist:\n",
+ "- [ ] Created a branch with the correct naming convention.\n",
+ "- [ ] Ensured that the repository is public.\n",
+ "- [ ] Reviewed the PR description guidelines and adhered to them.\n",
+ "- [ ] Verify that the link is accessible in a private browser window.\n",
+ "\n",
+ "If you encounter any difficulties or have questions, please don't hesitate to reach out to our team via our Slack at `#cohort-4-help`. Our Technical Facilitators and Learning Support staff are here to help you navigate any challenges.\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3.10.4",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.10"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "497a84dc8fec8cf8d24e7e87b6d954c9a18a327edc66feb9b9ea7e9e72cc5c7e"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
From 025c70a4d7e75e78134b81ca03f150df6ba93a1d Mon Sep 17 00:00:00 2001
From: juliagallucci <78810440+juliagallucci@users.noreply.github.com>
Date: Thu, 26 Sep 2024 11:07:09 -0400
Subject: [PATCH 2/5] Add files via upload
add answers to assignment 3
---
.../assignment_3_answers.ipynb | 502 ++++++++++++++++++
1 file changed, 502 insertions(+)
create mode 100644 03_instructional_team/assignment_3_answers.ipynb
diff --git a/03_instructional_team/assignment_3_answers.ipynb b/03_instructional_team/assignment_3_answers.ipynb
new file mode 100644
index 00000000..f885814d
--- /dev/null
+++ b/03_instructional_team/assignment_3_answers.ipynb
@@ -0,0 +1,502 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "7b0bcac6-5086-4f4e-928a-570a9ff7ae58",
+ "metadata": {},
+ "source": [
+ "# Assignment 3"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5fce0350-2a17-4e93-8d4c-0b8748fdfc32",
+ "metadata": {},
+ "source": [
+ "You only need to write one line of code for each question. When answering questions that ask you to identify or interpret something, the length of your response doesn’t matter. For example, if the answer is just ‘yes,’ ‘no,’ or a number, you can just give that answer without adding anything else.\n",
+ "\n",
+ "We will go through comparable code and concepts in the live learning session. If you run into trouble, start by using the help `help()` function in Python, to get information about the datasets and function in question. The internet is also a great resource when coding (though note that **no outside searches are required by the assignment!**). If you do incorporate code from the internet, please cite the source within your code (providing a URL is sufficient).\n",
+ "\n",
+ "Please bring questions that you cannot work out on your own to office hours, work periods or share with your peers on Slack. We will work with you through the issue."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5fc5001c-7715-4ebe-b0f7-e4bd04349629",
+ "metadata": {},
+ "source": [
+ "### Clustering and Resampling\n",
+ "\n",
+ "Let's set up our workspace and use the **Iris dataset** from `scikit-learn`. This dataset is a classic dataset in machine learning and statistics, widely used for clustering tasks. It consists of 150 samples of iris flowers, each belonging to one of three species: Iris setosa, Iris versicolor, and Iris virginica. Here are the key features and characteristics of the dataset:\n",
+ "\n",
+ "##### Features:\n",
+ "1. **Sepal Length**: The length of the sepal in centimeters.\n",
+ "2. **Sepal Width**: The width of the sepal in centimeters.\n",
+ "3. **Petal Length**: The length of the petal in centimeters.\n",
+ "4. **Petal Width**: The width of the petal in centimeters.\n",
+ "\n",
+ "##### Target Variable:\n",
+ "- **Species**: The species of the iris flower, which can take one of the following values:\n",
+ " - 0: Iris setosa\n",
+ " - 1: Iris versicolor\n",
+ " - 2: Iris virginica"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "4a3485d6-ba58-4660-a983-5680821c5719",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Import standard libraries\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import matplotlib.pyplot as plt\n",
+ "from sklearn.preprocessing import StandardScaler\n",
+ "from sklearn.cluster import KMeans\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3491d919",
+ "metadata": {},
+ "source": [
+ "#### **Question 1:** \n",
+ "#### Data inspection\n",
+ "\n",
+ "#### Load the Iris dataset:\n",
+ "\n",
+ "Use scikit-learn to load the Iris dataset and convert it into a Pandas DataFrame.\n",
+ "Display the first few rows of the dataset. How many observations (rows) and features (columns) does the dataset contain?"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n",
+ "0 5.1 3.5 1.4 0.2 \n",
+ "1 4.9 3.0 1.4 0.2 \n",
+ "2 4.7 3.2 1.3 0.2 \n",
+ "3 4.6 3.1 1.5 0.2 \n",
+ "4 5.0 3.6 1.4 0.2 \n",
+ ".. ... ... ... ... \n",
+ "145 6.7 3.0 5.2 2.3 \n",
+ "146 6.3 2.5 5.0 1.9 \n",
+ "147 6.5 3.0 5.2 2.0 \n",
+ "148 6.2 3.4 5.4 2.3 \n",
+ "149 5.9 3.0 5.1 1.8 \n",
+ "\n",
+ " species \n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ ".. ... \n",
+ "145 2 \n",
+ "146 2 \n",
+ "147 2 \n",
+ "148 2 \n",
+ "149 2 \n",
+ "\n",
+ "[150 rows x 5 columns]\n",
+ "Number of observations: 150\n",
+ "Number of variables: 5\n"
+ ]
+ }
+ ],
+ "source": [
+ "from sklearn.datasets import load_iris\n",
+ "# Load the Iris dataset\n",
+ "iris_data = load_iris()\n",
+ "\n",
+ "# Convert to DataFrame\n",
+ "iris_df = pd.DataFrame(iris_data.data, columns=iris_data.feature_names)\n",
+ "\n",
+ "# Bind the species labels (iris target) to the DataFrame\n",
+ "iris_df['species'] = iris_data.target\n",
+ "\n",
+ "\n",
+ "# Display the DataFrame\n",
+ "print(iris_df)\n",
+ "\n",
+ "# Number of observations (rows)\n",
+ "num_observations = iris_df.shape[0]\n",
+ "print(f\"Number of observations: {num_observations}\")\n",
+ "\n",
+ "# Number of variables (columns)\n",
+ "num_variables = iris_df.shape[1]\n",
+ "print(f\"Number of variables: {num_variables}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fa3832d7",
+ "metadata": {},
+ "source": [
+ "#### **Question 2:** \n",
+ "#### Data-visualization\n",
+ "\n",
+ "Create plots to visualize the relationships between the features (sepal length, sepal width, petal length, petal width).\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "efd6dc0c",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "
"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Get feature names\n",
+ "feature_names = iris_df.columns.difference(['species'])\n",
+ "\n",
+ "# Create a figure for the scatter plots\n",
+ "plt.figure(figsize=(12, 10))\n",
+ "\n",
+ "# Counter for subplot index\n",
+ "plot_number = 1\n",
+ "\n",
+ "# Loop through each pair of features\n",
+ "for i in range(len(feature_names)):\n",
+ " for j in range(i + 1, len(feature_names)):\n",
+ " plt.subplot(len(feature_names)-1, len(feature_names)-1, plot_number)\n",
+ " plt.scatter(iris_df[feature_names[i]], iris_df[feature_names[j]], alpha=0.7)\n",
+ " plt.xlabel(feature_names[i])\n",
+ " plt.ylabel(feature_names[j])\n",
+ " plt.title(f'{feature_names[i]} vs {feature_names[j]}')\n",
+ " \n",
+ " # Increment the plot number\n",
+ " plot_number += 1\n",
+ "\n",
+ "# Adjust layout to prevent overlap\n",
+ "plt.tight_layout()\n",
+ "\n",
+ "# Show the plot\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d596b3af",
+ "metadata": {},
+ "source": [
+ "#### **Question 3:** \n",
+ "#### Data cleaning"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "b2cfec72",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n",
+ "0 -0.900681 1.019004 -1.340227 -1.315444 \n",
+ "1 -1.143017 -0.131979 -1.340227 -1.315444 \n",
+ "2 -1.385353 0.328414 -1.397064 -1.315444 \n",
+ "3 -1.506521 0.098217 -1.283389 -1.315444 \n",
+ "4 -1.021849 1.249201 -1.340227 -1.315444 \n",
+ "\n",
+ " species \n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n"
+ ]
+ }
+ ],
+ "source": [
+ "# Initialize the StandardScaler\n",
+ "scaler = StandardScaler()\n",
+ "\n",
+ "# Scale the features (excluding the species column)\n",
+ "scaled_features = scaler.fit_transform(iris_df.iloc[:, :-1])\n",
+ "\n",
+ "# Create a new DataFrame with scaled features\n",
+ "scaled_iris_df = pd.DataFrame(scaled_features, columns=iris_data.feature_names)\n",
+ "\n",
+ "# Add the species column back to the scaled DataFrame\n",
+ "scaled_iris_df['species'] = iris_df['species'].values\n",
+ "\n",
+ "# Display the first few rows of the scaled DataFrame\n",
+ "print(scaled_iris_df.head())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "035fa019",
+ "metadata": {},
+ "source": [
+ "Why is it important to standardize the features of a dataset before applying clustering algorithms like K-Means? Discuss the implications of using unstandardized data in your analysis."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "53d77d5c",
+ "metadata": {},
+ "source": [
+ "> Standardizing features is essential for effective clustering. It ensures that all features are treated equally, leading to more accurate and interpretable results. Not standardizing can result in misleading insights and ineffective analysis. \n",
+ "\n",
+ "> Clustering relies on measuring distances between data points. Standardization ensures that these distances accurately reflect similarities across all features, not just those with larger values."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4604ee03",
+ "metadata": {},
+ "source": [
+ "#### **Question 4:** \n",
+ "#### K-means clustering \n",
+ "Apply the K-Means clustering algorithm to the Iris dataset.\n",
+ "Choose the number of clusters (K=3, since there are three species) and fit the model.\n",
+ "Assign cluster labels to the original data and add them as a new column in the DataFrame."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "id": "909df219",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Cluster 0 1 2\n",
+ "Species \n",
+ "0 0 50 0\n",
+ "1 38 0 12\n",
+ "2 14 0 36\n",
+ "Cluster 0 1 2\n",
+ "Species \n",
+ "0 50 0 0\n",
+ "1 0 38 12\n",
+ "2 0 14 36\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/Users/juliagallucci/miniconda3/lib/python3.10/site-packages/sklearn/cluster/_kmeans.py:870: FutureWarning: The default value of `n_init` will change from 10 to 'auto' in 1.4. Set the value of `n_init` explicitly to suppress the warning\n",
+ " warnings.warn(\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Perform K-means clustering\n",
+ "kmeans = KMeans(n_clusters=3, random_state=12345) # Specify the number of clusters\n",
+ "clusters = kmeans.fit(scaled_features)\n",
+ "# Add cluster labels to the original DataFrame\n",
+ "scaled_iris_df['Cluster'] = clusters.labels_\n",
+ "\n",
+ "\n",
+ "# Create a contingency table of species vs clusters\n",
+ "species_cluster_table = pd.crosstab(scaled_iris_df['species'], scaled_iris_df['Cluster'], \n",
+ " rownames=['Species'], colnames=['Cluster'])\n",
+ "\n",
+ "# Display the table\n",
+ "print(species_cluster_table)\n",
+ "\n",
+ "# Remap clusters if needed based on your findings\n",
+ "# Here, we swap cluster 0 and 1 as an example\n",
+ "cluster_remap = {0: 1, 1: 0, 2: 2} # Modify this based on your analysis\n",
+ "scaled_iris_df['Mapped_Cluster'] = scaled_iris_df['Cluster'].map(cluster_remap)\n",
+ "\n",
+ "# Create a new contingency table with mapped clusters\n",
+ "mapped_species_cluster_table = pd.crosstab(scaled_iris_df['species'], scaled_iris_df['Mapped_Cluster'], \n",
+ " rownames=['Species'], colnames=['Cluster'])\n",
+ "\n",
+ "print(mapped_species_cluster_table)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0aefdee5",
+ "metadata": {},
+ "source": [
+ "Discuss the results of the K-Means clustering. How well did the clusters match the true species?"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7bcebc16",
+ "metadata": {},
+ "source": [
+ "> Overall, the K-Means clustering results show a strong performance for species 0, but there is room for improvement in distinguishing between species 1 and species 2. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3f76bf62",
+ "metadata": {},
+ "source": [
+ "#### **Question 5:** \n",
+ "#### Bootstrapping \n",
+ "\n",
+ " Implement bootstrapping on the mean of one of the sepal or petal measurement variables (e.g., Sepal Length, Petal Width) to assess the stability of the mean estimate. Generate 1000 bootstrap samples, calculate the mean for each sample, and compute a 95% confidence interval to evaluate the variability in the estimate."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "id": "be4c4011",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Mean of Sepal Length: 5.84\n",
+ "95% Confidence Interval of Mean Sepal Length: (5.72, 5.98)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Calculate the mean of the Sepal Length\n",
+ "sepal_length_mean = iris_df['sepal length (cm)'].mean()\n",
+ "\n",
+ "# Display the result\n",
+ "print(f\"Mean of Sepal Length: {sepal_length_mean:.2f}\")\n",
+ "\n",
+ "np.random.seed(123)\n",
+ "# Initialize an empty list to store the bootstrap samples\n",
+ "bootstrap_samples = []\n",
+ "\n",
+ "for i in range(1000):\n",
+ " sample = iris_df.drop(columns='species').sample(frac=1, replace=True)\n",
+ " sample = sample.assign(replicate=i) # Add replicate number\n",
+ " bootstrap_samples.append(sample) # Store the sample\n",
+ "\n",
+ "# Combine all bootstrap samples into one DataFrame\n",
+ "boot1000 = pd.concat(bootstrap_samples)\n",
+ "\n",
+ "# Calculate the mean sepal length for each bootstrap sample (replicate)\n",
+ "boot_means = boot1000.groupby('replicate')['sepal length (cm)'].mean().reset_index(name='sepal length (cm)')\n",
+ "boot_means\n",
+ "\n",
+ "\n",
+ "# Calculate the 95% confidence interval bounds (2.5th and 97.5th percentiles) for the mean sepal length\n",
+ "ci_bounds = boot_means[\"sepal length (cm)\"].quantile([0.025, 0.975])\n",
+ "\n",
+ "# Display the result\n",
+ "print(f\"95% Confidence Interval of Mean Sepal Length: ({ci_bounds.iloc[0]:.2f}, {ci_bounds.iloc[1]:.2f})\")\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "29096311",
+ "metadata": {},
+ "source": [
+ "Reflect on the variability observed in the bootstrapped means and discuss whether the mean of the selected variable appears to be a stable and reliable estimate based on the confidence interval and the spread of the bootstrapped means."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0a7e6778",
+ "metadata": {},
+ "source": [
+ "> Point Estimate: The sample mean of Sepal Length is 5.84\n",
+ "95% Confidence Interval: We estimate that the true mean of Sepal Length is between 5.72 and 5.98\n",
+ "\n",
+ "Based on the bootstrapping results for Sepal Length in the Iris dataset, the bootstrapped mean distribution was narrow, and the 95% confidence interval was small, suggesting that the sample mean is a stable and reliable estimate of the true mean."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6f8a69db",
+ "metadata": {},
+ "source": [
+ "# Criteria\n",
+ "\n",
+ "\n",
+ "| **Criteria** | **Complete** | **Incomplete** |\n",
+ "|--------------------------------------------------------|---------------------------------------------------|--------------------------------------------------|\n",
+ "| **Data Inspection** | Data is thoroughly inspected for the number of variables, observations, and data types, and relevant insights are noted. | Data inspection is missing or lacks detail. |\n",
+ "| **Data Visualization** | Visualizations (e.g., scatter plots) are well-constructed and correctly interpreted to explore relationships between features and species. | Visualizations are poorly constructed or not correctly interpreted. |\n",
+ "| **Clustering Implementation** | K-Means clustering is correctly implemented, and cluster labels are appropriately assigned to the dataset. | K-Means clustering is missing or incorrectly implemented. |\n",
+ "| **Bootstrapping Process** | Bootstrapping is correctly performed, and results are used to assess variable mean stability. | Bootstrapping is missing or incorrectly performed. |"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0b4390cc",
+ "metadata": {},
+ "source": [
+ "## Submission Information\n",
+ "\n",
+ "🚨 **Please review our [Assignment Submission Guide](https://github.com/UofT-DSI/onboarding/blob/main/onboarding_documents/submissions.md)** 🚨 for detailed instructions on how to format, branch, and submit your work. Following these guidelines is crucial for your submissions to be evaluated correctly.\n",
+ "\n",
+ "### Note:\n",
+ "\n",
+ "If you like, you may collaborate with others in the cohort. If you choose to do so, please indicate with whom you have worked in your pull request by tagging their GitHub username. Separate submissions are required.\n",
+ "\n",
+ "### Submission Parameters:\n",
+ "* Submission Due Date: `HH:MM AM/PM - DD/MM/YYYY`\n",
+ "* The branch name for your repo should be: `assignment-3`\n",
+ "* What to submit for this assignment:\n",
+ "    * This Jupyter Notebook (assignment_3.ipynb) should be populated and should be the only change in your pull request.\n",
+ "* What the pull request link should look like for this assignment: `https://github.com/<your_github_username>/applying_statistical_concepts/pull/<pr_id>`\n",
+ " * Open a private window in your browser. Copy and paste the link to your pull request into the address bar. Make sure you can see your pull request properly. This helps the technical facilitator and learning support staff review your submission easily.\n",
+ "\n",
+ "Checklist:\n",
+ "- [ ] Created a branch with the correct naming convention.\n",
+ "- [ ] Ensured that the repository is public.\n",
+ "- [ ] Reviewed the PR description guidelines and adhered to them.\n",
+ "- [ ] Verify that the link is accessible in a private browser window.\n",
+ "\n",
+ "If you encounter any difficulties or have questions, please don't hesitate to reach out to our team via our Slack at `#cohort-4-help`. Our Technical Facilitators and Learning Support staff are here to help you navigate any challenges.\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3.10.4",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.10"
+ },
+ "vscode": {
+ "interpreter": {
+ "hash": "497a84dc8fec8cf8d24e7e87b6d954c9a18a327edc66feb9b9ea7e9e72cc5c7e"
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
From bd68d92fe3c9adf0bbaa99507c54f603ffd1258c Mon Sep 17 00:00:00 2001
From: juliagallucci <78810440+juliagallucci@users.noreply.github.com>
Date: Thu, 26 Sep 2024 11:07:32 -0400
Subject: [PATCH 3/5] Add files via upload
modified assignment 3 question 5 difficulty level
---
02_activities/assignments/assignment_3.ipynb | 261 ++++++++++++++++---
1 file changed, 218 insertions(+), 43 deletions(-)
diff --git a/02_activities/assignments/assignment_3.ipynb b/02_activities/assignments/assignment_3.ipynb
index d5cb0f50..2de1febc 100644
--- a/02_activities/assignments/assignment_3.ipynb
+++ b/02_activities/assignments/assignment_3.ipynb
@@ -73,10 +73,167 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 14,
"id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
sepal length (cm)
\n",
+ "
sepal width (cm)
\n",
+ "
petal length (cm)
\n",
+ "
petal width (cm)
\n",
+ "
species
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
5.1
\n",
+ "
3.5
\n",
+ "
1.4
\n",
+ "
0.2
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
4.9
\n",
+ "
3.0
\n",
+ "
1.4
\n",
+ "
0.2
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
4.7
\n",
+ "
3.2
\n",
+ "
1.3
\n",
+ "
0.2
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
4.6
\n",
+ "
3.1
\n",
+ "
1.5
\n",
+ "
0.2
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
4
\n",
+ "
5.0
\n",
+ "
3.6
\n",
+ "
1.4
\n",
+ "
0.2
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
\n",
+ "
\n",
+ "
145
\n",
+ "
6.7
\n",
+ "
3.0
\n",
+ "
5.2
\n",
+ "
2.3
\n",
+ "
2
\n",
+ "
\n",
+ "
\n",
+ "
146
\n",
+ "
6.3
\n",
+ "
2.5
\n",
+ "
5.0
\n",
+ "
1.9
\n",
+ "
2
\n",
+ "
\n",
+ "
\n",
+ "
147
\n",
+ "
6.5
\n",
+ "
3.0
\n",
+ "
5.2
\n",
+ "
2.0
\n",
+ "
2
\n",
+ "
\n",
+ "
\n",
+ "
148
\n",
+ "
6.2
\n",
+ "
3.4
\n",
+ "
5.4
\n",
+ "
2.3
\n",
+ "
2
\n",
+ "
\n",
+ "
\n",
+ "
149
\n",
+ "
5.9
\n",
+ "
3.0
\n",
+ "
5.1
\n",
+ "
1.8
\n",
+ "
2
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
150 rows × 5 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n",
+ "0 5.1 3.5 1.4 0.2 \n",
+ "1 4.9 3.0 1.4 0.2 \n",
+ "2 4.7 3.2 1.3 0.2 \n",
+ "3 4.6 3.1 1.5 0.2 \n",
+ "4 5.0 3.6 1.4 0.2 \n",
+ ".. ... ... ... ... \n",
+ "145 6.7 3.0 5.2 2.3 \n",
+ "146 6.3 2.5 5.0 1.9 \n",
+ "147 6.5 3.0 5.2 2.0 \n",
+ "148 6.2 3.4 5.4 2.3 \n",
+ "149 5.9 3.0 5.1 1.8 \n",
+ "\n",
+ " species \n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n",
+ ".. ... \n",
+ "145 2 \n",
+ "146 2 \n",
+ "147 2 \n",
+ "148 2 \n",
+ "149 2 \n",
+ "\n",
+ "[150 rows x 5 columns]"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"from sklearn.datasets import load_iris\n",
"# Load the Iris dataset\n",
@@ -134,80 +291,100 @@
]
},
{
- "cell_type": "markdown",
- "id": "4604ee03",
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "b8971d89",
"metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n",
+ "0 -0.900681 1.019004 -1.340227 -1.315444 \n",
+ "1 -1.143017 -0.131979 -1.340227 -1.315444 \n",
+ "2 -1.385353 0.328414 -1.397064 -1.315444 \n",
+ "3 -1.506521 0.098217 -1.283389 -1.315444 \n",
+ "4 -1.021849 1.249201 -1.340227 -1.315444 \n",
+ "\n",
+ " species \n",
+ "0 0 \n",
+ "1 0 \n",
+ "2 0 \n",
+ "3 0 \n",
+ "4 0 \n"
+ ]
+ }
+ ],
"source": [
- "#### **Question 4:** \n",
- "#### K-means clustering \n",
- "Apply the K-Means clustering algorithm to the Iris dataset.\n",
- "Choose the number of clusters (K=3, since there are three species) and fit the model.\n",
- "Assign cluster labels to the original data and add them as a new column in the DataFrame."
+ "# Standardize the measurement columns with scikit-learn's StandardScaler\n",
+ "scaler = StandardScaler()\n",
+ "\n",
+ "# Every column except the last one (species) is a feature to be scaled\n",
+ "scaled_features = scaler.fit_transform(iris_df.iloc[:, :-1])\n",
+ "\n",
+ "# Rebuild a labelled DataFrame and re-attach the unscaled species codes as the last column\n",
+ "scaled_iris_df = pd.DataFrame(\n",
+ "    data=scaled_features, columns=iris_data.feature_names\n",
+ ").assign(species=iris_df['species'].to_numpy())\n",
+ "\n",
+ "# Preview the first rows; print() is kept so the cell\n",
+ "# still emits plain-text stdout output\n",
+ "print(scaled_iris_df.head())"
]
},
{
- "cell_type": "code",
- "execution_count": null,
- "id": "909df219",
+ "cell_type": "markdown",
+ "id": "b326e039",
"metadata": {},
- "outputs": [],
"source": [
- "# Your code here ..."
+ "Why is it important to standardize the features of a dataset before applying clustering algorithms like K-Means? Discuss the implications of using unstandardized data in your analysis."
]
},
{
"cell_type": "markdown",
- "id": "0aefdee5",
+ "id": "fc34b4b7",
"metadata": {},
"source": [
- "Discuss the results of the K-Means clustering. How well did the clusters match the true species?"
+ "> Your answer here ... "
]
},
{
"cell_type": "markdown",
- "id": "7bcebc16",
+ "id": "4604ee03",
"metadata": {},
"source": [
- "> Your answer here ..."
+ "#### **Question 4:** \n",
+ "#### K-means clustering \n",
+ "Apply the K-Means clustering algorithm to the Iris dataset.\n",
+ "Choose the number of clusters (K=3, since there are three species) and fit the model.\n",
+ "Assign cluster labels to the original data and add them as a new column in the DataFrame."
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "cbca5c4b",
+ "id": "909df219",
"metadata": {},
"outputs": [],
"source": [
- "# Initialize the StandardScaler\n",
- "scaler = StandardScaler()\n",
- "\n",
- "# Scale the features (excluding the species column)\n",
- "scaled_features = scaler.fit_transform(iris_df.iloc[:, :-1])\n",
- "\n",
- "# Create a new DataFrame with scaled features\n",
- "scaled_iris_df = pd.DataFrame(scaled_features, columns=iris_data.feature_names)\n",
- "\n",
- "# Add the species column back to the scaled DataFrame\n",
- "scaled_iris_df['species'] = iris_df['species'].values\n",
- "\n",
- "# Display the first few rows of the scaled DataFrame\n",
- "print(scaled_iris_df.head())"
+ "# Your code here ..."
]
},
{
"cell_type": "markdown",
- "id": "68f4231e",
+ "id": "0aefdee5",
"metadata": {},
"source": [
- "Why is it important to standardize the features of a dataset before applying clustering algorithms like K-Means? Discuss the implications of using unstandardized data in your analysis."
+ "Discuss the results of the K-Means clustering. How well did the clusters match the true species?"
]
},
{
"cell_type": "markdown",
- "id": "057ec7e9",
+ "id": "7bcebc16",
"metadata": {},
"source": [
- "> Your answer here ... "
+ "> Your answer here ..."
]
},
{
@@ -216,11 +393,9 @@
"metadata": {},
"source": [
"#### **Question 5:** \n",
- "#### Bootstrapping for Cluster Stability.\n",
+ "#### Bootstrapping \n",
"\n",
- "Implement bootstrapping to assess the stability of the clusters obtained from K-Means.\n",
- "Generate 100 bootstrap samples from the original dataset.\n",
- "For each bootstrap sample, fit the K-Means model and record the cluster labels."
+ " Implement bootstrapping on the mean of one of the sepal or petal measurement variables (e.g., Sepal Length, Petal Width) to assess the stability of the mean estimate. Generate 1000 bootstrap samples, calculate the mean for each sample, and compute a 95% confidence interval to evaluate the variability in the estimate."
]
},
{
@@ -238,7 +413,7 @@
"id": "29096311",
"metadata": {},
"source": [
- "Reflect on the stability of the clusters based on the bootstrapping results. Are there samples that consistently change clusters across bootstraps?"
+ "Reflect on the variability observed in the bootstrapped means and discuss whether the mean of the selected variable appears to be a stable and reliable estimate based on the confidence interval and the spread of the bootstrapped means."
]
},
{
@@ -262,7 +437,7 @@
"| **Data Inspection** | Data is thoroughly inspected for the number of variables, observations, and data types, and relevant insights are noted. | Data inspection is missing or lacks detail. |\n",
"| **Data Visualization** | Visualizations (e.g., scatter plots) are well-constructed and correctly interpreted to explore relationships between features and species. | Visualizations are poorly constructed or not correctly interpreted. |\n",
"| **Clustering Implementation** | K-Means clustering is correctly implemented, and cluster labels are appropriately assigned to the dataset. | K-Means clustering is missing or incorrectly implemented. |\n",
- "| **Bootstrapping Process** | Bootstrapping is correctly performed, and results are used to assess cluster stability. | Bootstrapping is missing or incorrectly performed. |"
+ "| **Bootstrapping Process** | Bootstrapping is correctly performed, and results are used to assess variable mean stability. | Bootstrapping is missing or incorrectly performed. |"
]
},
{
From 93c442fa4783ef8f8339709d37ed8f4d1b417d2f Mon Sep 17 00:00:00 2001
From: Polar
Date: Thu, 26 Sep 2024 17:56:20 -0400
Subject: [PATCH 4/5] homework 1
---
02_activities/assignments/assignment_1.ipynb | 8 ++++++++
1 file changed, 8 insertions(+)
diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb
index 0ca7965b..ccebf248 100644
--- a/02_activities/assignments/assignment_1.ipynb
+++ b/02_activities/assignments/assignment_1.ipynb
@@ -359,6 +359,14 @@
"\n",
"If you encounter any difficulties or have questions, please don't hesitate to reach out to our team via our Slack at `#cohort-4-help`. Our Technical Facilitators and Learning Support staff are here to help you navigate any challenges.\n"
]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e44cb44b",
+ "metadata": {},
+ "source": [
+ "Thanks Vishnou"
+ ]
}
],
"metadata": {
From df13359976b7d977b0d39eaee7334e4d51d92bbf Mon Sep 17 00:00:00 2001
From: Polar
Date: Thu, 26 Sep 2024 18:43:22 -0400
Subject: [PATCH 5/5] Change Assignment--1
---
02_activities/assignments/assignment_1.ipynb | 478 +++++++++++++++++--
1 file changed, 450 insertions(+), 28 deletions(-)
diff --git a/02_activities/assignments/assignment_1.ipynb b/02_activities/assignments/assignment_1.ipynb
index ccebf248..cd3abbf6 100644
--- a/02_activities/assignments/assignment_1.ipynb
+++ b/02_activities/assignments/assignment_1.ipynb
@@ -34,7 +34,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 15,
"id": "4a3485d6-ba58-4660-a983-5680821c5719",
"metadata": {},
"outputs": [],
@@ -56,10 +56,288 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 16,
"id": "a431d282-f9ca-4d5d-8912-71ffc9d8ea19",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
alcohol
\n",
+ "
malic_acid
\n",
+ "
ash
\n",
+ "
alcalinity_of_ash
\n",
+ "
magnesium
\n",
+ "
total_phenols
\n",
+ "
flavanoids
\n",
+ "
nonflavanoid_phenols
\n",
+ "
proanthocyanins
\n",
+ "
color_intensity
\n",
+ "
hue
\n",
+ "
od280/od315_of_diluted_wines
\n",
+ "
proline
\n",
+ "
class
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "
\n",
+ "
0
\n",
+ "
14.23
\n",
+ "
1.71
\n",
+ "
2.43
\n",
+ "
15.6
\n",
+ "
127.0
\n",
+ "
2.80
\n",
+ "
3.06
\n",
+ "
0.28
\n",
+ "
2.29
\n",
+ "
5.64
\n",
+ "
1.04
\n",
+ "
3.92
\n",
+ "
1065.0
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
1
\n",
+ "
13.20
\n",
+ "
1.78
\n",
+ "
2.14
\n",
+ "
11.2
\n",
+ "
100.0
\n",
+ "
2.65
\n",
+ "
2.76
\n",
+ "
0.26
\n",
+ "
1.28
\n",
+ "
4.38
\n",
+ "
1.05
\n",
+ "
3.40
\n",
+ "
1050.0
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
2
\n",
+ "
13.16
\n",
+ "
2.36
\n",
+ "
2.67
\n",
+ "
18.6
\n",
+ "
101.0
\n",
+ "
2.80
\n",
+ "
3.24
\n",
+ "
0.30
\n",
+ "
2.81
\n",
+ "
5.68
\n",
+ "
1.03
\n",
+ "
3.17
\n",
+ "
1185.0
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
3
\n",
+ "
14.37
\n",
+ "
1.95
\n",
+ "
2.50
\n",
+ "
16.8
\n",
+ "
113.0
\n",
+ "
3.85
\n",
+ "
3.49
\n",
+ "
0.24
\n",
+ "
2.18
\n",
+ "
7.80
\n",
+ "
0.86
\n",
+ "
3.45
\n",
+ "
1480.0
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
4
\n",
+ "
13.24
\n",
+ "
2.59
\n",
+ "
2.87
\n",
+ "
21.0
\n",
+ "
118.0
\n",
+ "
2.80
\n",
+ "
2.69
\n",
+ "
0.39
\n",
+ "
1.82
\n",
+ "
4.32
\n",
+ "
1.04
\n",
+ "
2.93
\n",
+ "
735.0
\n",
+ "
0
\n",
+ "
\n",
+ "
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
...
\n",
+ "
\n",
+ "
\n",
+ "
173
\n",
+ "
13.71
\n",
+ "
5.65
\n",
+ "
2.45
\n",
+ "
20.5
\n",
+ "
95.0
\n",
+ "
1.68
\n",
+ "
0.61
\n",
+ "
0.52
\n",
+ "
1.06
\n",
+ "
7.70
\n",
+ "
0.64
\n",
+ "
1.74
\n",
+ "
740.0
\n",
+ "
2
\n",
+ "
\n",
+ "
\n",
+ "
174
\n",
+ "
13.40
\n",
+ "
3.91
\n",
+ "
2.48
\n",
+ "
23.0
\n",
+ "
102.0
\n",
+ "
1.80
\n",
+ "
0.75
\n",
+ "
0.43
\n",
+ "
1.41
\n",
+ "
7.30
\n",
+ "
0.70
\n",
+ "
1.56
\n",
+ "
750.0
\n",
+ "
2
\n",
+ "
\n",
+ "
\n",
+ "
175
\n",
+ "
13.27
\n",
+ "
4.28
\n",
+ "
2.26
\n",
+ "
20.0
\n",
+ "
120.0
\n",
+ "
1.59
\n",
+ "
0.69
\n",
+ "
0.43
\n",
+ "
1.35
\n",
+ "
10.20
\n",
+ "
0.59
\n",
+ "
1.56
\n",
+ "
835.0
\n",
+ "
2
\n",
+ "
\n",
+ "
\n",
+ "
176
\n",
+ "
13.17
\n",
+ "
2.59
\n",
+ "
2.37
\n",
+ "
20.0
\n",
+ "
120.0
\n",
+ "
1.65
\n",
+ "
0.68
\n",
+ "
0.53
\n",
+ "
1.46
\n",
+ "
9.30
\n",
+ "
0.60
\n",
+ "
1.62
\n",
+ "
840.0
\n",
+ "
2
\n",
+ "
\n",
+ "
\n",
+ "
177
\n",
+ "
14.13
\n",
+ "
4.10
\n",
+ "
2.74
\n",
+ "
24.5
\n",
+ "
96.0
\n",
+ "
2.05
\n",
+ "
0.76
\n",
+ "
0.56
\n",
+ "
1.35
\n",
+ "
9.20
\n",
+ "
0.61
\n",
+ "
1.60
\n",
+ "
560.0
\n",
+ "
2
\n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
178 rows × 14 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " alcohol malic_acid ash alcalinity_of_ash magnesium total_phenols \\\n",
+ "0 14.23 1.71 2.43 15.6 127.0 2.80 \n",
+ "1 13.20 1.78 2.14 11.2 100.0 2.65 \n",
+ "2 13.16 2.36 2.67 18.6 101.0 2.80 \n",
+ "3 14.37 1.95 2.50 16.8 113.0 3.85 \n",
+ "4 13.24 2.59 2.87 21.0 118.0 2.80 \n",
+ ".. ... ... ... ... ... ... \n",
+ "173 13.71 5.65 2.45 20.5 95.0 1.68 \n",
+ "174 13.40 3.91 2.48 23.0 102.0 1.80 \n",
+ "175 13.27 4.28 2.26 20.0 120.0 1.59 \n",
+ "176 13.17 2.59 2.37 20.0 120.0 1.65 \n",
+ "177 14.13 4.10 2.74 24.5 96.0 2.05 \n",
+ "\n",
+ " flavanoids nonflavanoid_phenols proanthocyanins color_intensity hue \\\n",
+ "0 3.06 0.28 2.29 5.64 1.04 \n",
+ "1 2.76 0.26 1.28 4.38 1.05 \n",
+ "2 3.24 0.30 2.81 5.68 1.03 \n",
+ "3 3.49 0.24 2.18 7.80 0.86 \n",
+ "4 2.69 0.39 1.82 4.32 1.04 \n",
+ ".. ... ... ... ... ... \n",
+ "173 0.61 0.52 1.06 7.70 0.64 \n",
+ "174 0.75 0.43 1.41 7.30 0.70 \n",
+ "175 0.69 0.43 1.35 10.20 0.59 \n",
+ "176 0.68 0.53 1.46 9.30 0.60 \n",
+ "177 0.76 0.56 1.35 9.20 0.61 \n",
+ "\n",
+ " od280/od315_of_diluted_wines proline class \n",
+ "0 3.92 1065.0 0 \n",
+ "1 3.40 1050.0 0 \n",
+ "2 3.17 1185.0 0 \n",
+ "3 3.45 1480.0 0 \n",
+ "4 2.93 735.0 0 \n",
+ ".. ... ... ... \n",
+ "173 1.74 740.0 2 \n",
+ "174 1.56 750.0 2 \n",
+ "175 1.56 835.0 2 \n",
+ "176 1.62 840.0 2 \n",
+ "177 1.60 560.0 2 \n",
+ "\n",
+ "[178 rows x 14 columns]"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
"from sklearn.datasets import load_wine\n",
"\n",
@@ -91,12 +369,23 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 17,
"id": "56916892",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "178"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Your answer here"
+ "178 # Your answer here"
]
},
{
@@ -109,12 +398,23 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
"id": "df0ef103",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "14"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Your answer here"
+ "14 # Your answer here"
]
},
{
@@ -127,12 +427,23 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 19,
"id": "47989426",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'integer'"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Your answer here"
+ "'integer' # Your answer here"
]
},
{
@@ -146,12 +457,23 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 20,
"id": "bd7b0910",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "13"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "# Your answer here"
+ "13 # Your answer here"
]
},
{
@@ -175,10 +497,37 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 21,
"id": "cc899b59",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " alcohol malic_acid ash alcalinity_of_ash magnesium \\\n",
+ "0 1.518613 -0.562250 0.232053 -1.169593 1.913905 \n",
+ "1 0.246290 -0.499413 -0.827996 -2.490847 0.018145 \n",
+ "2 0.196879 0.021231 1.109334 -0.268738 0.088358 \n",
+ "3 1.691550 -0.346811 0.487926 -0.809251 0.930918 \n",
+ "4 0.295700 0.227694 1.840403 0.451946 1.281985 \n",
+ "\n",
+ " total_phenols flavanoids nonflavanoid_phenols proanthocyanins \\\n",
+ "0 0.808997 1.034819 -0.659563 1.224884 \n",
+ "1 0.568648 0.733629 -0.820719 -0.544721 \n",
+ "2 0.808997 1.215533 -0.498407 2.135968 \n",
+ "3 2.491446 1.466525 -0.981875 1.032155 \n",
+ "4 0.808997 0.663351 0.226796 0.401404 \n",
+ "\n",
+ " color_intensity hue od280/od315_of_diluted_wines proline \n",
+ "0 0.251717 0.362177 1.847920 1.013009 \n",
+ "1 -0.293321 0.406051 1.113449 0.965242 \n",
+ "2 0.269020 0.318304 0.788587 1.395148 \n",
+ "3 1.186068 -0.427544 1.184071 2.334574 \n",
+ "4 -0.319276 0.362177 0.449601 -0.037874 \n"
+ ]
+ }
+ ],
"source": [
"# Select predictors (excluding the last column)\n",
"predictors = wine_df.iloc[:, :-1]\n",
@@ -204,7 +553,12 @@
"id": "403ef0bb",
"metadata": {},
"source": [
- "> Your answer here..."
+ "Standardizing predictor variables (also known as z-score normalization) is crucial for several reasons:\n",
+ "\n",
+ "- **Equal weighting:** Many statistical methods (like k-means clustering or regularized regression) are sensitive to the scale of the data. If predictor variables are on different scales, those with larger ranges can dominate the model, leading to biased results.\n",
+ "- **Improved convergence:** Algorithms that rely on gradient descent (e.g., logistic regression, neural networks) benefit from standardized variables, as it helps them converge more quickly and effectively.\n",
+ "- **Interpretability:** Standardization allows for easier comparison between the effects of different predictors, as they are now on the same scale.\n",
+ "- **Distance metrics:** In algorithms that use distance metrics (like k-NN), standardization ensures that all dimensions contribute equally to the distance calculations."
]
},
{
@@ -220,7 +574,11 @@
"id": "fdee5a15",
"metadata": {},
"source": [
- "> Your answer here..."
+ "The response variable \"Class\" is a categorical label that takes one of three values (0, 1, or 2, one per wine cultivar), so it does not require standardization because:\n",
+ "\n",
+ "- **Categorical nature:** Standardizing a class label would distort its interpretation. It changes the scale of a variable whose values are only names for distinct classes, not quantities.\n",
+ "- **Model compatibility:** Classification algorithms (like k-NN or logistic regression) are designed to handle class labels directly without the need for standardization. The model treats the original values 0, 1, and 2 as three separate categories.\n",
+ "- **Avoiding misinterpretation:** If you standardize the class labels, you introduce values that don't correspond to the original categories, complicating interpretation."
]
},
{
@@ -236,7 +594,15 @@
"id": "f0676c21",
"metadata": {},
"source": [
- "> Your answer here..."
+ "Setting a random seed is important because:\n",
+ "- **Reproducibility:** It ensures that your results can be replicated. When you set a seed, the random number generator produces the same sequence of numbers every time, leading to consistent results across runs.\n",
+ "- **Controlled experiments:** In machine learning and statistical experiments, random processes (like data splitting or initializing weights) can introduce variability. A seed controls this variability, allowing for fair comparisons between models or configurations.\n",
+ "\n",
+ "Regarding the particular seed value:\n",
+ "- **Not significantly important:** The specific number chosen as the seed (e.g., 42, 123) does not inherently affect the analysis; what matters is that the same seed is used across experiments for consistency.\n",
+ "- **Different seeds for variation:** Different seed values can lead to different splits or initializations, which might help in assessing the robustness of your model. It can be beneficial to try multiple seeds to ensure your results are stable across different random configurations.\n",
+ "\n",
+ "In summary, standardizing predictors enhances model performance, keeping the response variable as-is preserves its categorical nature, and setting a seed is vital for reproducibility without concern for the specific seed value itself."
]
},
{
@@ -251,7 +617,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 22,
"id": "72c101f2",
"metadata": {},
"outputs": [],
@@ -282,12 +648,46 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 25,
"id": "08818c64",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The best value for n_neighbors is: 11\n"
+ ]
+ }
+ ],
"source": [
- "# Your code here..."
+ "import numpy as np\n",
+ "from sklearn.datasets import load_wine\n",
+ "from sklearn.model_selection import GridSearchCV, train_test_split\n",
+ "from sklearn.neighbors import KNeighborsClassifier\n",
+ "from sklearn.preprocessing import StandardScaler\n",
+ "\n",
+ "# Load the Wine data analysed throughout this assignment (not Iris)\n",
+ "data = load_wine()\n",
+ "X, y = data.data, data.target\n",
+ "\n",
+ "# Hold out 20% of the rows as a test set; the fixed random_state keeps the split reproducible\n",
+ "X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n",
+ "\n",
+ "# KNN is distance-based, so standardize using statistics learned from the training rows only\n",
+ "knn_scaler = StandardScaler().fit(X_train_raw)\n",
+ "X_train, X_test = knn_scaler.transform(X_train_raw), knn_scaler.transform(X_test_raw)\n",
+ "\n",
+ "# Steps 1-2: the KNN classifier and the n_neighbors candidates to try (1 through 50)\n",
+ "knn = KNeighborsClassifier()\n",
+ "param_grid = {'n_neighbors': np.arange(1, 51)}\n",
+ "\n",
+ "# Step 3: 10-fold cross-validated grid search scored by accuracy, fitted on the training set only\n",
+ "grid_search = GridSearchCV(knn, param_grid, cv=10, scoring='accuracy').fit(X_train, y_train)\n",
+ "\n",
+ "# Step 4: Report the n_neighbors value with the best mean cross-validated accuracy\n",
+ "best_n_neighbors = grid_search.best_params_['n_neighbors']\n",
+ "print(f\"The best value for n_neighbors is: {best_n_neighbors}\")"
]
},
{
@@ -303,12 +703,34 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 26,
"id": "ffefa9f2",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Model accuracy on the test set: 1.00\n"
+ ]
+ }
+ ],
"source": [
- "# Your code here..."
+ "from sklearn.metrics import accuracy_score\n",
+ "\n",
+ "# Relies on X_train, X_test, y_train, y_test and best_n_neighbors,\n",
+ "# all of which are created by the grid-search cell above.\n",
+ "\n",
+ "# Refit a KNN classifier on the training set with the selected n_neighbors\n",
+ "best_knn = KNeighborsClassifier(n_neighbors=best_n_neighbors).fit(X_train, y_train)\n",
+ "\n",
+ "# Predict class labels for the held-out test rows\n",
+ "y_pred = best_knn.predict(X_test)\n",
+ "\n",
+ "# Accuracy = share of test rows whose predicted label matches the true label\n",
+ "accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)\n",
+ "# Reported to two decimal places\n",
+ "print(f\"Model accuracy on the test set: {accuracy:.2f}\")"
]
},
{
@@ -385,7 +807,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.19"
+ "version": "3.9.15"
},
"vscode": {
"interpreter": {