From 1a7921d27283642663a6cd252294dbe81dca4261 Mon Sep 17 00:00:00 2001
From: Daniel R <daniel.razavi@utoronto.ca>
Date: Mon, 7 Oct 2024 14:32:16 -0400
Subject: [PATCH] Updating A3 based on Given Feedback

---
 02_activities/assignments/assignment_3.ipynb | 104 +++++++++++++------
 1 file changed, 71 insertions(+), 33 deletions(-)

diff --git a/02_activities/assignments/assignment_3.ipynb b/02_activities/assignments/assignment_3.ipynb
index 7b34dd44..a09de274 100644
--- a/02_activities/assignments/assignment_3.ipynb
+++ b/02_activities/assignments/assignment_3.ipynb
@@ -33,13 +33,7 @@
     "1. **Sepal Length**: The length of the sepal in centimeters.\n",
     "2. **Sepal Width**: The width of the sepal in centimeters.\n",
     "3. **Petal Length**: The length of the petal in centimeters.\n",
-    "4. **Petal Width**: The width of the petal in centimeters.\n",
-    "\n",
-    "##### Target Variable:\n",
-    "- **Species**: The species of the iris flower, which can take one of the following values:\n",
-    "  - 0: Iris setosa\n",
-    "  - 1: Iris versicolor\n",
-    "  - 2: Iris virginica"
+    "4. **Petal Width**: The width of the petal in centimeters."
    ]
   },
   {
@@ -85,13 +79,10 @@
     "# Convert to DataFrame\n",
     "iris_df = pd.DataFrame(iris_data.data, columns=iris_data.feature_names)\n",
     "\n",
-    "# Bind the disease progression (diabetes target) to the DataFrame\n",
-    "iris_df['species'] = iris_data.target\n",
-    "\n",
     "# Display the DataFrame\n",
-    "iris_df\n",
+    "print(iris_df)\n",
     "\n",
-    "# Your Code...\n"
+    "# Your code here..."
    ]
   },
   {
@@ -102,17 +93,62 @@
     "#### **Question 2:** \n",
     "#### Data-visualization\n",
     "\n",
-    "Create plots to visualize the relationships between the features (sepal length, sepal width, petal length, petal width).\n"
+    "Let's create plots to visualize the relationships between the features (sepal length, sepal width, petal length, petal width).\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "efd6dc0c",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Your code here ..."
+    "def plot_feature_pairs(data, feature_names, color_labels=None, title_prefix=''):\n",
+    "    \"\"\"\n",
+    "    Helper function to create scatter plots for all possible pairs of features.\n",
+    "    \n",
+    "    Parameters:\n",
+    "    - data: DataFrame containing the features to be plotted.\n",
+    "    - feature_names: List of feature names to be used in plotting.\n",
+    "    - color_labels: Optional. Cluster or class labels to color the scatter plots.\n",
+    "    - title_prefix: Optional. Prefix for plot titles to distinguish between different sets of plots.\n",
+    "    \"\"\"\n",
+    "    # Create a figure for the scatter plots\n",
+    "    plt.figure(figsize=(12, 10))\n",
+    "    \n",
+    "    # Counter for subplot index\n",
+    "    plot_number = 1\n",
+    "    \n",
+    "    # Loop through each pair of features\n",
+    "    for i in range(len(feature_names)):\n",
+    "        for j in range(i + 1, len(feature_names)):\n",
+    "            plt.subplot(len(feature_names)-1, len(feature_names)-1, plot_number)\n",
+    "            \n",
+    "            # Scatter plot colored by labels if provided\n",
+    "            if color_labels is not None:\n",
+    "                plt.scatter(data[feature_names[i]], data[feature_names[j]], \n",
+    "                            c=color_labels, cmap='viridis', alpha=0.7)\n",
+    "            else:\n",
+    "                plt.scatter(data[feature_names[i]], data[feature_names[j]], alpha=0.7)\n",
+    "            \n",
+    "            plt.xlabel(feature_names[i])\n",
+    "            plt.ylabel(feature_names[j])\n",
+    "            plt.title(f'{title_prefix}{feature_names[i]} vs {feature_names[j]}')\n",
+    "            \n",
+    "            # Increment the plot number\n",
+    "            plot_number += 1\n",
+    "\n",
+    "    # Adjust layout to prevent overlap\n",
+    "    plt.tight_layout()\n",
+    "\n",
+    "    # Show the plot\n",
+    "    plt.show()\n",
+    "\n",
+    "# Get feature names\n",
+    "feature_names = iris_df.columns\n",
+    "\n",
+    "# Use the helper function to plot scatter plots without coloring by cluster labels\n",
+    "plot_feature_pairs(iris_df, feature_names, title_prefix='Original Data: ')"
    ]
   },
   {
@@ -120,7 +156,7 @@
    "id": "a9701cd4",
    "metadata": {},
    "source": [
-    "**Follow-up Question:**\n",
+    "**Question:**\n",
     "- Do you notice any patterns or relationships between the different features? How might these patterns help in distinguishing between different species?"
    ]
   },
@@ -151,15 +187,12 @@
     "# Initialize the StandardScaler\n",
     "scaler = StandardScaler()\n",
     "\n",
-    "# Scale the features (excluding the species column)\n",
-    "scaled_features = scaler.fit_transform(iris_df.iloc[:, :-1])\n",
+    "# Scale all the features in the dataset\n",
+    "scaled_features = scaler.fit_transform(iris_df)\n",
     "\n",
     "# Create a new DataFrame with scaled features\n",
     "scaled_iris_df = pd.DataFrame(scaled_features, columns=iris_data.feature_names)\n",
     "\n",
-    "# Add the species column back to the scaled DataFrame\n",
-    "scaled_iris_df['species'] = iris_df['species'].values\n",
-    "\n",
     "# Display the first few rows of the scaled DataFrame\n",
     "print(scaled_iris_df.head())"
    ]
@@ -187,35 +220,42 @@
    "source": [
     "#### **Question 4:** \n",
     "#### K-means clustering \n",
-    "Apply the K-Means clustering algorithm to the Iris dataset.\n",
-    "Choose the number of clusters (K=3, since there are three species) and fit the model.\n",
-    "Assign cluster labels to the original data and add them as a new column in the DataFrame."
+    "\n",
+    "Apply the K-Means clustering algorithm to the Iris dataset. Choose the value 3 for the number of clusters (`k=3`) and fit the model. Assign the cluster labels to the original data as a new column named `Cluster`, and store the resulting DataFrame in `clustered_iris_data` so that the plotting code provided below can use it."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
    "id": "909df219",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Your answer..."
+    "# Your code here...\n",
+    "\n",
+    "clustered_iris_data = 🤷‍♂️\n",
+    "\n",
+    "\n",
+    "# Use the helper function to plot scatter plots, colored by cluster labels\n",
+    "plot_feature_pairs(clustered_iris_data, feature_names, color_labels=clustered_iris_data['Cluster'], title_prefix='Clustered Data: ')"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "0aefdee5",
+   "id": "46914737",
    "metadata": {},
    "source": [
-    "Discuss the results of the K-Means clustering. How well did the clusters match the true species?"
+    "In the previous question, we chose `k=3` for the number of clusters arbitrarily. However, in a real-world scenario, it is important to determine the optimal number of clusters using appropriate methods.\n",
+    "\n",
+    "**Question**: What is one method commonly used to determine the optimal number of clusters in K-means clustering, and why is this method helpful?"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "7bcebc16",
+   "id": "83349688",
    "metadata": {},
    "source": [
-    "> Your answer here ..."
+    "> Your answer here..."
    ]
   },
   {
@@ -226,9 +266,7 @@
     "#### **Question 5:** \n",
     "#### Bootstrapping \n",
     "\n",
-    "Implement bootstrapping on the mean of Petal Width. Generate 10000 bootstrap samples, calculate the mean for each sample, and compute a 90% confidence interval.\n",
-    "\n",
-    "hint: `use np.random.seed(123)`"
+    "Implement bootstrapping on the mean of Petal Width. Generate 10000 bootstrap samples, calculate the mean for each sample, and compute a 90% confidence interval."
    ]
   },
   {