From 1a7921d27283642663a6cd252294dbe81dca4261 Mon Sep 17 00:00:00 2001 From: Daniel R Date: Mon, 7 Oct 2024 14:32:16 -0400 Subject: [PATCH] Updating A3 based on Given Feedback --- 02_activities/assignments/assignment_3.ipynb | 104 +++++++++++++------ 1 file changed, 71 insertions(+), 33 deletions(-) diff --git a/02_activities/assignments/assignment_3.ipynb b/02_activities/assignments/assignment_3.ipynb index 7b34dd44..a09de274 100644 --- a/02_activities/assignments/assignment_3.ipynb +++ b/02_activities/assignments/assignment_3.ipynb @@ -33,13 +33,7 @@ "1. **Sepal Length**: The length of the sepal in centimeters.\n", "2. **Sepal Width**: The width of the sepal in centimeters.\n", "3. **Petal Length**: The length of the petal in centimeters.\n", - "4. **Petal Width**: The width of the petal in centimeters.\n", - "\n", - "##### Target Variable:\n", - "- **Species**: The species of the iris flower, which can take one of the following values:\n", - " - 0: Iris setosa\n", - " - 1: Iris versicolor\n", - " - 2: Iris virginica" + "4. **Petal Width**: The width of the petal in centimeters." ] }, { @@ -85,13 +79,10 @@ "# Convert to DataFrame\n", "iris_df = pd.DataFrame(iris_data.data, columns=iris_data.feature_names)\n", "\n", - "# Bind the disease progression (diabetes target) to the DataFrame\n", - "iris_df['species'] = iris_data.target\n", - "\n", "# Display the DataFrame\n", - "iris_df\n", + "print(iris_df)\n", "\n", - "# Your Code...\n" + "# Your code here..." ] }, { @@ -102,17 +93,62 @@ "#### **Question 2:** \n", "#### Data-visualization\n", "\n", - "Create plots to visualize the relationships between the features (sepal length, sepal width, petal length, petal width).\n" + "Let's create plots to visualize the relationships between the features (sepal length, sepal width, petal length, petal width).\n" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "efd6dc0c", "metadata": {}, "outputs": [], "source": [ - "# Your code here ..." + "def plot_feature_pairs(data, feature_names, color_labels=None, title_prefix=''):\n", + " \"\"\"\n", + " Helper function to create scatter plots for all possible pairs of features.\n", + " \n", + " Parameters:\n", + " - data: DataFrame containing the features to be plotted.\n", + " - feature_names: List of feature names to be used in plotting.\n", + " - color_labels: Optional. Cluster or class labels to color the scatter plots.\n", + " - title_prefix: Optional. Prefix for plot titles to distinguish between different sets of plots.\n", + " \"\"\"\n", + " # Create a figure for the scatter plots\n", + " plt.figure(figsize=(12, 10))\n", + " \n", + " # Counter for subplot index\n", + " plot_number = 1\n", + " \n", + " # Loop through each pair of features\n", + " for i in range(len(feature_names)):\n", + " for j in range(i + 1, len(feature_names)):\n", + " plt.subplot(len(feature_names)-1, len(feature_names)-1, plot_number)\n", + " \n", + " # Scatter plot colored by labels if provided\n", + " if color_labels is not None:\n", + " plt.scatter(data[feature_names[i]], data[feature_names[j]], \n", + " c=color_labels, cmap='viridis', alpha=0.7)\n", + " else:\n", + " plt.scatter(data[feature_names[i]], data[feature_names[j]], alpha=0.7)\n", + " \n", + " plt.xlabel(feature_names[i])\n", + " plt.ylabel(feature_names[j])\n", + " plt.title(f'{title_prefix}{feature_names[i]} vs {feature_names[j]}')\n", + " \n", + " # Increment the plot number\n", + " plot_number += 1\n", + "\n", + " # Adjust layout to prevent overlap\n", + " plt.tight_layout()\n", + "\n", + " # Show the plot\n", + " plt.show()\n", + "\n", + "# Get feature names\n", + "feature_names = iris_df.columns\n", + "\n", + "# Use the helper function to plot scatter plots without coloring by cluster labels\n", + "plot_feature_pairs(iris_df, feature_names, title_prefix='Original Data: ')" ] }, { @@ -120,7 +156,7 @@ "id": "a9701cd4", "metadata": {}, "source": [ - "**Follow-up Question:**\n", + "**Question:**\n", "- Do you notice any patterns or relationships between the different features? How might these patterns help in distinguishing between different species?" ] }, @@ -151,15 +187,12 @@ "# Initialize the StandardScaler\n", "scaler = StandardScaler()\n", "\n", - "# Scale the features (excluding the species column)\n", - "scaled_features = scaler.fit_transform(iris_df.iloc[:, :-1])\n", + "# Scale all the features in the dataset\n", + "scaled_features = scaler.fit_transform(iris_df)\n", "\n", "# Create a new DataFrame with scaled features\n", "scaled_iris_df = pd.DataFrame(scaled_features, columns=iris_data.feature_names)\n", "\n", - "# Add the species column back to the scaled DataFrame\n", - "scaled_iris_df['species'] = iris_df['species'].values\n", - "\n", "# Display the first few rows of the scaled DataFrame\n", "print(scaled_iris_df.head())" ] @@ -187,35 +220,42 @@ "source": [ "#### **Question 4:** \n", "#### K-means clustering \n", - "Apply the K-Means clustering algorithm to the Iris dataset.\n", - "Choose the number of clusters (K=3, since there are three species) and fit the model.\n", - "Assign cluster labels to the original data and add them as a new column in the DataFrame." + "\n", + "Apply the K-Means clustering algorithm to the Iris dataset. Choose the value 3 for the number of clusters (`k=3`) and fit the model. Assign cluster labels to the original data and add them as a new column in the DataFrame." ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "909df219", "metadata": {}, "outputs": [], "source": [ - "# Your answer..." + "# Your answer...\n", + "\n", + "clustered_iris_data = 🤷‍♂️\n", + "\n", + "\n", + "# Use the helper function to plot scatter plots, colored by cluster labels\n", + "plot_feature_pairs(clustered_iris_data, feature_names, color_labels=clustered_iris_data['Cluster'], title_prefix='Clustered Data: ')" ] }, { "cell_type": "markdown", - "id": "0aefdee5", + "id": "46914737", "metadata": {}, "source": [ - "Discuss the results of the K-Means clustering. How well did the clusters match the true species?" + "In the previous question, we chose `k=3` for the number of clusters arbitrarily. However, in a real-world scenario, it is important to determine the optimal number of clusters using appropriate methods.\n", + "\n", + "**Question**: What is one method commonly used to determine the optimal number of clusters in K-means clustering, and why is this method helpful?" ] }, { "cell_type": "markdown", - "id": "7bcebc16", + "id": "83349688", "metadata": {}, "source": [ - "> Your answer here ..." + "> Your answer here..." ] }, { @@ -226,9 +266,7 @@ "#### **Question 5:** \n", "#### Bootstrapping \n", "\n", - "Implement bootstrapping on the mean of Petal Width. Generate 10000 bootstrap samples, calculate the mean for each sample, and compute a 90% confidence interval.\n", - "\n", - "hint: `use np.random.seed(123)`" + "Implement bootstrapping on the mean of Petal Width. Generate 10000 bootstrap samples, calculate the mean for each sample, and compute a 90% confidence interval." ] }, {