data-bootcamp-v4 · KlaudijaK · Sep 4, 2025
diff --git a/lab-hypothesis-testing.ipynb b/lab-hypothesis-testing.ipynb
@@ -51,7 +51,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [
     {
@@ -278,7 +278,7 @@
        "[800 rows x 11 columns]"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 2,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -288,6 +288,42 @@
     "df"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 800 entries, 0 to 799\n",
+      "Data columns (total 11 columns):\n",
+      " #   Column      Non-Null Count  Dtype \n",
+      "---  ------      --------------  ----- \n",
+      " 0   Name        799 non-null    object\n",
+      " 1   Type 1      800 non-null    object\n",
+      " 2   Type 2      414 non-null    object\n",
+      " 3   HP          800 non-null    int64 \n",
+      " 4   Attack      800 non-null    int64 \n",
+      " 5   Defense     800 non-null    int64 \n",
+      " 6   Sp. Atk     800 non-null    int64 \n",
+      " 7   Sp. Def     800 non-null    int64 \n",
+      " 8   Speed       800 non-null    int64 \n",
+      " 9   Generation  800 non-null    int64 \n",
+      " 10  Legendary   800 non-null    bool  \n",
+      "dtypes: bool(1), int64(7), object(3)\n",
+      "memory usage: 63.4+ KB\n"
+     ]
+    }
+   ],
+   "source": [
+    "df.info()\n",
+    "\n",
+    "\n"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -297,11 +333,33 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 8,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Reject the null hypothesis: Dragon-type Pokémon have significantly higher HP on average than Grass-type.\n"
+     ]
+    }
+   ],
    "source": [
-    "#code here"
+    "import pandas as pd\n",
+    "from scipy import stats\n",
+    "\n",
+    "# H0: mean HP(Dragon) <= mean HP(Grass)   vs   H1: mean HP(Dragon) > mean HP(Grass)\n",
+    "df_dragon = df[df['Type 1'] == 'Dragon']\n",
+    "df_grass = df[df['Type 1'] == 'Grass']\n",
+    "\n",
+    "# One-sided Welch t-test: the claim is directional ('higher'), and equal variances are not assumed\n",
+    "t_stat, p_value = stats.ttest_ind(df_dragon['HP'], df_grass['HP'], equal_var=False, alternative='greater')\n",
+    "\n",
+    "# Compare the one-sided p-value with the 5% significance level\n",
+    "if p_value < 0.05:\n",
+    "    print(\"Reject the null hypothesis: Dragon-type Pokémon have significantly higher HP on average than Grass-type.\")\n",
+    "else:\n",
+    "    print(\"Fail to reject the null hypothesis: No evidence that Dragon-type Pokémon have higher HP on average than Grass-type.\")"
    ]
   },
   {
@@ -313,11 +371,36 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 9,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Reject the null hypothesis for HP: Legendary Pokémon have significantly different HP compared to Non-Legendary.\n",
+      "Reject the null hypothesis for Attack: Legendary Pokémon have significantly different Attack compared to Non-Legendary.\n",
+      "Reject the null hypothesis for Defense: Legendary Pokémon have significantly different Defense compared to Non-Legendary.\n",
+      "Reject the null hypothesis for Sp. Atk: Legendary Pokémon have significantly different Sp. Atk compared to Non-Legendary.\n",
+      "Reject the null hypothesis for Sp. Def: Legendary Pokémon have significantly different Sp. Def compared to Non-Legendary.\n",
+      "Reject the null hypothesis for Speed: Legendary Pokémon have significantly different Speed compared to Non-Legendary.\n"
+     ]
+    }
+   ],
    "source": [
-    "#code here"
+    "legendary_pokemon = df[df['Legendary']]\n",
+    "non_legendary_pokemon = df[~df['Legendary']]\n",
+    "\n",
+    "# Stats columns to compare\n",
+    "stats_columns = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']\n",
+    "\n",
+    "# Two-sided Welch t-test per stat (equal_var=False: the two groups are not assumed to share a variance)\n",
+    "for stat in stats_columns:\n",
+    "    t_stat, p_value = stats.ttest_ind(legendary_pokemon[stat], non_legendary_pokemon[stat], equal_var=False)\n",
+    "    if p_value < 0.05:\n",
+    "        print(f\"Reject the null hypothesis for {stat}: Legendary Pokémon have significantly different {stat} compared to Non-Legendary.\")\n",
+    "    else:\n",
+    "        print(f\"Fail to reject the null hypothesis for {stat}: No significant difference in {stat} between Legendary and Non-Legendary.\")"
    ]
   },
   {
@@ -337,7 +420,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -453,7 +536,7 @@
        "4       624.0       262.0         1.9250             65500.0  "
       ]
      },
-     "execution_count": 5,
+     "execution_count": 10,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -463,6 +546,38 @@
     "df.head()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 17000 entries, 0 to 16999\n",
+      "Data columns (total 9 columns):\n",
+      " #   Column              Non-Null Count  Dtype  \n",
+      "---  ------              --------------  -----  \n",
+      " 0   longitude           17000 non-null  float64\n",
+      " 1   latitude            17000 non-null  float64\n",
+      " 2   housing_median_age  17000 non-null  float64\n",
+      " 3   total_rooms         17000 non-null  float64\n",
+      " 4   total_bedrooms      17000 non-null  float64\n",
+      " 5   population          17000 non-null  float64\n",
+      " 6   households          17000 non-null  float64\n",
+      " 7   median_income       17000 non-null  float64\n",
+      " 8   median_house_value  17000 non-null  float64\n",
+      "dtypes: float64(9)\n",
+      "memory usage: 1.2 MB\n"
+     ]
+    }
+   ],
+   "source": [
+    "df.info()"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -483,10 +598,45 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 12,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Reject the null hypothesis: Houses close to a school/hospital have significantly different median house values.\n"
+     ]
+    }
+   ],
+   "source": [
+    "def euclidean_distance(lon1, lat1, lon2, lat2):\n",
+    "    \"\"\"Return the planar (not great-circle) distance between (lon1, lat1) and (lon2, lat2), in coordinate units.\"\"\"\n",
+    "    return np.sqrt((lon1 - lon2)**2 + (lat1 - lat2)**2)\n",
+    "\n",
+    "# School and hospital coordinates as (longitude, latitude)\n",
+    "school_coords = (-118, 34)\n",
+    "hospital_coords = (-122, 37)\n",
+    "\n",
+    "# Distance from every row of `df` to each reference point (adds two columns to `df`)\n",
+    "df['distance_to_school'] = euclidean_distance(df['longitude'], df['latitude'], *school_coords)\n",
+    "df['distance_to_hospital'] = euclidean_distance(df['longitude'], df['latitude'], *hospital_coords)\n",
+    "\n",
+    "# Define \"close\" as having a distance of less than 0.50 to either a school or a hospital\n",
+    "df['close_to_school_or_hospital'] = (df['distance_to_school'] < 0.50) | (df['distance_to_hospital'] < 0.50)\n",
+    "\n",
+    "close_houses = df[df['close_to_school_or_hospital']]\n",
+    "far_houses = df[~df['close_to_school_or_hospital']]\n",
+    "\n",
+    "# Two-sided Welch t-test (equal_var=False: the two groups are not assumed to share a variance)\n",
+    "t_stat, p_value = stats.ttest_ind(close_houses['median_house_value'], far_houses['median_house_value'], equal_var=False)\n",
+    "\n",
+    "# Interpret the result at the 5% significance level\n",
+    "if p_value < 0.05:\n",
+    "    print(\"Reject the null hypothesis: Houses close to a school/hospital have significantly different median house values.\")\n",
+    "else:\n",
+    "    print(\"Fail to reject the null hypothesis: No significant difference in median house values between houses close to and far from a school/hospital.\")"
+   ]
   },
   {
    "cell_type": "code",
@@ -498,7 +648,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "base",
    "language": "python",
    "name": "python3"
   },
@@ -512,7 +662,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.12.7"
   }
  },
  "nbformat": 4,