data-bootcamp-v4 · arnaurr94 · Aug 28, 2025
diff --git a/lab-hypothesis-testing.ipynb b/lab-hypothesis-testing.ipynb
@@ -38,20 +38,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 19,
    "metadata": {},
    "outputs": [],
    "source": [
     "#libraries\n",
     "import pandas as pd\n",
     "import scipy.stats as st\n",
+    "from statsmodels.multivariate.manova import MANOVA\n",
     "import numpy as np\n",
     "\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 20,
    "metadata": {},
    "outputs": [
     {
@@ -278,7 +279,7 @@
        "[800 rows x 11 columns]"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -288,6 +289,17 @@
     "df"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Normalise column names: strip literal dots and spaces so every stat is a plain identifier usable in a formula (e.g. SpAtk)\n",
+    "# regex=False: '.' must be a literal dot; as a regex it would match every character and erase the names\n",
+    "df.columns = df.columns.str.replace('.', '', regex=False).str.replace(' ', '', regex=False)"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -297,11 +309,33 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 64,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "We reject the null hypotesis\n"
+     ]
+    }
+   ],
    "source": [
-    "#code here"
+    "# HP of every Pokémon that has Dragon (resp. Grass) as either of its two types\n",
+    "df_dragon = df.loc[(df[\"Type1\"] == \"Dragon\") | (df[\"Type2\"] == \"Dragon\"), \"HP\"]\n",
+    "df_grass = df.loc[(df[\"Type1\"] == \"Grass\") | (df[\"Type2\"] == \"Grass\"), \"HP\"]\n",
+    "\n",
+    "# Set the hypothesis (one-sided: H1 is the claim that alternative=\"greater\" tests)\n",
+    "# H0: mu_HP Dragon <= mu_HP Grass\n",
+    "# H1: mu_HP Dragon > mu_HP Grass\n",
+    "alpha = 0.05  # significance level\n",
+    "\n",
+    "# Welch's t-test: equal_var=False avoids assuming both groups share the same variance\n",
+    "p_value = st.ttest_ind(df_dragon, df_grass, equal_var=False, alternative=\"greater\").pvalue\n",
+    "if p_value > alpha:\n",
+    "    print(\"We are not able to reject the null hypothesis\")\n",
+    "else:\n",
+    "    print(\"We reject the null hypotesis\")"
    ]
   },
   {
@@ -313,11 +347,99 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "P value for HP: 0.0000\n",
+      "We reject the null hypotesis, HP is different for Legendary Pokémons\n",
+      "\n",
+      "P value for Attack: 0.0000\n",
+      "We reject the null hypotesis, Attack is different for Legendary Pokémons\n",
+      "\n",
+      "P value for Defense: 0.0000\n",
+      "We reject the null hypotesis, Defense is different for Legendary Pokémons\n",
+      "\n",
+      "P value for SpAtk: 0.0000\n",
+      "We reject the null hypotesis, SpAtk is different for Legendary Pokémons\n",
+      "\n",
+      "P value for SpDef: 0.0000\n",
+      "We reject the null hypotesis, SpDef is different for Legendary Pokémons\n",
+      "\n",
+      "P value for Speed: 0.0000\n",
+      "We reject the null hypotesis, Speed is different for Legendary Pokémons\n",
+      "\n"
+     ]
+    }
+   ],
    "source": [
-    "#code here"
+    "# Set the hypothesis (tested separately for each stat)\n",
+    "# H0: mu_stat Legendary = mu_stat non-Legendary\n",
+    "# H1: mu_stat Legendary != mu_stat non-Legendary\n",
+    "alpha = 0.05  # significance level\n",
+    "\n",
+    "stats_col = ['HP', 'Attack', 'Defense', 'SpAtk', 'SpDef', 'Speed']\n",
+    "# Build the two group masks once: they do not depend on the stat being tested\n",
+    "is_legendary = df[\"Legendary\"] == True\n",
+    "is_non_legendary = df[\"Legendary\"] == False\n",
+    "for stat in stats_col:\n",
+    "    df_legendary = df.loc[is_legendary, stat]\n",
+    "    df_non_legendary = df.loc[is_non_legendary, stat]\n",
+    "    p_value = st.f_oneway(df_legendary, df_non_legendary).pvalue\n",
+    "    print(f'P value for {stat}: {p_value:.4f}')\n",
+    "    if p_value > alpha:\n",
+    "        print(\"We are not able to reject the null hypothesis.\\n\")\n",
+    "    else:\n",
+    "        print(f\"We reject the null hypotesis, {stat} is different for Legendary Pokémons\\n\")\n",
+    "## However this only tests each stat INDIVIDUALLY (six separate tests), not the stats as a whole."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "                   Multivariate linear model\n",
+      "================================================================\n",
+      "                                                                \n",
+      "----------------------------------------------------------------\n",
+      "       Intercept         Value  Num DF  Den DF   F Value  Pr > F\n",
+      "----------------------------------------------------------------\n",
+      "          Wilks' lambda  0.0592 6.0000 793.0000 2100.8338 0.0000\n",
+      "         Pillai's trace  0.9408 6.0000 793.0000 2100.8338 0.0000\n",
+      " Hotelling-Lawley trace 15.8953 6.0000 793.0000 2100.8338 0.0000\n",
+      "    Roy's greatest root 15.8953 6.0000 793.0000 2100.8338 0.0000\n",
+      "----------------------------------------------------------------\n",
+      "                                                                \n",
+      "----------------------------------------------------------------\n",
+      "          Legendary        Value  Num DF  Den DF  F Value Pr > F\n",
+      "----------------------------------------------------------------\n",
+      "             Wilks' lambda 0.7331 6.0000 793.0000 48.1098 0.0000\n",
+      "            Pillai's trace 0.2669 6.0000 793.0000 48.1098 0.0000\n",
+      "    Hotelling-Lawley trace 0.3640 6.0000 793.0000 48.1098 0.0000\n",
+      "       Roy's greatest root 0.3640 6.0000 793.0000 48.1098 0.0000\n",
+      "================================================================\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "## MANOVA tests all six stats jointly against Legendary (H0: both groups share the same mean vector of stats).\n",
+    "## from_formula syntax is 'dep1 + dep2 + ... ~ factor': dependent variables on the left, grouping factor on the right.\n",
+    "dependent_vars = ['HP', 'Attack', 'Defense', 'SpAtk', 'SpDef', 'Speed']\n",
+    "manova_df = df[['Legendary'] + dependent_vars]  # select only the relevant columns\n",
+    "manova_formula = ' + '.join(dependent_vars) + ' ~ Legendary'\n",
+    "maov = MANOVA.from_formula(manova_formula, data=manova_df)\n",
+    "print(maov.mv_test())\n",
+    "\n",
+    "## Read the 'Legendary' block of the table (the 'Intercept' block is not the group comparison): all four statistics give Pr > F below 0.05, so we reject H0."
    ]
   },
   {
@@ -337,7 +459,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 43,
    "metadata": {},
    "outputs": [
     {
@@ -453,14 +575,14 @@
        "4       624.0       262.0         1.9250             65500.0  "
       ]
      },
-     "execution_count": 5,
+     "execution_count": 43,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n",
-    "df.head()"
+    "df_housing = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n",
+    "df_housing.head()"
    ]
   },
   {
@@ -483,22 +605,73 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 55,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "def get_distances(row):\n",
+    "    sch_coord = [-118, 34]\n",
+    "    hos_coord = [-122, 37]\n",
+    "\n",
+    "    dist_sch = np.sqrt((sch_coord[0] - row[\"longitude\"])**2 + (sch_coord[1] - row[\"latitude\"])**2)\n",
+    "    dist_hos = np.sqrt((hos_coord[0] - row[\"longitude\"])**2 + (hos_coord[1] - row[\"latitude\"])**2)\n",
+    "    return pd.Series({\n",
+    "                    'school_dist': 'Close' if dist_sch < 0.5 else 'Far',\n",
+    "                    'hospital_dist': 'Close' if dist_hos < 0.5 else 'Far'\n",
+    "    })"
+   ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 57,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "df_housing[['school_dist', 'hospital_dist']] = df_housing.apply(get_distances, axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "is_close = (df_housing['school_dist']=='Close') | (df_housing['hospital_dist']=='Close')  # close to the school OR the hospital\n",
+    "df_close, df_far = df_housing.loc[is_close, 'median_house_value'], df_housing.loc[~is_close, 'median_house_value']  # far = complement (far from BOTH), so the two samples never overlap"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "We reject the null hypotesis\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Hypotheses (one-sided, on the mean of median_house_value in each group)\n",
+    "# H0: median_house_value_close <= median_house_value_far\n",
+    "# H1: median_house_value_close > median_house_value_far\n",
+    "alpha = 0.05  # significance level\n",
+    "\n",
+    "# Welch's t-test: equal_var=False drops the equal-variance assumption\n",
+    "welch_result = st.ttest_ind(df_close, df_far, equal_var=False, alternative='greater')\n",
+    "p_value = welch_result.pvalue\n",
+    "cannot_reject = p_value > alpha\n",
+    "print(\n",
+    "    \"We are not able to reject the null hypothesis\" if cannot_reject else \"We reject the null hypotesis\"\n",
+    ")"
+   ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "base",
    "language": "python",
    "name": "python3"
   },
@@ -512,7 +685,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.13.5"
   }
  },
  "nbformat": 4,