From d0da0726b82c5f51a05fa802302420182e68bdf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arnau=20Rodr=C3=ADguez=20Rubio?= Date: Thu, 28 Aug 2025 19:22:37 +0200 Subject: [PATCH] Solved lab --- lab-hypothesis-testing.ipynb | 211 +++++++++++++++++++++++++++++++---- 1 file changed, 192 insertions(+), 19 deletions(-) diff --git a/lab-hypothesis-testing.ipynb b/lab-hypothesis-testing.ipynb index 0cc26d5..0ce8b8b 100644 --- a/lab-hypothesis-testing.ipynb +++ b/lab-hypothesis-testing.ipynb @@ -38,20 +38,21 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "#libraries\n", "import pandas as pd\n", "import scipy.stats as st\n", + "from statsmodels.multivariate.manova import MANOVA\n", "import numpy as np\n", "\n" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 20, "metadata": {}, "outputs": [ { @@ -278,7 +279,7 @@ "[800 rows x 11 columns]" ] }, - "execution_count": 3, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -288,6 +289,17 @@ "df" ] }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "# Normalise column names (regex=False: '.' must be matched literally, not as a regex wildcard)\n", + "df.columns = df.columns.str.replace('.', '', regex=False)\n", + "df.columns = df.columns.str.replace(' ', '', regex=False)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -297,11 +309,33 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 64, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "We reject the null hypotesis\n" + ] + } + ], "source": [ - "#code here" + "df_dragon = df[(df[\"Type1\"]==\"Dragon\") | (df[\"Type2\"]==\"Dragon\")][\"HP\"]\n", + "df_grass = df[(df[\"Type1\"]==\"Grass\") | (df[\"Type2\"]==\"Grass\")][\"HP\"]\n", + "\n", + "#Set the hypothesis (one-sided, matches alternative=\"greater\" below)\n", + "#H0: mu_HP Dragon <= mu_HP Grass\n", + "#H1: mu_HP Dragon > mu_HP Grass\n", + "#significance level = 0.05\n", + "alpha = 0.05\n", + "\n", + 
"p_value = st.ttest_ind(df_dragon, df_grass, equal_var =False, alternative = \"greater\")[1]\n", + "\n", + "if p_value > alpha:\n", + " print(\"We are not able to reject the null hypothesis\")\n", + "else:\n", + " print(\"We reject the null hypotesis\")" ] }, { @@ -313,11 +347,99 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "P value for HP: 0.0000\n", + "We reject the null hypotesis, HP is different for Legendary Pokémons\n", + "\n", + "P value for Attack: 0.0000\n", + "We reject the null hypotesis, Attack is different for Legendary Pokémons\n", + "\n", + "P value for Defense: 0.0000\n", + "We reject the null hypotesis, Defense is different for Legendary Pokémons\n", + "\n", + "P value for SpAtk: 0.0000\n", + "We reject the null hypotesis, SpAtk is different for Legendary Pokémons\n", + "\n", + "P value for SpDef: 0.0000\n", + "We reject the null hypotesis, SpDef is different for Legendary Pokémons\n", + "\n", + "P value for Speed: 0.0000\n", + "We reject the null hypotesis, Speed is different for Legendary Pokémons\n", + "\n" + ] + } + ], "source": [ - "#code here" + "#Set the hypothesis\n", + "\n", + "#H0: mu_stat Legendary = mu_stats non-Legendary\n", + "#H1: mu_stats Legendary != mu_stats non-Legendary\n", + "#significance level = 0.05\n", + "alpha = 0.05\n", + "\n", + "stats_col = ['HP', 'Attack', 'Defense', 'SpAtk', 'SpDef', 'Speed']\n", + "for stat in stats_col:\n", + " df_legendary = df[df[\"Legendary\"]==True][stat]\n", + " df_non_legendary = df[df[\"Legendary\"]==False][stat]\n", + " p_value = st.f_oneway(df_legendary, df_non_legendary)[1]\n", + " print(f'P value for {stat}: {p_value:.4f}')\n", + " if p_value > alpha:\n", + " print(f\"We are not able to reject the null hypothesis for {stat}.\\n\")\n", + " else:\n", + " print(f\"We reject the null hypotesis, {stat} is different for Legendary Pokémons\\n\")\n", + "\n", + "## 
However this only tests for INDIVIDUAL relationships, not as a whole." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Multivariate linear model\n", + "================================================================\n", + " \n", + "----------------------------------------------------------------\n", + " Intercept Value Num DF Den DF F Value Pr > F\n", + "----------------------------------------------------------------\n", + " Wilks' lambda 0.0592 6.0000 793.0000 2100.8338 0.0000\n", + " Pillai's trace 0.9408 6.0000 793.0000 2100.8338 0.0000\n", + " Hotelling-Lawley trace 15.8953 6.0000 793.0000 2100.8338 0.0000\n", + " Roy's greatest root 15.8953 6.0000 793.0000 2100.8338 0.0000\n", + "----------------------------------------------------------------\n", + " \n", + "----------------------------------------------------------------\n", + " Legendary Value Num DF Den DF F Value Pr > F\n", + "----------------------------------------------------------------\n", + " Wilks' lambda 0.7331 6.0000 793.0000 48.1098 0.0000\n", + " Pillai's trace 0.2669 6.0000 793.0000 48.1098 0.0000\n", + " Hotelling-Lawley trace 0.3640 6.0000 793.0000 48.1098 0.0000\n", + " Roy's greatest root 0.3640 6.0000 793.0000 48.1098 0.0000\n", + "================================================================\n", + "\n" + ] + } + ], + "source": [ + "## MANOVA tests all six stats jointly. H0: the mean vector (HP, Attack, Defense, SpAtk, SpDef, Speed) is the same for Legendary and non-Legendary Pokémon.\n", + "## Read the 'Legendary' block of the output: it is the group effect. 
The 'Intercept' block only tests whether the baseline (non-Legendary) means are zero. Syntax drafted with ChatGPT.\n", + "\n", + "# Select only relevant columns\n", + "manova_df = df[['Legendary', 'HP', 'Attack', 'Defense', 'SpAtk', 'SpDef', 'Speed']]\n", + "maov = MANOVA.from_formula('HP + Attack + Defense + SpAtk + SpDef + Speed ~ Legendary', data=manova_df)\n", + "print(maov.mv_test())\n", + "\n", + "## Because all tests yield P values (Pr > F) lower than 0.05, we reject H0." ] }, { @@ -337,7 +459,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 43, "metadata": {}, "outputs": [ { @@ -453,14 +575,14 @@ "4 624.0 262.0 1.9250 65500.0 " ] }, - "execution_count": 5, + "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n", - "df.head()" + "df_housing = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n", + "df_housing.head()" ] }, { @@ -483,22 +605,73 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 55, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "def get_distances(row):\n", + " sch_coord = [-118, 34]\n", + " hos_coord = [-122, 37]\n", + "\n", + " dist_sch = np.sqrt((sch_coord[0] - row[\"longitude\"])**2 + (sch_coord[1] - row[\"latitude\"])**2)\n", + " dist_hos = np.sqrt((hos_coord[0] - row[\"longitude\"])**2 + (hos_coord[1] - row[\"latitude\"])**2)\n", + " return pd.Series({\n", + " 'school_dist': 'Close' if dist_sch < 0.5 else 'Far',\n", + " 'hospital_dist': 'Close' if dist_hos < 0.5 else 'Far'\n", + " })" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 57, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "df_housing[['school_dist', 'hospital_dist']] = df_housing.apply(get_distances, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "df_close = 
df_housing[(df_housing['school_dist']=='Close') | (df_housing['hospital_dist']=='Close')]['median_house_value']\n", + "df_far = df_housing[(df_housing['school_dist']=='Far') & (df_housing['hospital_dist']=='Far')]['median_house_value'] # far from BOTH landmarks, i.e. the complement of df_close" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "We reject the null hypotesis\n" + ] + } + ], + "source": [ + "#Set the hypothesis\n", + "\n", + "#H0: median_house_value_close <= median_house_value_far\n", + "#H1: median_house_value_close > median_house_value_far\n", + "#significance level = 0.05\n", + "alpha = 0.05\n", + "\n", + "p_value = st.ttest_ind(df_close,df_far, equal_var=False, alternative='greater')[1]\n", + "if p_value > alpha:\n", + " print(\"We are not able to reject the null hypothesis\")\n", + "else:\n", + " print(\"We reject the null hypotesis\")" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "base", "language": "python", "name": "python3" }, @@ -512,7 +685,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.13.5" } }, "nbformat": 4,