Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
211 changes: 192 additions & 19 deletions lab-hypothesis-testing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -38,20 +38,21 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"#libraries\n",
"import pandas as pd\n",
"import scipy.stats as st\n",
"from statsmodels.multivariate.manova import MANOVA\n",
"import numpy as np\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 20,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -278,7 +279,7 @@
"[800 rows x 11 columns]"
]
},
"execution_count": 3,
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -288,6 +289,17 @@
"df"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"# Normalise column names: strip literal dots and spaces so later cells can use\n",
"# plain identifiers such as 'SpAtk' and 'Type1'.\n",
"# regex=False makes '.' a literal dot; interpreted as a regex it matches every\n",
"# character and would blank out all column names.\n",
"df.columns = df.columns.str.replace('.', '', regex=False).str.replace(' ', '', regex=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -297,11 +309,33 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 64,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"We reject the null hypotesis\n"
]
}
],
"source": [
"#code here"
"# HP of every Pokémon whose primary or secondary type is Dragon / Grass.\n",
"# NOTE(review): a Pokémon typed both Dragon and Grass lands in both samples, which a\n",
"# two-sample test assumes does not happen -- confirm none exist or drop them.\n",
"is_dragon = (df[\"Type1\"] == \"Dragon\") | (df[\"Type2\"] == \"Dragon\")\n",
"is_grass = (df[\"Type1\"] == \"Grass\") | (df[\"Type2\"] == \"Grass\")\n",
"df_dragon = df.loc[is_dragon, \"HP\"]\n",
"df_grass = df.loc[is_grass, \"HP\"]\n",
"\n",
"# Set the hypothesis (one-sided; the claim we want evidence for goes in H1)\n",
"# H0: mu_HP Dragon <= mu_HP Grass\n",
"# H1: mu_HP Dragon > mu_HP Grass\n",
"# significance level = 0.05\n",
"alpha = 0.05\n",
"\n",
"# Welch's t-test (equal_var=False). alternative=\"greater\" places\n",
"# 'mean of first sample > mean of second sample' in H1, matching the hypotheses above.\n",
"p_value = st.ttest_ind(df_dragon, df_grass, equal_var=False, alternative=\"greater\").pvalue\n",
"\n",
"if p_value > alpha:\n",
"    print(\"We are not able to reject the null hypothesis\")\n",
"else:\n",
"    print(\"We reject the null hypotesis\")"
]
},
{
Expand All @@ -313,11 +347,99 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": null,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"P value for HP: 0.0000\n",
"We reject the null hypotesis, HP is different for Legendary Pokémons\n",
"\n",
"P value for Attack: 0.0000\n",
"We reject the null hypotesis, Attack is different for Legendary Pokémons\n",
"\n",
"P value for Defense: 0.0000\n",
"We reject the null hypotesis, Defense is different for Legendary Pokémons\n",
"\n",
"P value for SpAtk: 0.0000\n",
"We reject the null hypotesis, SpAtk is different for Legendary Pokémons\n",
"\n",
"P value for SpDef: 0.0000\n",
"We reject the null hypotesis, SpDef is different for Legendary Pokémons\n",
"\n",
"P value for Speed: 0.0000\n",
"We reject the null hypotesis, Speed is different for Legendary Pokémons\n",
"\n"
]
}
],
"source": [
"#code here"
"# Set the hypothesis (tested separately for each base stat)\n",
"\n",
"# H0: mu_stat Legendary = mu_stat non-Legendary\n",
"# H1: mu_stat Legendary != mu_stat non-Legendary\n",
"# significance level = 0.05\n",
"alpha = 0.05\n",
"\n",
"stats_col = ['HP', 'Attack', 'Defense', 'SpAtk', 'SpDef', 'Speed']\n",
"\n",
"# The group masks do not depend on the stat, so build them once outside the loop.\n",
"is_legendary = df[\"Legendary\"].eq(True)\n",
"is_non_legendary = df[\"Legendary\"].eq(False)\n",
"\n",
"for stat in stats_col:\n",
"    df_legendary = df.loc[is_legendary, stat]\n",
"    df_non_legendary = df.loc[is_non_legendary, stat]\n",
"    # One-way ANOVA on the two groups for this stat\n",
"    p_value = st.f_oneway(df_legendary, df_non_legendary).pvalue\n",
"    print(f'P value for {stat}: {p_value:.4f}')\n",
"    if p_value > alpha:\n",
"        # Trailing newline keeps the blank separator line between stats in both branches\n",
"        print(\"We are not able to reject the null hypothesis.\\n\")\n",
"    else:\n",
"        print(f\"We reject the null hypotesis, {stat} is different for Legendary Pokémons\\n\")\n",
"\n",
"## However this only tests for INDIVIDUAL relationships, not as a whole."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" Multivariate linear model\n",
"================================================================\n",
" \n",
"----------------------------------------------------------------\n",
" Intercept Value Num DF Den DF F Value Pr > F\n",
"----------------------------------------------------------------\n",
" Wilks' lambda 0.0592 6.0000 793.0000 2100.8338 0.0000\n",
" Pillai's trace 0.9408 6.0000 793.0000 2100.8338 0.0000\n",
" Hotelling-Lawley trace 15.8953 6.0000 793.0000 2100.8338 0.0000\n",
" Roy's greatest root 15.8953 6.0000 793.0000 2100.8338 0.0000\n",
"----------------------------------------------------------------\n",
" \n",
"----------------------------------------------------------------\n",
" Legendary Value Num DF Den DF F Value Pr > F\n",
"----------------------------------------------------------------\n",
" Wilks' lambda 0.7331 6.0000 793.0000 48.1098 0.0000\n",
" Pillai's trace 0.2669 6.0000 793.0000 48.1098 0.0000\n",
" Hotelling-Lawley trace 0.3640 6.0000 793.0000 48.1098 0.0000\n",
" Roy's greatest root 0.3640 6.0000 793.0000 48.1098 0.0000\n",
"================================================================\n",
"\n"
]
}
],
"source": [
"## MANOVA tests all six stats jointly against Legendary status instead of one stat at a time.\n",
"## The formula syntax was worked out with ChatGPT because the statsmodels documentation\n",
"## does not describe it clearly.\n",
"\n",
"# Keep the grouping column plus the six dependent variables\n",
"manova_cols = ['Legendary', 'HP', 'Attack', 'Defense', 'SpAtk', 'SpDef', 'Speed']\n",
"manova_df = df[manova_cols]\n",
"\n",
"# Left of '~': the dependent variables; right of '~': the grouping factor\n",
"maov = MANOVA.from_formula(\n",
"    'HP + Attack + Defense + SpAtk + SpDef + Speed ~ Legendary',\n",
"    data=manova_df,\n",
")\n",
"print(maov.mv_test())\n",
"\n",
"## Reading the table: the 'Legendary' block holds the tests for the Legendary term.\n",
"## Because all tests yield P values (Pr > F) lower than 0.05, we reject H0."
]
},
{
Expand All @@ -337,7 +459,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 43,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -453,14 +575,14 @@
"4 624.0 262.0 1.9250 65500.0 "
]
},
"execution_count": 5,
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n",
"df.head()"
"# California housing data, read directly from the raw GitHub URL\n",
"housing_url = \"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\"\n",
"df_housing = pd.read_csv(housing_url)\n",
"df_housing.head()"
]
},
{
Expand All @@ -483,22 +605,73 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": []
"source": [
"def get_distances(row):\n",
" sch_coord = [-118, 34]\n",
" hos_coord = [-122, 37]\n",
"\n",
" dist_sch = np.sqrt((sch_coord[0] - row[\"longitude\"])**2 + (sch_coord[1] - row[\"latitude\"])**2)\n",
" dist_hos = np.sqrt((hos_coord[0] - row[\"longitude\"])**2 + (hos_coord[1] - row[\"latitude\"])**2)\n",
" return pd.Series({\n",
" 'school_dist': 'Close' if dist_sch < 0.5 else 'Far',\n",
" 'hospital_dist': 'Close' if dist_hos < 0.5 else 'Far'\n",
" })"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": []
"source": [
"# Run get_distances on every row (axis=1) and store its two labels as new columns\n",
"distance_labels = df_housing.apply(get_distances, axis=1)\n",
"df_housing[['school_dist', 'hospital_dist']] = distance_labels"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"# 'Close' sample: houses close to the school or to the hospital.\n",
"# 'Far' sample: every remaining house, i.e. far from both places. Selecting the far group\n",
"# with \"far from school OR far from hospital\" also picks up houses that are close to only\n",
"# one of the two places, so the two samples would overlap.\n",
"is_close = (df_housing['school_dist'] == 'Close') | (df_housing['hospital_dist'] == 'Close')\n",
"df_close = df_housing.loc[is_close, 'median_house_value']\n",
"df_far = df_housing.loc[~is_close, 'median_house_value']"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"We reject the null hypotesis\n"
]
}
],
"source": [
"# Hypotheses (one-sided)\n",
"#   H0: median_house_value_close <= median_house_value_far\n",
"#   H1: median_house_value_close >  median_house_value_far\n",
"# Significance level: 0.05\n",
"alpha = 0.05\n",
"\n",
"# Welch's t-test (equal_var=False), one-sided towards 'close is greater'\n",
"test_result = st.ttest_ind(df_close, df_far, equal_var=False, alternative='greater')\n",
"p_value = test_result.pvalue\n",
"\n",
"verdict = \"We are not able to reject the null hypothesis\" if p_value > alpha else \"We reject the null hypotesis\"\n",
"print(verdict)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "base",
"language": "python",
"name": "python3"
},
Expand All @@ -512,7 +685,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.13.5"
}
},
"nbformat": 4,
Expand Down