Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
180 changes: 165 additions & 15 deletions lab-hypothesis-testing.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -278,7 +278,7 @@
"[800 rows x 11 columns]"
]
},
"execution_count": 3,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -288,6 +288,42 @@
"df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 800 entries, 0 to 799\n",
"Data columns (total 11 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 Name 799 non-null object\n",
" 1 Type 1 800 non-null object\n",
" 2 Type 2 414 non-null object\n",
" 3 HP 800 non-null int64 \n",
" 4 Attack 800 non-null int64 \n",
" 5 Defense 800 non-null int64 \n",
" 6 Sp. Atk 800 non-null int64 \n",
" 7 Sp. Def 800 non-null int64 \n",
" 8 Speed 800 non-null int64 \n",
" 9 Generation 800 non-null int64 \n",
" 10 Legendary 800 non-null bool \n",
"dtypes: bool(1), int64(7), object(3)\n",
"memory usage: 63.4+ KB\n"
]
}
],
"source": [
"df.info()\n",
"\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -297,11 +333,33 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 8,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reject the null hypothesis: Dragon-type Pokémon have significantly higher HP on average than Grass-type.\n"
]
}
],
"source": [
"#code here"
"import pandas as pd\n",
"from scipy import stats\n",
"\n",
"\n",
"df_dragon = df[df['Type 1'] == 'Dragon']\n",
"df_grass = df[df['Type 1'] == 'Grass']\n",
"\n",
"# Conduct Two Sample T-Test\n",
"t_stat, p_value = stats.ttest_ind(df_dragon['HP'], df_grass['HP'], equal_var=True)\n",
"\n",
"# Compare the p-value with the significance level\n",
"if p_value < 0.05:\n",
" print(\"Reject the null hypothesis: Dragon-type Pokémon have significantly higher HP on average than Grass-type.\")\n",
"else:\n",
" print(\"Fail to reject the null hypothesis: No significant difference in HP between Dragon and Grass types.\")"
]
},
{
Expand All @@ -313,11 +371,36 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 9,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reject the null hypothesis for HP: Legendary Pokémon have significantly different HP compared to Non-Legendary.\n",
"Reject the null hypothesis for Attack: Legendary Pokémon have significantly different Attack compared to Non-Legendary.\n",
"Reject the null hypothesis for Defense: Legendary Pokémon have significantly different Defense compared to Non-Legendary.\n",
"Reject the null hypothesis for Sp. Atk: Legendary Pokémon have significantly different Sp. Atk compared to Non-Legendary.\n",
"Reject the null hypothesis for Sp. Def: Legendary Pokémon have significantly different Sp. Def compared to Non-Legendary.\n",
"Reject the null hypothesis for Speed: Legendary Pokémon have significantly different Speed compared to Non-Legendary.\n"
]
}
],
"source": [
"#code here"
"legendary_pokemon = df[df['Legendary'] == True]\n",
"non_legendary_pokemon = df[df['Legendary'] == False]\n",
"\n",
"# Stats columns to compare\n",
"stats_columns = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']\n",
"\n",
"# Iterate over each stat and perform a t-test\n",
"for stat in stats_columns:\n",
" t_stat, p_value = stats.ttest_ind(legendary_pokemon[stat], non_legendary_pokemon[stat], equal_var=True)\n",
" if p_value < 0.05:\n",
" print(f\"Reject the null hypothesis for {stat}: Legendary Pokémon have significantly different {stat} compared to Non-Legendary.\")\n",
" else:\n",
" print(f\"Fail to reject the null hypothesis for {stat}: No significant difference in {stat} between Legendary and Non-Legendary.\")"
]
},
{
Expand All @@ -337,7 +420,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 10,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -453,7 +536,7 @@
"4 624.0 262.0 1.9250 65500.0 "
]
},
"execution_count": 5,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -463,6 +546,38 @@
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"RangeIndex: 17000 entries, 0 to 16999\n",
"Data columns (total 9 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 longitude 17000 non-null float64\n",
" 1 latitude 17000 non-null float64\n",
" 2 housing_median_age 17000 non-null float64\n",
" 3 total_rooms 17000 non-null float64\n",
" 4 total_bedrooms 17000 non-null float64\n",
" 5 population 17000 non-null float64\n",
" 6 households 17000 non-null float64\n",
" 7 median_income 17000 non-null float64\n",
" 8 median_house_value 17000 non-null float64\n",
"dtypes: float64(9)\n",
"memory usage: 1.2 MB\n"
]
}
],
"source": [
"df.info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
Expand All @@ -483,10 +598,45 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reject the null hypothesis: Houses close to a school/hospital have significantly different median house values.\n"
]
}
],
"source": [
"def euclidean_distance(lon1, lat1, lon2, lat2):\n",
" return np.sqrt((lon1 - lon2)**2 + (lat1 - lat2)**2)\n",
"\n",
"# Define school and hospital coordinates\n",
"school_coords = (-118, 34)\n",
"hospital_coords = (-122, 37)\n",
"\n",
"# Load your dataset into a DataFrame, assuming `df` is your DataFrame\n",
"df['distance_to_school'] = euclidean_distance(df['longitude'], df['latitude'], *school_coords)\n",
"df['distance_to_hospital'] = euclidean_distance(df['longitude'], df['latitude'], *hospital_coords)\n",
"\n",
"# Define \"close\" as having a distance of less than 0.50 to either a school or a hospital\n",
"df['close_to_school_or_hospital'] = (df['distance_to_school'] < 0.50) | (df['distance_to_hospital'] < 0.50)\n",
"\n",
"# Divide the dataset\n",
"close_houses = df[df['close_to_school_or_hospital'] == True]\n",
"far_houses = df[df['close_to_school_or_hospital'] == False]\n",
"\n",
"# Perform Two Sample T-Test\n",
"t_stat, p_value = stats.ttest_ind(close_houses['median_house_value'], far_houses['median_house_value'], equal_var=True)\n",
"\n",
"# Interpret the results\n",
"if p_value < 0.05:\n",
" print(\"Reject the null hypothesis: Houses close to a school/hospital have significantly different median house values.\")\n",
"else:\n",
" print(\"Fail to reject the null hypothesis: No significant difference in median house values between houses close to and far from a school/hospital.\")"
]
},
{
"cell_type": "code",
Expand All @@ -498,7 +648,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "base",
"language": "python",
"name": "python3"
},
Expand All @@ -512,7 +662,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.12.7"
}
},
"nbformat": 4,
Expand Down