diff --git a/lab-hypothesis-testing.ipynb b/lab-hypothesis-testing.ipynb index 0cc26d5..b6d1583 100644 --- a/lab-hypothesis-testing.ipynb +++ b/lab-hypothesis-testing.ipynb @@ -51,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -278,7 +278,7 @@ "[800 rows x 11 columns]" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -288,6 +288,42 @@ "df" ] }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 800 entries, 0 to 799\n", + "Data columns (total 11 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Name 799 non-null object\n", + " 1 Type 1 800 non-null object\n", + " 2 Type 2 414 non-null object\n", + " 3 HP 800 non-null int64 \n", + " 4 Attack 800 non-null int64 \n", + " 5 Defense 800 non-null int64 \n", + " 6 Sp. Atk 800 non-null int64 \n", + " 7 Sp. 
import pandas as pd
from scipy import stats

# Significance level for the decision rule below.
ALPHA = 0.05

# Rows whose primary type is Dragon / Grass (names kept for any later cell that uses them).
df_dragon = df[df['Type 1'] == 'Dragon']
df_grass = df[df['Type 1'] == 'Grass']

# H0: mean HP(Dragon) <= mean HP(Grass)
# H1: mean HP(Dragon) >  mean HP(Grass)
#
# The conclusion printed on rejection is directional ("higher"), so the test has to be
# one-sided: with the default two-sided alternative the same message would also be
# printed when Dragon HP is significantly LOWER than Grass HP.
# equal_var=False selects Welch's t-test, which does not assume that the two groups
# share a variance (nothing in this notebook checks that assumption).
t_stat, p_value = stats.ttest_ind(
    df_dragon['HP'],
    df_grass['HP'],
    equal_var=False,
    alternative='greater',
)

# Compare the p-value with the significance level
if p_value < ALPHA:
    print("Reject the null hypothesis: Dragon-type Pokémon have significantly higher HP on average than Grass-type.")
else:
    print("Fail to reject the null hypothesis: No significant evidence that Dragon-type Pokémon have higher HP on average than Grass-type.")
# Significance level applied to each individual test below.
# NOTE: six tests are run at this level and no multiple-comparison correction is applied.
ALPHA = 0.05

# 'Legendary' is a boolean column, so it can be used directly as a mask.
is_legendary = df['Legendary']
legendary_pokemon = df[is_legendary]
non_legendary_pokemon = df[~is_legendary]

# Stats columns to compare
stats_columns = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']

# One two-sided, two-sample t-test per stat.
# equal_var=False selects Welch's t-test, which does not assume the Legendary and
# Non-Legendary groups share a variance (nothing in this notebook checks that assumption).
for stat in stats_columns:
    t_stat, p_value = stats.ttest_ind(
        legendary_pokemon[stat],
        non_legendary_pokemon[stat],
        equal_var=False,
    )
    if p_value < ALPHA:
        print(f"Reject the null hypothesis for {stat}: Legendary Pokémon have significantly different {stat} compared to Non-Legendary.")
    else:
        print(f"Fail to reject the null hypothesis for {stat}: No significant difference in {stat} between Legendary and Non-Legendary.")
# This cell uses np; imported here so the cell does not depend on an import made elsewhere
# in the kernel session. NOTE(review): if an earlier cell already imports numpy, this is a
# harmless duplicate.
import numpy as np

# Significance level for the decision rule below.
ALPHA = 0.05

# A house counts as "close" when its distance to either point is below this value.
# The distance is in the same units as the longitude/latitude columns.
CLOSE_THRESHOLD = 0.50


def euclidean_distance(lon1, lat1, lon2, lat2):
    """Return the straight-line distance between (lon1, lat1) and (lon2, lat2).

    The distance is computed directly on the coordinate values, with no geodesic
    correction, so the result is expressed in the same units as the inputs.
    Arguments may be scalars or array-likes that NumPy can broadcast together.
    """
    return np.sqrt((lon1 - lon2)**2 + (lat1 - lat2)**2)


# Define school and hospital coordinates as (longitude, latitude)
school_coords = (-118, 34)
hospital_coords = (-122, 37)

# Distance from every house to each reference point (stored on df, as before).
df['distance_to_school'] = euclidean_distance(df['longitude'], df['latitude'], *school_coords)
df['distance_to_hospital'] = euclidean_distance(df['longitude'], df['latitude'], *hospital_coords)

# "Close" means within CLOSE_THRESHOLD of either the school or the hospital.
df['close_to_school_or_hospital'] = (
    (df['distance_to_school'] < CLOSE_THRESHOLD)
    | (df['distance_to_hospital'] < CLOSE_THRESHOLD)
)

# Divide the dataset using the boolean flag as a mask.
is_close = df['close_to_school_or_hospital']
close_houses = df[is_close]
far_houses = df[~is_close]

# Two-sided, two-sample t-test on median house value.
# equal_var=False selects Welch's t-test, which does not assume the close and far groups
# share a variance (nothing in this notebook checks that assumption).
t_stat, p_value = stats.ttest_ind(
    close_houses['median_house_value'],
    far_houses['median_house_value'],
    equal_var=False,
)

# Interpret the results
if p_value < ALPHA:
    print("Reject the null hypothesis: Houses close to a school/hospital have significantly different median house values.")
else:
    print("Fail to reject the null hypothesis: No significant difference in median house values between houses close to and far from a school/hospital.")