lab #355

271 changes: 255 additions & 16 deletions lab-hypothesis-testing.ipynb
@@ -38,7 +38,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@@ -51,7 +51,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 14,
"metadata": {},
"outputs": [
{
@@ -278,7 +278,7 @@
"[800 rows x 11 columns]"
]
},
"execution_count": 3,
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@@ -297,11 +297,67 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 15,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dragon: n=32, mean=83.31, sd=23.80\n",
"Grass : n=70, mean=67.27, sd=19.52\n",
"Welch t = 3.335, df ≈ 50.8, one-sided p = 0.0007994\n",
"Mean difference (Dragon - Grass) = 16.04 [95% CI: 6.38, 25.70]\n",
"Hedges' g = 0.760\n"
]
}
],
"source": [
"#code here"
"#code here\n",
"type_col = \"Type 1\" if \"Type 1\" in df.columns else (\n",
" \"type\" if \"type\" in df.columns else [c for c in df.columns if \"type\" in c.lower()][0]\n",
")\n",
"\n",
"# Extract HP for Dragon vs Grass\n",
"dragon = df.loc[df[type_col].str.lower() == \"dragon\", \"HP\"].dropna().astype(float)\n",
"grass = df.loc[df[type_col].str.lower() == \"grass\", \"HP\"].dropna().astype(float)\n",
"\n",
"print(f\"Dragon: n={len(dragon)}, mean={dragon.mean():.2f}, sd={dragon.std(ddof=1):.2f}\")\n",
"print(f\"Grass : n={len(grass)}, mean={grass.mean():.2f}, sd={grass.std(ddof=1):.2f}\")\n",
"\n",
"# One-sided Welch's t-test: H0: mu_Dragon <= mu_Grass vs H1: mu_Dragon > mu_Grass\n",
"try:\n",
" # SciPy ≥ 1.9 supports 'alternative'\n",
" res = st.ttest_ind(dragon, grass, equal_var=False, alternative='greater')\n",
" tstat, p_one = res.statistic, res.pvalue\n",
" # compute Welch DOF for CI/effect size if you want\n",
" s1, s2 = dragon.var(ddof=1), grass.var(ddof=1)\n",
" n1, n2 = len(dragon), len(grass)\n",
" se = np.sqrt(s1/n1 + s2/n2)\n",
" dof = (s1/n1 + s2/n2)**2 / ((s1**2)/((n1**2)*(n1-1)) + (s2**2)/((n2**2)*(n2-1)))\n",
"except TypeError:\n",
" # Fallback for older SciPy: compute one-sided p manually\n",
" s1, s2 = dragon.var(ddof=1), grass.var(ddof=1)\n",
" n1, n2 = len(dragon), len(grass)\n",
" se = np.sqrt(s1/n1 + s2/n2)\n",
" tstat = (dragon.mean() - grass.mean()) / se\n",
" dof = (s1/n1 + s2/n2)**2 / ((s1**2)/((n1**2)*(n1-1)) + (s2**2)/((n2**2)*(n2-1)))\n",
" p_one = st.t.sf(tstat, dof) # one-sided tail\n",
"\n",
"print(f\"Welch t = {tstat:.3f}, df ≈ {dof:.1f}, one-sided p = {p_one:.4g}\")\n",
"\n",
"# (Optional) 95% CI for the mean difference (two-sided)\n",
"diff = dragon.mean() - grass.mean()\n",
"tcrit = st.t.ppf(0.975, dof)\n",
"ci_low, ci_high = diff - tcrit*se, diff + tcrit*se\n",
"print(f\"Mean difference (Dragon - Grass) = {diff:.2f} [95% CI: {ci_low:.2f}, {ci_high:.2f}]\")\n",
"\n",
"# (Optional) Effect size (Hedges' g)\n",
"sp2 = ((n1-1)*s1 + (n2-1)*s2) / (n1+n2-2) # pooled variance\n",
"d = diff / np.sqrt(sp2) # Cohen's d\n",
"J = 1 - 3/(4*(n1+n2)-9) # small-sample correction\n",
"g = J * d\n",
"print(f\"Hedges' g = {g:.3f}\")\n"
]
},
{
@@ -313,11 +369,117 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 16,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" stat mean_legendary mean_nonlegend diff(L-N) hedges_g p_raw p_holm reject_0.05\n",
"Sp. Atk 122.184615 68.454422 53.730194 1.834685 1.551461e-21 9.308768e-21 True\n",
" Speed 100.184615 65.455782 34.728833 1.262463 1.049016e-18 5.245082e-18 True\n",
" Attack 116.676923 75.669388 41.007535 1.344181 2.520372e-16 1.008149e-15 True\n",
"Sp. Def 105.938462 68.892517 37.045945 1.426976 2.294933e-15 6.884798e-15 True\n",
" HP 92.738462 67.182313 25.556149 1.038922 1.002691e-13 2.005382e-13 True\n",
"Defense 99.661538 71.559184 28.102355 0.928401 4.826998e-11 4.826998e-11 True\n",
"\n",
"MANOVA (Pillai’s trace):\n",
" Multivariate linear model\n",
"================================================================\n",
" \n",
"----------------------------------------------------------------\n",
" Intercept Value Num DF Den DF F Value Pr > F\n",
"----------------------------------------------------------------\n",
" Wilks' lambda 0.0592 6.0000 793.0000 2100.8338 0.0000\n",
" Pillai's trace 0.9408 6.0000 793.0000 2100.8338 0.0000\n",
" Hotelling-Lawley trace 15.8953 6.0000 793.0000 2100.8338 0.0000\n",
" Roy's greatest root 15.8953 6.0000 793.0000 2100.8338 0.0000\n",
"----------------------------------------------------------------\n",
" \n",
"----------------------------------------------------------------\n",
" C(LegendaryFlag) Value Num DF Den DF F Value Pr > F\n",
"----------------------------------------------------------------\n",
" Wilks' lambda 0.7331 6.0000 793.0000 48.1098 0.0000\n",
" Pillai's trace 0.2669 6.0000 793.0000 48.1098 0.0000\n",
" Hotelling-Lawley trace 0.3640 6.0000 793.0000 48.1098 0.0000\n",
" Roy's greatest root 0.3640 6.0000 793.0000 48.1098 0.0000\n",
"================================================================\n",
"\n"
]
}
],
"source": [
"#code here"
"#code here\n",
"# Columns\n",
"flag_col = \"Legendary\" if \"Legendary\" in df.columns else [c for c in df.columns if \"legend\" in c.lower()][0]\n",
"stats_cols = [\"HP\",\"Attack\",\"Defense\",\"Sp. Atk\",\"Sp. Def\",\"Speed\"]\n",
"stats_cols = [c for c in stats_cols if c in df.columns] # keep only those present\n",
"\n",
"# Split groups\n",
"leg = df[df[flag_col] == True][stats_cols].astype(float)\n",
"non = df[df[flag_col] == False][stats_cols].astype(float)\n",
"\n",
"# Per-stat Welch t-tests (two-sided) + effect size\n",
"rows = []\n",
"for col in stats_cols:\n",
" x, y = leg[col].dropna().values, non[col].dropna().values\n",
" res = st.ttest_ind(x, y, equal_var=False) # two-sided\n",
" # Hedges' g\n",
" n1, n2 = len(x), len(y)\n",
" s1, s2 = np.var(x, ddof=1), np.var(y, ddof=1)\n",
" sp2 = ((n1-1)*s1 + (n2-1)*s2) / (n1+n2-2)\n",
" d = (x.mean() - y.mean()) / np.sqrt(sp2)\n",
" J = 1 - 3/(4*(n1+n2)-9) # small-sample correction\n",
" g = J*d\n",
" rows.append({\n",
" \"stat\": col,\n",
" \"mean_legendary\": x.mean(),\n",
" \"mean_nonlegend\": y.mean(),\n",
" \"diff(L-N)\": x.mean()-y.mean(),\n",
" \"t\": res.statistic,\n",
" \"p_raw\": res.pvalue,\n",
" \"hedges_g\": g\n",
" })\n",
"\n",
"out = pd.DataFrame(rows)\n",
"\n",
"# Multiple-comparison correction (Holm)\n",
"try:\n",
" from statsmodels.stats.multitest import multipletests\n",
" rej, p_holm, _, _ = multipletests(out[\"p_raw\"], method=\"holm\")\n",
" out[\"p_holm\"] = p_holm\n",
" out[\"reject_0.05\"] = rej\n",
"except Exception:\n",
" # simple Holm fallback\n",
" order = np.argsort(out[\"p_raw\"].values)\n",
" m = len(out)\n",
" holm = np.empty(m); holm[:] = np.nan\n",
" for rank, idx in enumerate(order, start=1):\n",
" holm[idx] = (m - rank + 1) * out.loc[idx, \"p_raw\"]\n",
" # monotone adjustment\n",
" for i in range(1, m):\n",
" holm[order[i]] = max(holm[order[i]], holm[order[i-1]])\n",
" out[\"p_holm\"] = np.clip(holm, 0, 1)\n",
" out[\"reject_0.05\"] = out[\"p_holm\"] < 0.05\n",
"\n",
"# Nicely sorted table\n",
"print(out.sort_values(\"p_holm\")[[\"stat\",\"mean_legendary\",\"mean_nonlegend\",\"diff(L-N)\",\"hedges_g\",\"p_raw\",\"p_holm\",\"reject_0.05\"]]\n",
" .to_string(index=False))\n",
"\n",
"# (Optional) Global multivariate test (MANOVA)\n",
"try:\n",
" from statsmodels.multivariate.manova import MANOVA\n",
" # make a temporary df with clean column names for formula\n",
" tmp = df[[flag_col]+stats_cols].dropna().copy()\n",
" tmp = tmp.rename(columns={flag_col:\"LegendaryFlag\",\n",
" \"Sp. Atk\":\"Sp_Atk\",\"Sp. Def\":\"Sp_Def\"})\n",
" formula = \"HP + Attack + Defense + Sp_Atk + Sp_Def + Speed ~ C(LegendaryFlag)\"\n",
" manova = MANOVA.from_formula(formula, data=tmp)\n",
" print(\"\\nMANOVA (Pillai’s trace):\")\n",
" print(manova.mv_test()) # look at Pillai/ Wilks p-values\n",
"except Exception as e:\n",
" print(\"\\nMANOVA skipped (statsmodels not available or column names differ).\")\n"
]
},
{
@@ -337,7 +499,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 17,
"metadata": {},
"outputs": [
{
@@ -453,7 +615,7 @@
"4 624.0 262.0 1.9250 65500.0 "
]
},
"execution_count": 5,
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
@@ -483,10 +645,87 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Close: n=6829, mean=$246,952\n",
"Far : n=10171, mean=$180,678\n",
"Welch t = 37.992, df ≈ 14571.2, one-sided p = 1.503e-301, diff = $66,274\n",
"Hedges' g = 0.595\n",
"Mann–Whitney U one-sided p = 0\n"
]
}
],
"source": [
"# 0) Load\n",
"df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n",
"\n",
"# 1) Columns\n",
"lon_col = next(c for c in df.columns if \"long\" in c.lower()) # 'longitude'\n",
"lat_col = next(c for c in df.columns if \"lat\" in c.lower()) # 'latitude'\n",
"price_col = next(c for c in df.columns if \"median_house_value\" in c.lower())\n",
"\n",
"# 2) Distances (euclidean on lon/lat degrees, as hinted)\n",
"school = (-118.0, 34.0)\n",
"hospital = (-122.0, 37.0)\n",
"\n",
"def euclid_xy(x1, y1, x2, y2):\n",
" return np.sqrt((x1 - x2)**2 + (y1 - y2)**2)\n",
"\n",
"df[\"dist_school\"] = euclid_xy(df[lon_col], df[lat_col], school[0], school[1])\n",
"df[\"dist_hospital\"] = euclid_xy(df[lon_col], df[lat_col], hospital[0], hospital[1])\n",
"df[\"dist_min\"] = df[[\"dist_school\",\"dist_hospital\"]].min(axis=1)\n",
"\n",
"# 3) Close vs Far\n",
"threshold = 0.50 # per instructions\n",
"df[\"close\"] = df[\"dist_min\"] < threshold\n",
"\n",
"# 4) Prepare groups\n",
"close_vals = df.loc[df[\"close\"], price_col].dropna().astype(float)\n",
"far_vals = df.loc[~df[\"close\"], price_col].dropna().astype(float)\n",
"\n",
"print(f\"Close: n={len(close_vals)}, mean=${close_vals.mean():,.0f}\")\n",
"print(f\"Far : n={len(far_vals)}, mean=${far_vals.mean():,.0f}\")\n",
"\n",
"# 5) Hypothesis test (one-sided Welch t-test)\n",
"# H0: mu_close <= mu_far vs H1: mu_close > mu_far\n",
"try:\n",
" # SciPy >= 1.9 supports 'alternative'\n",
" res = st.ttest_ind(close_vals, far_vals, equal_var=False, alternative=\"greater\")\n",
" tstat, p_one = res.statistic, res.pvalue\n",
"except TypeError:\n",
" # manual one-sided from two-sided\n",
" res = st.ttest_ind(close_vals, far_vals, equal_var=False)\n",
" tstat = res.statistic\n",
" p_one = st.t.sf(tstat, df=min(len(close_vals)-1, len(far_vals)-1)) # conservative df\n",
"\n",
"# Welch SE & dof (for reporting)\n",
"s1, s2 = close_vals.var(ddof=1), far_vals.var(ddof=1)\n",
"n1, n2 = len(close_vals), len(far_vals)\n",
"se = np.sqrt(s1/n1 + s2/n2)\n",
"dof = (s1/n1 + s2/n2)**2 / ((s1**2)/((n1**2)*(n1-1)) + (s2**2)/((n2**2)*(n2-1)))\n",
"\n",
"diff = close_vals.mean() - far_vals.mean()\n",
"print(f\"Welch t = {tstat:.3f}, df ≈ {dof:.1f}, one-sided p = {p_one:.4g}, diff = ${diff:,.0f}\")\n",
"\n",
"# 6) Effect size (Hedges' g)\n",
"sp2 = ((n1-1)*s1 + (n2-1)*s2) / (n1+n2-2)\n",
"d = diff / np.sqrt(sp2)\n",
"J = 1 - 3/(4*(n1+n2)-9)\n",
"g = J*d\n",
"print(f\"Hedges' g = {g:.3f}\")\n",
"\n",
"# 7) (Optional) Robustness: one-sided Mann–Whitney U (close > far)\n",
"try:\n",
" u_stat, p_mwu = st.mannwhitneyu(close_vals, far_vals, alternative=\"greater\")\n",
" print(f\"Mann–Whitney U one-sided p = {p_mwu:.4g}\")\n",
"except TypeError:\n",
" pass\n"
]
},
{
"cell_type": "code",
Expand All @@ -498,7 +737,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "base",
"language": "python",
"name": "python3"
},
@@ -512,7 +751,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.12.7"
}
},
"nbformat": 4,