diff --git a/README.md b/README.md
index 8e86a3b..dd4da6e 100644
--- a/README.md
+++ b/README.md
@@ -51,6 +51,11 @@ Each notebook corresponds to a chapter from the source material. Click on the "O
+8. **Ch9. Specification and Data Issues**
+
+
+
+
## How to Use
1. Click on the "Open in Colab" badge next to the notebook you want to explore.
diff --git a/markdown/Ch7. MRA - Qualitative Regressors.md b/markdown/Ch7. MRA - Qualitative Regressors.md
index b7e8641..96821ce 100644
--- a/markdown/Ch7. MRA - Qualitative Regressors.md
+++ b/markdown/Ch7. MRA - Qualitative Regressors.md
@@ -20,11 +20,11 @@ jupyter:
```
```python
+import numpy as np # noqa
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import wooldridge as wool
-import numpy as np
```
## 7.1 Linear Regression with Dummy Variables as Regressors
diff --git a/markdown/Ch9. Specification and Data Issues.md b/markdown/Ch9. Specification and Data Issues.md
new file mode 100644
index 0000000..0b71315
--- /dev/null
+++ b/markdown/Ch9. Specification and Data Issues.md
@@ -0,0 +1,384 @@
+---
+jupyter:
+ jupytext:
+ formats: notebooks//ipynb,markdown//md,scripts//py
+ text_representation:
+ extension: .md
+ format_name: markdown
+ format_version: '1.3'
+ jupytext_version: 1.16.4
+ kernelspec:
+ display_name: merino
+ language: python
+ name: python3
+---
+
+# Ch9. Specification and Data Issues
+
+```python
+%pip install matplotlib numpy pandas statsmodels wooldridge scipy -q
+```
+
+```python
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import statsmodels.api as sm
+import statsmodels.formula.api as smf
+import statsmodels.stats.outliers_influence as smo
+import wooldridge as wool
+from scipy import stats
+```
+
+## 9.1 Functional Form Misspecification
+
+### Example 9.2: Housing Price Equation
+
+```python
+hprice1 = wool.data("hprice1")
+
+# original OLS:
+reg = smf.ols(formula="price ~ lotsize + sqrft + bdrms", data=hprice1)
+results = reg.fit()
+
+# regression for RESET test:
+hprice1["fitted_sq"] = results.fittedvalues**2
+hprice1["fitted_cub"] = results.fittedvalues**3
+reg_reset = smf.ols(
+ formula="price ~ lotsize + sqrft + bdrms + fitted_sq + fitted_cub",
+ data=hprice1,
+)
+results_reset = reg_reset.fit()
+
+# print regression table:
+table = pd.DataFrame(
+ {
+ "b": round(results_reset.params, 4),
+ "se": round(results_reset.bse, 4),
+ "t": round(results_reset.tvalues, 4),
+ "pval": round(results_reset.pvalues, 4),
+ },
+)
+print(f"table: \n{table}\n")
+```
+
+```python
+# RESET test (H0: all coefficients whose names include "fitted" are zero):
+hypotheses = ["fitted_sq = 0", "fitted_cub = 0"]
+ftest_man = results_reset.f_test(hypotheses)
+fstat_man = ftest_man.statistic
+fpval_man = ftest_man.pvalue
+
+print(f"fstat_man: {fstat_man}\n")
+print(f"fpval_man: {fpval_man}\n")
+```
+
+```python
+hprice1 = wool.data("hprice1")
+
+# original linear regression:
+reg = smf.ols(formula="price ~ lotsize + sqrft + bdrms", data=hprice1)
+results = reg.fit()
+
+# automated RESET test:
+reset_output = smo.reset_ramsey(res=results, degree=3)
+fstat_auto = reset_output.statistic
+fpval_auto = reset_output.pvalue
+
+print(f"fstat_auto: {fstat_auto}\n")
+print(f"fpval_auto: {fpval_auto}\n")
+```
+
+```python
+hprice1 = wool.data("hprice1")
+
+# two alternative models:
+reg1 = smf.ols(formula="price ~ lotsize + sqrft + bdrms", data=hprice1)
+results1 = reg1.fit()
+
+reg2 = smf.ols(
+ formula="price ~ np.log(lotsize) +np.log(sqrft) + bdrms",
+ data=hprice1,
+)
+results2 = reg2.fit()
+
+# encompassing test of Davidson & MacKinnon:
+# comprehensive model:
+reg3 = smf.ols(
+ formula="price ~ lotsize + sqrft + bdrms + np.log(lotsize) + np.log(sqrft)",
+ data=hprice1,
+)
+results3 = reg3.fit()
+
+# model 1 vs. comprehensive model:
+anovaResults1 = sm.stats.anova_lm(results1, results3)
+print(f"anovaResults1: \n{anovaResults1}\n")
+```
+
+```python
+# model 2 vs. comprehensive model:
+anovaResults2 = sm.stats.anova_lm(results2, results3)
+print(f"anovaResults2: \n{anovaResults2}\n")
+```
+
+## 9.2 Measurement Error
+
+```python
+# set the random seed:
+np.random.seed(1234567)
+
+# set sample size and number of simulations:
+n = 1000
+r = 10000
+
+# set true parameters (betas):
+beta0 = 1
+beta1 = 0.5
+
+# initialize arrays to store results later (b1 without ME, b1_me with ME):
+b1 = np.empty(r)
+b1_me = np.empty(r)
+
+# draw a sample of x, fixed over replications:
+x = stats.norm.rvs(4, 1, size=n)
+
+# repeat r times:
+for i in range(r):
+ # draw a sample of u:
+ u = stats.norm.rvs(0, 1, size=n)
+
+ # draw a sample of ystar:
+ ystar = beta0 + beta1 * x + u
+
+ # measurement error and mismeasured y:
+ e0 = stats.norm.rvs(0, 1, size=n)
+ y = ystar + e0
+ df = pd.DataFrame({"ystar": ystar, "y": y, "x": x})
+
+ # regress ystar on x and store slope estimate at position i:
+ reg_star = smf.ols(formula="ystar ~ x", data=df)
+ results_star = reg_star.fit()
+ b1[i] = results_star.params["x"]
+
+ # regress y on x and store slope estimate at position i:
+ reg_me = smf.ols(formula="y ~ x", data=df)
+ results_me = reg_me.fit()
+ b1_me[i] = results_me.params["x"]
+
+# mean with and without ME:
+b1_mean = np.mean(b1)
+b1_me_mean = np.mean(b1_me)
+print(f"b1_mean: {b1_mean}\n")
+print(f"b1_me_mean: {b1_me_mean}\n")
+```
+
+```python
+# variance with and without ME:
+b1_var = np.var(b1, ddof=1)
+b1_me_var = np.var(b1_me, ddof=1)
+print(f"b1_var: {b1_var}\n")
+print(f"b1_me_var: {b1_me_var}\n")
+```
+
+```python
+# set the random seed:
+np.random.seed(1234567)
+
+# set sample size and number of simulations:
+n = 1000
+r = 10000
+
+# set true parameters (betas):
+beta0 = 1
+beta1 = 0.5
+
+# initialize b1 arrays to store results later:
+b1 = np.empty(r)
+b1_me = np.empty(r)
+
+# draw a sample of xstar (the true x), fixed over replications:
+xstar = stats.norm.rvs(4, 1, size=n)
+
+# repeat r times:
+for i in range(r):
+ # draw a sample of u:
+ u = stats.norm.rvs(0, 1, size=n)
+
+ # draw a sample of y:
+ y = beta0 + beta1 * xstar + u
+
+ # measurement error and mismeasured x:
+ e1 = stats.norm.rvs(0, 1, size=n)
+ x = xstar + e1
+ df = pd.DataFrame({"y": y, "xstar": xstar, "x": x})
+
+ # regress y on xstar and store slope estimate at position i:
+ reg_star = smf.ols(formula="y ~ xstar", data=df)
+ results_star = reg_star.fit()
+ b1[i] = results_star.params["xstar"]
+
+ # regress y on x and store slope estimate at position i:
+ reg_me = smf.ols(formula="y ~ x", data=df)
+ results_me = reg_me.fit()
+ b1_me[i] = results_me.params["x"]
+
+# mean with and without ME:
+b1_mean = np.mean(b1)
+b1_me_mean = np.mean(b1_me)
+print(f"b1_mean: {b1_mean}\n")
+print(f"b1_me_mean: {b1_me_mean}\n")
+```
+
+```python
+# variance with and without ME:
+b1_var = np.var(b1, ddof=1)
+b1_me_var = np.var(b1_me, ddof=1)
+print(f"b1_var: {b1_var}\n")
+print(f"b1_me_var: {b1_me_var}\n")
+```
+
+## 9.3 Missing Data and Nonrandom Samples
+
+```python
+# nan and inf handling in numpy:
+x = np.array([-1, 0, 1, np.nan, np.inf, -np.inf])
+logx = np.log(x)
+invx = np.array(1 / x)
+ncdf = np.array(stats.norm.cdf(x))
+isnanx = np.isnan(x)
+
+results = pd.DataFrame(
+ {"x": x, "logx": logx, "invx": invx, "ncdf": ncdf, "isnanx": isnanx},
+)
+print(f"results: \n{results}\n")
+```
+
+```python
+lawsch85 = wool.data("lawsch85")
+lsat_pd = lawsch85["LSAT"]
+
+# create boolean indicator for missings:
+missLSAT = lsat_pd.isna()
+
+# LSAT and indicator for Schools No. 120-129:
+preview = pd.DataFrame({"lsat_pd": lsat_pd[119:129], "missLSAT": missLSAT[119:129]})
+print(f"preview: \n{preview}\n")
+```
+
+```python
+# frequencies of indicator:
+freq_missLSAT = pd.crosstab(missLSAT, columns="count")
+print(f"freq_missLSAT: \n{freq_missLSAT}\n")
+```
+
+```python
+# missings for all variables in data frame (counts):
+miss_all = lawsch85.isna()
+colsums = miss_all.sum(axis=0)
+print(f"colsums: \n{colsums}\n")
+```
+
+```python
+# count the complete cases (rows with no missing values):
+complete_cases = miss_all.sum(axis=1) == 0
+freq_complete_cases = pd.crosstab(complete_cases, columns="count")
+print(f"freq_complete_cases: \n{freq_complete_cases}\n")
+```
+
+```python
+lawsch85 = wool.data("lawsch85")
+
+# missings in numpy:
+x_np = np.array(lawsch85["LSAT"])
+x_np_bar1 = np.mean(x_np)
+x_np_bar2 = np.nanmean(x_np)
+print(f"x_np_bar1: {x_np_bar1}\n")
+print(f"x_np_bar2: {x_np_bar2}\n")
+```
+
+```python
+# missings in pandas:
+x_pd = lawsch85["LSAT"]
+x_pd_bar1 = np.mean(x_pd)
+x_pd_bar2 = np.nanmean(x_pd)
+print(f"x_pd_bar1: {x_pd_bar1}\n")
+print(f"x_pd_bar2: {x_pd_bar2}\n")
+```
+
+```python
+# observations and variables:
+print(f"lawsch85.shape: {lawsch85.shape}\n")
+```
+
+```python
+# regression (missings are taken care of by default):
+reg = smf.ols(formula="np.log(salary) ~ LSAT + cost + age", data=lawsch85)
+results = reg.fit()
+print(f"results.nobs: {results.nobs}\n")
+```
+
+## 9.4 Outlying Observations
+
+```python
+rdchem = wool.data("rdchem")
+
+# OLS regression:
+reg = smf.ols(formula="rdintens ~ sales + profmarg", data=rdchem)
+results = reg.fit()
+
+# studentized residuals for all observations:
+studres = results.get_influence().resid_studentized_external
+
+# display extreme values:
+studres_max = np.max(studres)
+studres_min = np.min(studres)
+print(f"studres_max: {studres_max}\n")
+print(f"studres_min: {studres_min}\n")
+```
+
+```python
+# histogram (and overlaid density plot):
+kde = sm.nonparametric.KDEUnivariate(studres)
+kde.fit()
+
+plt.hist(studres, color="grey", density=True)
+plt.plot(kde.support, kde.density, color="black", linewidth=2)
+plt.ylabel("density")
+plt.xlabel("studres")
+```
+
+## 9.5 Least Absolute Deviations (LAD) Estimation
+
+```python
+rdchem = wool.data("rdchem")
+
+# OLS regression:
+reg_ols = smf.ols(formula="rdintens ~ I(sales/1000) + profmarg", data=rdchem)
+results_ols = reg_ols.fit()
+
+table_ols = pd.DataFrame(
+ {
+ "b": round(results_ols.params, 4),
+ "se": round(results_ols.bse, 4),
+ "t": round(results_ols.tvalues, 4),
+ "pval": round(results_ols.pvalues, 4),
+ },
+)
+print(f"table_ols: \n{table_ols}\n")
+```
+
+```python
+# LAD regression:
+reg_lad = smf.quantreg(formula="rdintens ~ I(sales/1000) + profmarg", data=rdchem)
+results_lad = reg_lad.fit(q=0.5)
+
+table_lad = pd.DataFrame(
+ {
+ "b": round(results_lad.params, 4),
+ "se": round(results_lad.bse, 4),
+ "t": round(results_lad.tvalues, 4),
+ "pval": round(results_lad.pvalues, 4),
+ },
+)
+print(f"table_lad: \n{table_lad}\n")
+```
diff --git a/myst.yml b/myst.yml
index 84ff155..b50a29a 100644
--- a/myst.yml
+++ b/myst.yml
@@ -19,6 +19,7 @@ project:
- file: notebooks/Ch6. MRA - Further Issues.ipynb
- file: notebooks/Ch7. MRA - Qualitative Regressors.ipynb
- file: notebooks/Ch8. Heteroskedasticity.ipynb
+ - file: notebooks/Ch9. Specification and Data Issues.ipynb
site:
template: book-theme
diff --git a/notebooks/Ch7. MRA - Qualitative Regressors.ipynb b/notebooks/Ch7. MRA - Qualitative Regressors.ipynb
index 99fab19..3a67a7f 100644
--- a/notebooks/Ch7. MRA - Qualitative Regressors.ipynb
+++ b/notebooks/Ch7. MRA - Qualitative Regressors.ipynb
@@ -31,11 +31,11 @@
"metadata": {},
"outputs": [],
"source": [
+ "import numpy as np # noqa\n",
"import pandas as pd\n",
"import statsmodels.api as sm\n",
"import statsmodels.formula.api as smf\n",
- "import wooldridge as wool\n",
- "import numpy as np"
+ "import wooldridge as wool"
]
},
{
diff --git a/notebooks/Ch9. Specification and Data Issues.ipynb b/notebooks/Ch9. Specification and Data Issues.ipynb
new file mode 100644
index 0000000..3159a13
--- /dev/null
+++ b/notebooks/Ch9. Specification and Data Issues.ipynb
@@ -0,0 +1,868 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "5d3db9d2",
+ "metadata": {},
+ "source": [
+ "# Ch9. Specification and Data Issues"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "dbadb3c8",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Note: you may need to restart the kernel to use updated packages.\n"
+ ]
+ }
+ ],
+ "source": [
+ "%pip install matplotlib numpy pandas statsmodels wooldridge scipy -q"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "ce9220f5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import statsmodels.api as sm\n",
+ "import statsmodels.formula.api as smf\n",
+ "import statsmodels.stats.outliers_influence as smo\n",
+ "import wooldridge as wool\n",
+ "from scipy import stats"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 9.1 Functional Form Misspecification\n",
+ "\n",
+ "### Example 9.2: Housing Price Equation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "table: \n",
+ " b se t pval\n",
+ "Intercept 166.0973 317.4325 0.5233 0.6022\n",
+ "lotsize 0.0002 0.0052 0.0295 0.9765\n",
+ "sqrft 0.0176 0.2993 0.0588 0.9532\n",
+ "bdrms 2.1749 33.8881 0.0642 0.9490\n",
+ "fitted_sq 0.0004 0.0071 0.0498 0.9604\n",
+ "fitted_cub 0.0000 0.0000 0.2358 0.8142\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "hprice1 = wool.data(\"hprice1\")\n",
+ "\n",
+ "# original OLS:\n",
+ "reg = smf.ols(formula=\"price ~ lotsize + sqrft + bdrms\", data=hprice1)\n",
+ "results = reg.fit()\n",
+ "\n",
+ "# regression for RESET test:\n",
+ "hprice1[\"fitted_sq\"] = results.fittedvalues**2\n",
+ "hprice1[\"fitted_cub\"] = results.fittedvalues**3\n",
+ "reg_reset = smf.ols(\n",
+ " formula=\"price ~ lotsize + sqrft + bdrms + fitted_sq + fitted_cub\",\n",
+ " data=hprice1,\n",
+ ")\n",
+ "results_reset = reg_reset.fit()\n",
+ "\n",
+ "# print regression table:\n",
+ "table = pd.DataFrame(\n",
+ " {\n",
+ " \"b\": round(results_reset.params, 4),\n",
+ " \"se\": round(results_reset.bse, 4),\n",
+ " \"t\": round(results_reset.tvalues, 4),\n",
+ " \"pval\": round(results_reset.pvalues, 4),\n",
+ " },\n",
+ ")\n",
+ "print(f\"table: \\n{table}\\n\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "fstat_man: 4.668205534946464\n",
+ "\n",
+ "fpval_man: 0.012021711442908005\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# RESET test (H0: all coefficients whose names include \"fitted\" are zero):\n",
+ "hypotheses = [\"fitted_sq = 0\", \"fitted_cub = 0\"]\n",
+ "ftest_man = results_reset.f_test(hypotheses)\n",
+ "fstat_man = ftest_man.statistic\n",
+ "fpval_man = ftest_man.pvalue\n",
+ "\n",
+ "print(f\"fstat_man: {fstat_man}\\n\")\n",
+ "print(f\"fpval_man: {fpval_man}\\n\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "fstat_auto: 4.668205534948772\n",
+ "\n",
+ "fpval_auto: 0.012021711442883198\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "hprice1 = wool.data(\"hprice1\")\n",
+ "\n",
+ "# original linear regression:\n",
+ "reg = smf.ols(formula=\"price ~ lotsize + sqrft + bdrms\", data=hprice1)\n",
+ "results = reg.fit()\n",
+ "\n",
+ "# automated RESET test:\n",
+ "reset_output = smo.reset_ramsey(res=results, degree=3)\n",
+ "fstat_auto = reset_output.statistic\n",
+ "fpval_auto = reset_output.pvalue\n",
+ "\n",
+ "print(f\"fstat_auto: {fstat_auto}\\n\")\n",
+ "print(f\"fpval_auto: {fpval_auto}\\n\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "anovaResults1: \n",
+ " df_resid ssr df_diff ss_diff F Pr(>F)\n",
+ "0 84.0 300723.805123 0.0 NaN NaN NaN\n",
+ "1 82.0 252340.364481 2.0 48383.440642 7.861291 0.000753\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "hprice1 = wool.data(\"hprice1\")\n",
+ "\n",
+ "# two alternative models:\n",
+ "reg1 = smf.ols(formula=\"price ~ lotsize + sqrft + bdrms\", data=hprice1)\n",
+ "results1 = reg1.fit()\n",
+ "\n",
+ "reg2 = smf.ols(\n",
+ " formula=\"price ~ np.log(lotsize) +np.log(sqrft) + bdrms\",\n",
+ " data=hprice1,\n",
+ ")\n",
+ "results2 = reg2.fit()\n",
+ "\n",
+ "# encompassing test of Davidson & MacKinnon:\n",
+ "# comprehensive model:\n",
+ "reg3 = smf.ols(\n",
+ " formula=\"price ~ lotsize + sqrft + bdrms + np.log(lotsize) + np.log(sqrft)\",\n",
+ " data=hprice1,\n",
+ ")\n",
+ "results3 = reg3.fit()\n",
+ "\n",
+ "# model 1 vs. comprehensive model:\n",
+ "anovaResults1 = sm.stats.anova_lm(results1, results3)\n",
+ "print(f\"anovaResults1: \\n{anovaResults1}\\n\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "anovaResults2: \n",
+ " df_resid ssr df_diff ss_diff F Pr(>F)\n",
+ "0 84.0 295735.273607 0.0 NaN NaN NaN\n",
+ "1 82.0 252340.364481 2.0 43394.909126 7.05076 0.001494\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# model 2 vs. comprehensive model:\n",
+ "anovaResults2 = sm.stats.anova_lm(results2, results3)\n",
+ "print(f\"anovaResults2: \\n{anovaResults2}\\n\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 9.2 Measurement Error"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "b1_mean: 0.5002159846382418\n",
+ "\n",
+ "b1_me_mean: 0.4999676458235338\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# set the random seed:\n",
+ "np.random.seed(1234567)\n",
+ "\n",
+ "# set sample size and number of simulations:\n",
+ "n = 1000\n",
+ "r = 10000\n",
+ "\n",
+ "# set true parameters (betas):\n",
+ "beta0 = 1\n",
+ "beta1 = 0.5\n",
+ "\n",
+ "# initialize arrays to store results later (b1 without ME, b1_me with ME):\n",
+ "b1 = np.empty(r)\n",
+ "b1_me = np.empty(r)\n",
+ "\n",
+ "# draw a sample of x, fixed over replications:\n",
+ "x = stats.norm.rvs(4, 1, size=n)\n",
+ "\n",
+ "# repeat r times:\n",
+ "for i in range(r):\n",
+ " # draw a sample of u:\n",
+ " u = stats.norm.rvs(0, 1, size=n)\n",
+ "\n",
+ " # draw a sample of ystar:\n",
+ " ystar = beta0 + beta1 * x + u\n",
+ "\n",
+ " # measurement error and mismeasured y:\n",
+ " e0 = stats.norm.rvs(0, 1, size=n)\n",
+ " y = ystar + e0\n",
+ " df = pd.DataFrame({\"ystar\": ystar, \"y\": y, \"x\": x})\n",
+ "\n",
+ " # regress ystar on x and store slope estimate at position i:\n",
+ " reg_star = smf.ols(formula=\"ystar ~ x\", data=df)\n",
+ " results_star = reg_star.fit()\n",
+ " b1[i] = results_star.params[\"x\"]\n",
+ "\n",
+ " # regress y on x and store slope estimate at position i:\n",
+ " reg_me = smf.ols(formula=\"y ~ x\", data=df)\n",
+ " results_me = reg_me.fit()\n",
+ " b1_me[i] = results_me.params[\"x\"]\n",
+ "\n",
+ "# mean with and without ME:\n",
+ "b1_mean = np.mean(b1)\n",
+ "b1_me_mean = np.mean(b1_me)\n",
+ "print(f\"b1_mean: {b1_mean}\\n\")\n",
+ "print(f\"b1_me_mean: {b1_me_mean}\\n\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "b1_var: 0.0010335543409510668\n",
+ "\n",
+ "b1_me_var: 0.0020439380493408005\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# variance with and without ME:\n",
+ "b1_var = np.var(b1, ddof=1)\n",
+ "b1_me_var = np.var(b1_me, ddof=1)\n",
+ "print(f\"b1_var: {b1_var}\\n\")\n",
+ "print(f\"b1_me_var: {b1_me_var}\\n\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "b1_mean: 0.5002159846382418\n",
+ "\n",
+ "b1_me_mean: 0.2445467197788616\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# set the random seed:\n",
+ "np.random.seed(1234567)\n",
+ "\n",
+ "# set sample size and number of simulations:\n",
+ "n = 1000\n",
+ "r = 10000\n",
+ "\n",
+ "# set true parameters (betas):\n",
+ "beta0 = 1\n",
+ "beta1 = 0.5\n",
+ "\n",
+ "# initialize b1 arrays to store results later:\n",
+ "b1 = np.empty(r)\n",
+ "b1_me = np.empty(r)\n",
+ "\n",
+ "# draw a sample of xstar (the true x), fixed over replications:\n",
+ "xstar = stats.norm.rvs(4, 1, size=n)\n",
+ "\n",
+ "# repeat r times:\n",
+ "for i in range(r):\n",
+ " # draw a sample of u:\n",
+ " u = stats.norm.rvs(0, 1, size=n)\n",
+ "\n",
+ " # draw a sample of y:\n",
+ " y = beta0 + beta1 * xstar + u\n",
+ "\n",
+ " # measurement error and mismeasured x:\n",
+ " e1 = stats.norm.rvs(0, 1, size=n)\n",
+ " x = xstar + e1\n",
+ " df = pd.DataFrame({\"y\": y, \"xstar\": xstar, \"x\": x})\n",
+ "\n",
+ " # regress y on xstar and store slope estimate at position i:\n",
+ " reg_star = smf.ols(formula=\"y ~ xstar\", data=df)\n",
+ " results_star = reg_star.fit()\n",
+ " b1[i] = results_star.params[\"xstar\"]\n",
+ "\n",
+ " # regress y on x and store slope estimate at position i:\n",
+ " reg_me = smf.ols(formula=\"y ~ x\", data=df)\n",
+ " results_me = reg_me.fit()\n",
+ " b1_me[i] = results_me.params[\"x\"]\n",
+ "\n",
+ "# mean with and without ME:\n",
+ "b1_mean = np.mean(b1)\n",
+ "b1_me_mean = np.mean(b1_me)\n",
+ "print(f\"b1_mean: {b1_mean}\\n\")\n",
+ "print(f\"b1_me_mean: {b1_me_mean}\\n\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "b1_var: 0.0010335543409510668\n",
+ "\n",
+ "b1_me_var: 0.0005435611029837354\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# variance with and without ME:\n",
+ "b1_var = np.var(b1, ddof=1)\n",
+ "b1_me_var = np.var(b1_me, ddof=1)\n",
+ "print(f\"b1_var: {b1_var}\\n\")\n",
+ "print(f\"b1_me_var: {b1_me_var}\\n\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 9.3 Missing Data and Nonrandom Samples"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "results: \n",
+ " x logx invx ncdf isnanx\n",
+ "0 -1.0 NaN -1.0 0.158655 False\n",
+ "1 0.0 -inf inf 0.500000 False\n",
+ "2 1.0 0.0 1.0 0.841345 False\n",
+ "3 NaN NaN NaN NaN True\n",
+ "4 inf inf 0.0 1.000000 False\n",
+ "5 -inf NaN -0.0 0.000000 False\n",
+ "\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/tmp/ipykernel_16428/3106953107.py:3: RuntimeWarning: divide by zero encountered in log\n",
+ " logx = np.log(x)\n",
+ "/tmp/ipykernel_16428/3106953107.py:3: RuntimeWarning: invalid value encountered in log\n",
+ " logx = np.log(x)\n",
+ "/tmp/ipykernel_16428/3106953107.py:4: RuntimeWarning: divide by zero encountered in divide\n",
+ " invx = np.array(1 / x)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# nan and inf handling in numpy:\n",
+ "x = np.array([-1, 0, 1, np.nan, np.inf, -np.inf])\n",
+ "logx = np.log(x)\n",
+ "invx = np.array(1 / x)\n",
+ "ncdf = np.array(stats.norm.cdf(x))\n",
+ "isnanx = np.isnan(x)\n",
+ "\n",
+ "results = pd.DataFrame(\n",
+ " {\"x\": x, \"logx\": logx, \"invx\": invx, \"ncdf\": ncdf, \"isnanx\": isnanx},\n",
+ ")\n",
+ "print(f\"results: \\n{results}\\n\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "preview: \n",
+ " lsat_pd missLSAT\n",
+ "119 156.0 False\n",
+ "120 159.0 False\n",
+ "121 157.0 False\n",
+ "122 167.0 False\n",
+ "123 NaN True\n",
+ "124 158.0 False\n",
+ "125 155.0 False\n",
+ "126 157.0 False\n",
+ "127 NaN True\n",
+ "128 163.0 False\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "lawsch85 = wool.data(\"lawsch85\")\n",
+ "lsat_pd = lawsch85[\"LSAT\"]\n",
+ "\n",
+ "# create boolean indicator for missings:\n",
+ "missLSAT = lsat_pd.isna()\n",
+ "\n",
+ "# LSAT and indicator for Schools No. 120-129:\n",
+ "preview = pd.DataFrame({\"lsat_pd\": lsat_pd[119:129], \"missLSAT\": missLSAT[119:129]})\n",
+ "print(f\"preview: \\n{preview}\\n\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "freq_missLSAT: \n",
+ "col_0 count\n",
+ "LSAT \n",
+ "False 150\n",
+ "True 6\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# frequencies of indicator:\n",
+ "freq_missLSAT = pd.crosstab(missLSAT, columns=\"count\")\n",
+ "print(f\"freq_missLSAT: \\n{freq_missLSAT}\\n\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "colsums: \n",
+ "rank 0\n",
+ "salary 8\n",
+ "cost 6\n",
+ "LSAT 6\n",
+ "GPA 7\n",
+ "libvol 1\n",
+ "faculty 4\n",
+ "age 45\n",
+ "clsize 3\n",
+ "north 0\n",
+ "south 0\n",
+ "east 0\n",
+ "west 0\n",
+ "lsalary 8\n",
+ "studfac 6\n",
+ "top10 0\n",
+ "r11_25 0\n",
+ "r26_40 0\n",
+ "r41_60 0\n",
+ "llibvol 1\n",
+ "lcost 6\n",
+ "dtype: int64\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# missings for all variables in data frame (counts):\n",
+ "miss_all = lawsch85.isna()\n",
+ "colsums = miss_all.sum(axis=0)\n",
+ "print(f\"colsums: \\n{colsums}\\n\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "freq_complete_cases: \n",
+ "col_0 count\n",
+ "row_0 \n",
+ "False 66\n",
+ "True 90\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# count the complete cases (rows with no missing values):\n",
+ "complete_cases = miss_all.sum(axis=1) == 0\n",
+ "freq_complete_cases = pd.crosstab(complete_cases, columns=\"count\")\n",
+ "print(f\"freq_complete_cases: \\n{freq_complete_cases}\\n\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "x_np_bar1: nan\n",
+ "\n",
+ "x_np_bar2: 158.29333333333332\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "lawsch85 = wool.data(\"lawsch85\")\n",
+ "\n",
+ "# missings in numpy:\n",
+ "x_np = np.array(lawsch85[\"LSAT\"])\n",
+ "x_np_bar1 = np.mean(x_np)\n",
+ "x_np_bar2 = np.nanmean(x_np)\n",
+ "print(f\"x_np_bar1: {x_np_bar1}\\n\")\n",
+ "print(f\"x_np_bar2: {x_np_bar2}\\n\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "x_pd_bar1: 158.29333333333332\n",
+ "\n",
+ "x_pd_bar2: 158.29333333333332\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# missings in pandas:\n",
+ "x_pd = lawsch85[\"LSAT\"]\n",
+ "x_pd_bar1 = np.mean(x_pd)\n",
+ "x_pd_bar2 = np.nanmean(x_pd)\n",
+ "print(f\"x_pd_bar1: {x_pd_bar1}\\n\")\n",
+ "print(f\"x_pd_bar2: {x_pd_bar2}\\n\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "lawsch85.shape: (156, 21)\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# observations and variables:\n",
+ "print(f\"lawsch85.shape: {lawsch85.shape}\\n\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "results.nobs: 95.0\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "# regression (missings are taken care of by default):\n",
+ "reg = smf.ols(formula=\"np.log(salary) ~ LSAT + cost + age\", data=lawsch85)\n",
+ "results = reg.fit()\n",
+ "print(f\"results.nobs: {results.nobs}\\n\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 9.4 Outlying Observations"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "studres_max: 4.555033421514247\n",
+ "\n",
+ "studres_min: -1.8180393952811693\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "rdchem = wool.data(\"rdchem\")\n",
+ "\n",
+ "# OLS regression:\n",
+ "reg = smf.ols(formula=\"rdintens ~ sales + profmarg\", data=rdchem)\n",
+ "results = reg.fit()\n",
+ "\n",
+ "# studentized residuals for all observations:\n",
+ "studres = results.get_influence().resid_studentized_external\n",
+ "\n",
+ "# display extreme values:\n",
+ "studres_max = np.max(studres)\n",
+ "studres_min = np.min(studres)\n",
+ "print(f\"studres_max: {studres_max}\\n\")\n",
+ "print(f\"studres_min: {studres_min}\\n\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Text(0.5, 0, 'studres')"
+ ]
+ },
+ "execution_count": 22,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "",
+ "text/plain": [
+ "