From 3097287260ace8236c0acdbbec49f277d28f2135 Mon Sep 17 00:00:00 2001
From: Pegah-Ardehkhani <89220787+Pegah-Ardehkhani@users.noreply.github.com>
Date: Wed, 15 Jun 2022 15:15:02 +0430
Subject: [PATCH] Add files via upload
---
Chapter 4 Parametric Hypothesis Testing.ipynb | 2544 +++++++++++++++++
1 file changed, 2544 insertions(+)
create mode 100644 Chapter 4 Parametric Hypothesis Testing.ipynb
diff --git a/Chapter 4 Parametric Hypothesis Testing.ipynb b/Chapter 4 Parametric Hypothesis Testing.ipynb
new file mode 100644
index 0000000..111dccc
--- /dev/null
+++ b/Chapter 4 Parametric Hypothesis Testing.ipynb
@@ -0,0 +1,2544 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "name": "Chapter 4: Parametric Hypothesis Testing.ipynb",
+ "provenance": [],
+ "collapsed_sections": [
+ "WJhmgDxsVHEO",
+ "VAqEz3tk15jW",
+ "Kc6Ospy5qm2f",
+ "L5FYq4KIr5tR",
+ "btrLaUKzN1PY",
+ "4V1_eXUkxJD4",
+ "58w24h7DxfUc",
+ "rkSluAxdVb1M",
+ "Br8LlPdh4e9R",
+ "ZJbTadX1NnaL",
+ "a6nq2C5NlRKo",
+ "I2zKIWZWFf4V",
+ "ro9fDzmQU8IZ"
+ ]
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# **Chapter 4: Parametric Hypothesis Testing**"
+ ],
+ "metadata": {
+ "id": "a2GtsO5MH734"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Table of Contents:**\n",
+ "\n",
+ "- [Import Libraries](#Import_Libraries)\n",
+ "- [4.1. Introduction](#Introduction)\n",
+ "- [4.2. Test Concerning the Mean of a Normal Population](#Test_Concerning_the_Mean_of_a_Normal_Population)\n",
+ " - [4.2.1. Known Standard Deviation](#Known_Standard_Deviation)\n",
+ " - [4.2.2. Unknown Standard Deviation](#Unknown_Standard_Deviation)\n",
+ "- [4.3. Test Concerning the Equality of Means of Two Normal Populations](#Test_Concerning_the_Equality_of_Means_of_Two_Normal_Populations)\n",
+ " - [4.3.1. Known Variances](#Known_Variances)\n",
+ " - [4.3.2. Unknown but Equal Variances](#Unknown_but_Equal_Variances)\n",
+ "- [4.4. Paired t-test](#Paired_t-test)\n",
+ "- [4.5. Test Concerning the Variance of a Normal Population](#Test_Concerning_the_Variance_of_a_Normal_Population)\n",
+ "- [4.6. Test Concerning the Equality of Variances of Two Normal Populations](#Test_Concerning_the_Equality_of_Variances_of_Two_Normal_Populations)\n",
+ "- [4.7. Test Concerning P in Bernoulli Populations](#Test_Concerning_P_in_Bernoulli_Populations)\n",
+ "- [4.8. Test Concerning the Equality of P in Two Bernoulli Populations](#Test_Concerning_the_Equality_of_P_in_Two_Bernoulli_Populations)\n"
+ ],
+ "metadata": {
+ "id": "dGYw5uzq5aM7"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "\n",
+ "## **Import Libraries**"
+ ],
+ "metadata": {
+ "id": "WJhmgDxsVHEO"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "!pip install --upgrade scipy"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "vXStgb2JU6c0",
+ "outputId": "c3be2276-b0e8-4feb-c137-f726380b0ac9"
+ },
+ "execution_count": 1,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n",
+ "Requirement already satisfied: scipy in /usr/local/lib/python3.7/dist-packages (1.4.1)\n",
+ "Collecting scipy\n",
+ " Downloading scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38.1 MB)\n",
+ "\u001b[K |████████████████████████████████| 38.1 MB 18.4 MB/s \n",
+ "\u001b[?25hRequirement already satisfied: numpy<1.23.0,>=1.16.5 in /usr/local/lib/python3.7/dist-packages (from scipy) (1.21.6)\n",
+ "Installing collected packages: scipy\n",
+ " Attempting uninstall: scipy\n",
+ " Found existing installation: scipy 1.4.1\n",
+ " Uninstalling scipy-1.4.1:\n",
+ " Successfully uninstalled scipy-1.4.1\n",
+ "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
+ "albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.\u001b[0m\n",
+ "Successfully installed scipy-1.7.3\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import matplotlib.patches as mpatches\n",
+ "import seaborn as sns\n",
+ "import math\n",
+ "from scipy import stats\n",
+ "from scipy.stats import norm\n",
+ "from scipy.stats import chi2\n",
+ "from scipy.stats import t\n",
+ "from scipy.stats import f\n",
+ "from scipy.stats import bernoulli\n",
+ "from scipy.stats import binom\n",
+ "from scipy.stats import nbinom\n",
+ "from scipy.stats import geom\n",
+ "from scipy.stats import poisson\n",
+ "from scipy.stats import uniform\n",
+ "from scipy.stats import randint\n",
+ "from scipy.stats import expon\n",
+ "from scipy.stats import gamma\n",
+ "from scipy.stats import beta\n",
+ "from scipy.stats import weibull_min\n",
+ "from scipy.stats import hypergeom\n",
+ "from scipy.stats import shapiro\n",
+ "from scipy.stats import pearsonr\n",
+ "from scipy.stats import normaltest\n",
+ "from scipy.stats import anderson\n",
+ "from scipy.stats import spearmanr\n",
+ "from scipy.stats import kendalltau\n",
+ "from scipy.stats import chi2_contingency\n",
+ "from scipy.stats import ttest_ind\n",
+ "from scipy.stats import ttest_rel\n",
+ "from scipy.stats import mannwhitneyu\n",
+ "from scipy.stats import wilcoxon\n",
+ "from scipy.stats import kruskal\n",
+ "from scipy.stats import friedmanchisquare\n",
+ "from statsmodels.tsa.stattools import adfuller\n",
+ "from statsmodels.tsa.stattools import kpss\n",
+ "from statsmodels.stats.weightstats import ztest\n",
+ "from scipy.integrate import quad\n",
+ "from IPython.display import display, Latex\n",
+ "\n",
+ "import warnings\n",
+ "warnings.filterwarnings('ignore')\n",
+ "warnings.simplefilter(action='ignore', category=FutureWarning)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "ZPuphzTmU-P8",
+ "outputId": "16e994f6-8530-4d26-a3f4-34a299ddbd95"
+ },
+ "execution_count": 2,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stderr",
+ "text": [
+ "/usr/local/lib/python3.7/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.\n",
+ " import pandas.util.testing as tm\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "\n",
+ "## **4.1. Introduction:**"
+ ],
+ "metadata": {
+ "id": "VAqEz3tk15jW"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Statistical Hypothesis:** A statistical hypothesis is usually a statement about a set of parameters of a population distribution.\n",
+ "\n",
+ "**$H_0$ (Null Hypothesis):** The null hypothesis is a statistical hypothesis to be tested and accepted or rejected in favor of an alternative.\n",
+ "\n",
+ "**$H_1$ (Alternative Hypothesis):** An alternative hypothesis is an opposing theory in relation to the null hypothesis.\n",
+ "\n",
+ "**Type $\\mathrm{I}$ Error:** A type $\\mathrm{I}$ error is said to result if the test incorrectly calls for rejecting $H_0$ when it is indeed correct.\n",
+ "\n",
+ "$\\alpha = P(reject\\ H_0\\ |\\ H_0\\ is\\ true)$\n",
+ "\n",
+ "**Type $\\mathrm{II}$ Error:** A type $\\mathrm{II}$ error results if the test calls for accepting $H_0$ when it is false.\n",
+ "\n",
+ "$\\beta = P(Accept\\ H_0\\ |\\ H_0\\ is\\ not\\ true)$\n",
+ "\n",
+ "**Significance Level:** Whenever $H_0$ is true, its probability of being rejected is never greater than $\\alpha$. The value $\\alpha$, called the level of significance of the test, is usually set in advance, with commonly chosen values being $\\alpha = 0.1, 0.05, 0.005$.\n",
+ "\n",
+ "**P_value:** The P value, or calculated probability, is the probability of finding the observed, or more extreme, results when the null hypothesis ($H_0$) of a study question is true — the definition of ‘extreme’ depends on how the hypothesis is being tested.\n",
+ "\n",
+ "If your P value is less than the chosen significance level then you reject the null hypothesis i.e. accept that your sample gives reasonable evidence to support the alternative hypothesis."
+ ],
+ "metadata": {
+ "id": "ObVoGiq9mQ_D"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "<table>\n",
+ "<tr><th>.</th><th>$H_0$ is actually true</th><th>$H_0$ is not true</th></tr>\n",
+ "<tr><td>Accept $H_0$</td><td>$1-\\alpha$</td><td>$\\beta$</td></tr>\n",
+ "<tr><td>Reject $H_0$</td><td>$\\alpha$</td><td>$1-\\beta$</td></tr>\n",
+ "</table>"
+ ],
+ "metadata": {
+ "id": "YepvfOWIIcEk"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Steps of hypothesis testing:**\n",
+ "\n",
+ "1. Specify the null and alternative hypotheses.\n",
+ "2. Collect a random sample from the population. \n",
+ "3. Calculate the test statistic and the corresponding P-value.\n",
+ "4. Decide whether to reject or fail to reject the null hypothesis."
+ ],
+ "metadata": {
+ "id": "UaFVzosSs5oP"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "\n",
+ "## **4.2. Test Concerning the Mean of a Normal Population:**"
+ ],
+ "metadata": {
+ "id": "Kc6Ospy5qm2f"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "\n",
+ "### **4.2.1. Known Standard Deviation:**"
+ ],
+ "metadata": {
+ "id": "L5FYq4KIr5tR"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**A. Two Tailed Test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ from a normal distribution having an unknown mean $\\mu$ and a known variance $\\sigma^2$.\n",
+ "\n",
+ "$X_1, X_2, ..., X_n \\sim N( \\mu, \\sigma^2)$\n",
+ "\n",
+ "$H_0:\\ \\mu = \\mu_0$\n",
+ "\n",
+ "$H_1:\\ \\mu \\neq \\mu_0$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$Z_0 \\equiv \\frac{\\overline{X}-\\mu_0}{\\frac{\\sigma}{\\sqrt{n}}}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $-\\ Z_{\\frac{\\alpha}{2}}\\ <\\ Z_0 = \\frac{\\overline{X}-\\mu_0}{\\frac{\\sigma}{\\sqrt{n}}}\\ <\\ Z_{\\frac{\\alpha}{2}}$\n",
+ "\n",
+ "2. $\\mu_0\\ -\\ Z_{\\frac{\\alpha}{2}} \\frac{\\sigma}{\\sqrt{n}}\\ <\\ \\overline{X}\\ <\\ \\mu_0\\ +\\ Z_{\\frac{\\alpha}{2}} \\frac{\\sigma}{\\sqrt{n}}$\n",
+ "\n",
+ "3. $|\\overline{X}-\\mu_0| < Z_{\\frac{\\alpha}{2}} \\frac{\\sigma}{\\sqrt{n}}$\n",
+ "\n",
+ "4. P_value $ = 2 \\times P(Z \\geq |Z_0|) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "9PWV-JGeJyVN"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**B. One-sided tests:**\n",
+ "\n",
+ "**Right-tailed test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ from a normal distribution having an unknown mean $\\mu$ and a known variance $\\sigma^2$.\n",
+ "\n",
+ "$X_1, X_2, ..., X_n \\sim N( \\mu, \\sigma^2)$\n",
+ "\n",
+ "$H_0:\\ \\mu = \\mu_0\\ \\quad or \\quad H_0:\\ \\mu \\leq \\mu_0$\n",
+ "\n",
+ "$H_1:\\ \\mu > \\mu_0$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$Z_0 \\equiv \\frac{\\overline{X}-\\mu_0}{\\frac{\\sigma}{\\sqrt{n}}}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $ -\\infty <\\ Z_0 = \\frac{\\overline{X}-\\mu_0}{\\frac{\\sigma}{\\sqrt{n}}}\\ <\\ Z_{\\alpha} $\n",
+ "\n",
+ "2. $- \\infty \\ <\\ \\overline{X}\\ <\\ \\mu_0\\ +\\ Z_{\\alpha} \\frac{\\sigma}{\\sqrt{n}}$\n",
+ "\n",
+ "3. P_value $ = P(Z \\geq Z_0) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "31PJg_ETxNMP"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Left-tailed test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ from a normal distribution having an unknown mean $\\mu$ and a known variance $\\sigma^2$.\n",
+ "\n",
+ "$X_1, X_2, ..., X_n \\sim N( \\mu, \\sigma^2)$\n",
+ "\n",
+ "$H_0:\\ \\mu = \\mu_0 \\quad or \\quad H_0:\\ \\mu \\geq \\mu_0$\n",
+ "\n",
+ "$H_1:\\ \\mu < \\mu_0$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$Z_0 \\equiv \\frac{\\overline{X}-\\mu_0}{\\frac{\\sigma}{\\sqrt{n}}}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $-\\ Z_{\\alpha}\\ <\\ Z_0 = \\frac{\\overline{X}-\\mu_0}{\\frac{\\sigma}{\\sqrt{n}}}\\ <\\ \\infty $\n",
+ "\n",
+ "2. $\\ \\mu_0\\ -\\ Z_{\\alpha} \\frac{\\sigma}{\\sqrt{n}}\\ <\\ \\overline{X}\\ <\\ \\infty$\n",
+ "\n",
+ "3. P_value $ = P(Z \\leq Z_0) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "rhL-meIV0GAB"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "class mean_with_known_variance:\n",
+ "    \"\"\"\n",
+ "    Z-test for the mean of a normal population whose standard deviation is known.\n",
+ "    The statistic and P_value are printed and also kept as `test_statistic` and `p_value`.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    null_mean : u0\n",
+ "    population_sd : known standard deviation of the population, must be positive\n",
+ "    n : optional, number of sample members, must be positive when data is not passed\n",
+ "    alpha : significance level\n",
+ "    type_t : 'two_tailed', 'right_tailed', 'left_tailed' (anything else raises ValueError)\n",
+ "    Sample_mean : optional, mean of the sample\n",
+ "    data : optional, if you do not know the sample mean, just pass the data (must not be empty)\n",
+ "    \"\"\"\n",
+ "    def __init__(self, null_mean, population_sd, alpha, type_t, n = 0., Sample_mean = 0., data=None):\n",
+ "        if type_t not in ('two_tailed', 'right_tailed', 'left_tailed'):\n",
+ "            raise ValueError(f'type_t must be two_tailed, right_tailed or left_tailed, got {type_t!r}')\n",
+ "        if data is not None:\n",
+ "            # Materialise once: a one-shot iterable would be empty on a second pass.\n",
+ "            sample = list(data)\n",
+ "            Sample_mean, n = (np.mean(sample) if sample else 0.), len(sample)\n",
+ "        if n <= 0 or population_sd <= 0:\n",
+ "            # Otherwise the statistic degenerates to 0, inf or nan and the verdict is meaningless.\n",
+ "            raise ValueError('the sample size and population_sd must be positive')\n",
+ "        self.null_mean, self.population_sd, self.alpha = null_mean, population_sd, alpha\n",
+ "        self.Sample_mean, self.n, self.type_t, self.data = Sample_mean, n, type_t, data\n",
+ "        self.__test()\n",
+ "\n",
+ "    def __test(self):\n",
+ "        test_statistic = (self.Sample_mean-self.null_mean) / (self.population_sd/np.sqrt(self.n))\n",
+ "        if self.type_t == 'two_tailed':\n",
+ "            p_value = 2*(1-norm.cdf(abs(test_statistic)))\n",
+ "        elif self.type_t == 'left_tailed':\n",
+ "            p_value = (norm.cdf(test_statistic))\n",
+ "        else:\n",
+ "            p_value = (1-norm.cdf(test_statistic))\n",
+ "        self.test_statistic, self.p_value = test_statistic, p_value\n",
+ "        print('test_statistic:', test_statistic, '\\n')\n",
+ "        print('P_value:', p_value, '\\n')\n",
+ "        if p_value < self.alpha:\n",
+ "            print(f'Since p_value < {self.alpha}, reject null hypothesis.')\n",
+ "        else:\n",
+ "            print(f'Since p_value > {self.alpha}, the null hypothesis cannot be rejected.')"
+ ],
+ "metadata": {
+ "id": "lX6jag0Rlm0r"
+ },
+ "execution_count": 3,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "mean_with_known_variance(null_mean=10, population_sd=1, alpha=0.05, type_t='two_tailed', n=16, Sample_mean=8.75);"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "7X8RAhJ2lp86",
+ "outputId": "cb26aaa5-0684-467f-a5b5-d074a9a7c622"
+ },
+ "execution_count": 4,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "test_statistic: -5.0 \n",
+ "\n",
+ "P_value: 5.733031438470704e-07 \n",
+ "\n",
+ "Since p_value < 0.05, reject null hypothesis.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "np.random.seed(1)\n",
+ "data = np.random.normal(loc=0, scale=1, size=100)\n",
+ "mean_with_known_variance(null_mean=0.1, population_sd=1, alpha=0.05, type_t='left_tailed', data=data);"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "WiTfjm-Blw9Z",
+ "outputId": "59b6eb97-6fe6-4e80-a2c1-91152e4288a1"
+ },
+ "execution_count": 5,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "test_statistic: -0.394171479243013 \n",
+ "\n",
+ "P_value: 0.34672722046703075 \n",
+ "\n",
+ "Since p_value > 0.05, the null hypothesis cannot be rejected.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "\n",
+ "### **4.2.2. Unknown Standard Deviation:**\n"
+ ],
+ "metadata": {
+ "id": "btrLaUKzN1PY"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**A. Two Tailed Test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ from a normal distribution having an unknown mean $\\mu$ and an unknown variance $\\sigma^2$.\n",
+ "\n",
+ "$X_1, X_2, ..., X_n \\sim N( \\mu, \\sigma^2)$\n",
+ "\n",
+ "$H_0:\\ \\mu = \\mu_0$\n",
+ "\n",
+ "$H_1:\\ \\mu \\neq \\mu_0$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$t_0 \\equiv \\frac{\\overline{X}-\\mu_0}{\\frac{S}{\\sqrt{n}}} \\sim t_{n-1}$\n",
+ "\n",
+ "$S = \\sqrt{\\frac{\\sum_{i=1}^n\\ (x_i\\ -\\ \\overline{x})^2}{n-1}}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $-\\ t_{\\frac{\\alpha}{2},n-1}\\ <\\ t_0 = \\frac{\\overline{X}-\\mu_0}{\\frac{S}{\\sqrt{n}}} <\\ t_{\\frac{\\alpha}{2},n-1}$\n",
+ "\n",
+ "2. $\\mu_0\\ -\\ t_{\\frac{\\alpha}{2},n-1} \\frac{S}{\\sqrt{n}}\\ <\\ \\overline{X}\\ <\\ \\mu_0\\ +\\ t_{\\frac{\\alpha}{2},n-1} \\frac{S}{\\sqrt{n}}$\n",
+ "\n",
+ "3. $|\\overline{X}-\\mu_0| < t_{\\frac{\\alpha}{2},n-1} \\frac{S}{\\sqrt{n}}$\n",
+ "\n",
+ "4. P_value $ = 2 \\times P(t_{n-1} \\geq |t_0|) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "2b3-4p0tQKdv"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**B. One-sided tests:**\n",
+ "\n",
+ "**Right-tailed test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ from a normal distribution having an unknown mean $\\mu$ and an unknown variance $\\sigma^2$.\n",
+ "\n",
+ "$X_1, X_2, ..., X_n \\sim N( \\mu, \\sigma^2)$\n",
+ "\n",
+ "$H_0:\\ \\mu = \\mu_0\\ \\quad or \\quad H_0:\\ \\mu \\leq \\mu_0$\n",
+ "\n",
+ "$H_1:\\ \\mu > \\mu_0$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$t_0 \\equiv \\frac{\\overline{X}-\\mu_0}{\\frac{S}{\\sqrt{n}}} \\sim t_{n-1}$\n",
+ "\n",
+ "$S = \\sqrt{\\frac{\\sum_{i=1}^n\\ (x_i\\ -\\ \\overline{x})^2}{n-1}}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $ -\\infty <\\ t_0 = \\frac{\\overline{X}-\\mu_0}{\\frac{S}{\\sqrt{n}}}\\ <\\ t_{\\alpha, n-1} $\n",
+ "\n",
+ "2. $- \\infty \\ <\\ \\overline{X}\\ <\\ \\mu_0\\ +\\ t_{\\alpha, n-1} \\frac{S}{\\sqrt{n}}$\n",
+ "\n",
+ "3. P_value $ = P(t_{n-1} \\geq t_0) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "EDXY_X3im-39"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Left-tailed test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ from a normal distribution having an unknown mean $\\mu$ and an unknown variance $\\sigma^2$.\n",
+ "\n",
+ "$X_1, X_2, ..., X_n \\sim N( \\mu, \\sigma^2)$\n",
+ "\n",
+ "$H_0:\\ \\mu = \\mu_0 \\quad or \\quad H_0:\\ \\mu \\geq \\mu_0$\n",
+ "\n",
+ "$H_1:\\ \\mu < \\mu_0$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$t_0 \\equiv \\frac{\\overline{X}-\\mu_0}{\\frac{S}{\\sqrt{n}}} \\sim t_{n-1}$\n",
+ "\n",
+ "$S = \\sqrt{\\frac{\\sum_{i=1}^n\\ (x_i\\ -\\ \\overline{x})^2}{n-1}}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $ -\\ t_{\\alpha, n-1} <\\ t_0 = \\frac{\\overline{X}-\\mu_0}{\\frac{S}{\\sqrt{n}}}\\ <\\ \\infty$\n",
+ "\n",
+ "2. $\\ \\mu_0\\ -\\ t_{\\alpha, n-1} \\frac{S}{\\sqrt{n}} <\\ \\overline{X}\\ <\\ \\infty $\n",
+ "\n",
+ "3. P_value $ = P(t_{n-1} \\leq t_0) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "BjSK1-M-oaJU"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "class mean_with_unknown_variance:\n",
+ "    \"\"\"\n",
+ "    One-sample t-test for the mean of a normal population whose standard deviation is unknown.\n",
+ "    The statistic and P_value are printed and also kept as `test_statistic` and `p_value`.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    null_mean : u0\n",
+ "    n : optional, number of sample members (at least 2)\n",
+ "    S : optional, sample standard deviation (positive)\n",
+ "    alpha : significance level\n",
+ "    type_t : 'two_tailed', 'right_tailed', 'left_tailed' (anything else raises ValueError)\n",
+ "    Sample_mean : optional, mean of the sample\n",
+ "    data : optional, if you do not know the sample mean and S, just pass the data (at least 2 values)\n",
+ "    \"\"\"\n",
+ "    def __init__(self, null_mean, alpha, type_t, n = 0.,S = 0., Sample_mean = 0., data=None):\n",
+ "        if type_t not in ('two_tailed', 'right_tailed', 'left_tailed'):\n",
+ "            raise ValueError(f'type_t must be two_tailed, right_tailed or left_tailed, got {type_t!r}')\n",
+ "        if data is not None:\n",
+ "            sample = list(data)  # materialise once: a one-shot iterable cannot be read three times\n",
+ "            n = len(sample)\n",
+ "            if n > 1:\n",
+ "                Sample_mean, S = np.mean(sample), np.std(sample, ddof=1)\n",
+ "        if n < 2 or S <= 0:\n",
+ "            raise ValueError('at least two observations and a positive S are required (the t statistic has n-1 degrees of freedom)')\n",
+ "        self.null_mean, self.alpha, self.type_t, self.data = null_mean, alpha, type_t, data\n",
+ "        self.Sample_mean, self.S, self.n = Sample_mean, S, n\n",
+ "        self.__test()\n",
+ "\n",
+ "    def __test(self):\n",
+ "        test_statistic = (self.Sample_mean-self.null_mean) / (self.S/np.sqrt(self.n))\n",
+ "        if self.type_t == 'two_tailed':\n",
+ "            p_value = 2*(1-t.cdf(abs(test_statistic), df=self.n-1))\n",
+ "        elif self.type_t == 'left_tailed':\n",
+ "            p_value = (t.cdf(test_statistic, df=self.n-1))\n",
+ "        else:\n",
+ "            p_value = (1-t.cdf(test_statistic, df=self.n-1))\n",
+ "        self.test_statistic, self.p_value = test_statistic, p_value\n",
+ "        print('test_statistic:', test_statistic, '\\n')\n",
+ "        print('P_value:', p_value, '\\n')\n",
+ "        if p_value < self.alpha:\n",
+ "            print(f'Since p_value < {self.alpha}, reject null hypothesis.')\n",
+ "        else:\n",
+ "            print(f'Since p_value > {self.alpha}, the null hypothesis cannot be rejected.')"
+ ],
+ "metadata": {
+ "id": "nt7giKLcfxtx"
+ },
+ "execution_count": 6,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "mean_with_unknown_variance(null_mean=14.2, alpha=0.05, type_t='two_tailed', n=9, S=0.3, Sample_mean=14.5);"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "60yeQGM1k0cZ",
+ "outputId": "ebf8ef2b-0722-48cf-c024-a053823acad8"
+ },
+ "execution_count": 7,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "test_statistic: 3.0000000000000075 \n",
+ "\n",
+ "P_value: 0.017071681233782554 \n",
+ "\n",
+ "Since p_value < 0.05, reject null hypothesis.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "np.random.seed(1)\n",
+ "data = np.random.normal(loc=0, scale=1, size=100)\n",
+ "mean_with_unknown_variance(null_mean=0.1, alpha=0.05, type_t='two_tailed', data=data);"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "cIFNzvbIqov6",
+ "outputId": "11b26f64-0e67-4f9a-d316-542cdb0bfb90"
+ },
+ "execution_count": 8,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "test_statistic: -0.4430807396299341 \n",
+ "\n",
+ "P_value: 0.6586740373655937 \n",
+ "\n",
+ "Since p_value > 0.05, the null hypothesis cannot be rejected.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "\n",
+ "## **4.3. Test Concerning the Equality of Means of Two Normal Populations:**"
+ ],
+ "metadata": {
+ "id": "4V1_eXUkxJD4"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "\n",
+ "### **4.3.1. Known Variances:**"
+ ],
+ "metadata": {
+ "id": "58w24h7DxfUc"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**A. Two Tailed Test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_{n_x}$ is a sample of size $n_x$ from a normal distribution having an unknown mean $\\mu_x$ and a known variance $\\sigma^2_x$. \n",
+ "\n",
+ "$Y_1, Y_2, ..., Y_{n_y}$ is a sample of size $n_y$ from a normal distribution having an unknown mean $\\mu_y$ and a known variance $\\sigma^2_y$.\n",
+ "\n",
+ "$X_1, X_2, ..., X_{n_x} \\sim N( \\mu_x, \\sigma^2_x)$\n",
+ "\n",
+ "$Y_1, Y_2, ..., Y_{n_y} \\sim N( \\mu_y, \\sigma^2_y)$\n",
+ "\n",
+ "$H_0:\\ \\mu_x = \\mu_y$\n",
+ "\n",
+ "$H_1:\\ \\mu_x \\neq \\mu_y$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$Z_0 \\equiv \\frac{\\overline{X}-\\overline{Y} -\\ 0}{\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + (\\frac{\\sigma^2_y}{n_y})}}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $-\\ Z_{\\frac{\\alpha}{2}}\\ <\\ Z_0 = \\frac{\\overline{X}-\\overline{Y} -\\ 0}{\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + (\\frac{\\sigma^2_y}{n_y})}}\\ <\\ Z_{\\frac{\\alpha}{2}}$\n",
+ "\n",
+ "2. $-\\ Z_{\\frac{\\alpha}{2}} {\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + (\\frac{\\sigma^2_y}{n_y})}}\\ <\\ \\overline{X}\\ - \\overline{Y}\\ <\\ Z_{\\frac{\\alpha}{2}} {\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + (\\frac{\\sigma^2_y}{n_y})}}$\n",
+ "\n",
+ "3. $|\\overline{X}-\\overline{Y}| < Z_{\\frac{\\alpha}{2}} {\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + (\\frac{\\sigma^2_y}{n_y})}}$\n",
+ "\n",
+ "4. P_value $ = 2 \\times P(Z \\geq |Z_0|) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "CkDCKD4ZHYtP"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**B. One-sided tests:**\n",
+ "\n",
+ "**Right-tailed test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_{n_x}$ is a sample of size $n_x$ from a normal distribution having an unknown mean $\\mu_x$ and a known variance $\\sigma^2_x$. \n",
+ "\n",
+ "$Y_1, Y_2, ..., Y_{n_y}$ is a sample of size $n_y$ from a normal distribution having an unknown mean $\\mu_y$ and a known variance $\\sigma^2_y$.\n",
+ "\n",
+ "$X_1, X_2, ..., X_{n_x} \\sim N( \\mu_x, \\sigma^2_x)$\n",
+ "\n",
+ "$Y_1, Y_2, ..., Y_{n_y} \\sim N( \\mu_y, \\sigma^2_y)$\n",
+ "\n",
+ "$H_0:\\ \\mu_x = \\mu_y\\ \\quad or \\quad H_0:\\ \\mu_x \\leq \\mu_y$\n",
+ "\n",
+ "$H_1:\\ \\mu_x > \\mu_y$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$Z_0 \\equiv \\frac{\\overline{X}-\\overline{Y} - 0}{\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + (\\frac{\\sigma^2_y}{n_y})}}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $ -\\infty <\\ Z_0 = \\frac{\\overline{X}-\\overline{Y} - 0}{\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + (\\frac{\\sigma^2_y}{n_y})}}\\ <\\ Z_{\\alpha} $\n",
+ "\n",
+ "2. $- \\infty \\ <\\ \\overline{X}\\ - \\overline{Y}\\ <\\ Z_{\\alpha} {\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + (\\frac{\\sigma^2_y}{n_y})}}$\n",
+ "\n",
+ "3. P_value $ = P(Z \\geq Z_0) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "Yo2g8fAeHYtS"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Left-tailed test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_{n_x}$ is a sample of size $n_x$ from a normal distribution having an unknown mean $\\mu_x$ and a known variance $\\sigma^2_x$. \n",
+ "\n",
+ "$Y_1, Y_2, ..., Y_{n_y}$ is a sample of size $n_y$ from a normal distribution having an unknown mean $\\mu_y$ and a known variance $\\sigma^2_y$.\n",
+ "\n",
+ "$X_1, X_2, ..., X_{n_x} \\sim N( \\mu_x, \\sigma^2_x)$\n",
+ "\n",
+ "$Y_1, Y_2, ..., Y_{n_y} \\sim N( \\mu_y, \\sigma^2_y)$\n",
+ "\n",
+ "$H_0:\\ \\mu_x = \\mu_y \\quad or \\quad H_0:\\ \\mu_x \\geq \\mu_y$\n",
+ "\n",
+ "$H_1:\\ \\mu_x < \\mu_y$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$Z_0 \\equiv \\frac{\\overline{X}-\\overline{Y} - 0}{\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + (\\frac{\\sigma^2_y}{n_y})}}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $-\\ Z_{\\alpha}\\ <\\ Z_0 = \\frac{\\overline{X}-\\overline{Y} - 0}{\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + (\\frac{\\sigma^2_y}{n_y})}} <\\ \\infty $\n",
+ "\n",
+ "2. $\\ -Z_{\\alpha} {\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + (\\frac{\\sigma^2_y}{n_y})}}\\ <\\ \\overline{X} - \\overline{Y}\\ <\\ \\infty$\n",
+ "\n",
+ "3. P_value $ = P(Z \\leq Z_0) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "nt2bxi_dHYtU"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "If you want to test the hypothesis below, only the test statistic changes. The other equations are the same. \n",
+ "\n",
+ "$H_0:\\ \\mu_x - k\\ \\mu_y = d $\n",
+ "\n",
+ "$H_1:\\ \\mu_x - k\\ \\mu_y \\neq d$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$Z_0 \\equiv \\frac{\\overline{X}-\\ k\\ \\overline{Y} -\\ d}{\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + k^2(\\frac{\\sigma^2_y}{n_y})}}$"
+ ],
+ "metadata": {
+ "id": "28hPv38GQIhB"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "class equal_means_with_known_variances:\n",
+ "    \"\"\"\n",
+ "    Z-test of H0: mean1 - k*mean2 = diff_mean_null for two normal populations with known variances.\n",
+ "    The statistic and P_value are printed and also kept as `test_statistic` and `p_value`.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    diff_mean_null : difference between the means of the two populations in the null hypothesis\n",
+ "    population_v1 : known variance of the population1, must be positive\n",
+ "    population_v2 : known variance of the population2, must be positive\n",
+ "    n1 : optional, number of sample1 members, must be positive when data1 is not passed\n",
+ "    n2 : optional, number of sample2 members, must be positive when data2 is not passed\n",
+ "    alpha : significance level\n",
+ "    type_t : 'two_tailed', 'right_tailed', 'left_tailed' (anything else raises ValueError)\n",
+ "    Sample_mean1 : optional, mean of the sample1\n",
+ "    Sample_mean2 : optional, mean of the sample2\n",
+ "    k : optional, multiplier of the second mean in the hypotheses (1 tests the plain difference)\n",
+ "    data1 : optional, if you do not know the sample mean1, just pass the first sample\n",
+ "    data2 : optional, if you do not know the sample mean2, just pass the second sample\n",
+ "    \"\"\"\n",
+ "    def __init__(self, diff_mean_null, population_v1, population_v2, alpha, type_t, k = 1,\n",
+ "                 n1 = 0., n2 = 0., Sample_mean1 = 0., Sample_mean2 = 0., data1=None, data2=None):\n",
+ "        if type_t not in ('two_tailed', 'right_tailed', 'left_tailed'):\n",
+ "            raise ValueError(f'type_t must be two_tailed, right_tailed or left_tailed, got {type_t!r}')\n",
+ "        # Materialise each sample once: a one-shot iterable would be empty on a second pass.\n",
+ "        if data1 is not None:\n",
+ "            sample1 = list(data1)\n",
+ "            Sample_mean1, n1 = (np.mean(sample1) if sample1 else 0.), len(sample1)\n",
+ "        if data2 is not None:\n",
+ "            sample2 = list(data2)\n",
+ "            Sample_mean2, n2 = (np.mean(sample2) if sample2 else 0.), len(sample2)\n",
+ "        # Otherwise the statistic degenerates to 0, inf or nan and the verdict is meaningless.\n",
+ "        if min(n1, n2) <= 0 or min(population_v1, population_v2) <= 0:\n",
+ "            raise ValueError('both sample sizes and both population variances must be positive')\n",
+ "        self.Sample_mean1, self.Sample_mean2 = Sample_mean1, Sample_mean2\n",
+ "        self.population_v1, self.population_v2 = population_v1, population_v2\n",
+ "        self.n1, self.n2 = n1, n2\n",
+ "        self.data1, self.data2 = data1, data2\n",
+ "        self.diff_mean_null, self.k = diff_mean_null, k\n",
+ "        self.type_t, self.alpha = type_t, alpha\n",
+ "        self.__test()\n",
+ "\n",
+ "    def __test(self):\n",
+ "        # Var(mean1 - k*mean2) = v1/n1 + k^2 * v2/n2, hence the k**2 in the denominator.\n",
+ "        test_statistic = (self.Sample_mean1-(self.k)*self.Sample_mean2-self.diff_mean_null) / (np.sqrt((self.population_v1/ self.n1) + (self.k**2)*(self.population_v2/ self.n2)))\n",
+ "        if self.type_t == 'two_tailed':\n",
+ "            p_value = 2*(1-norm.cdf(abs(test_statistic)))\n",
+ "        elif self.type_t == 'left_tailed':\n",
+ "            p_value = (norm.cdf(test_statistic))\n",
+ "        else:\n",
+ "            p_value = (1-norm.cdf(test_statistic))\n",
+ "        self.test_statistic, self.p_value = test_statistic, p_value\n",
+ "        print('test_statistic:', test_statistic, '\\n')\n",
+ "        print('P_value:', p_value, '\\n')\n",
+ "        if p_value < self.alpha:\n",
+ "            print(f'Since p_value < {self.alpha}, reject null hypothesis.')\n",
+ "        else:\n",
+ "            print(f'Since p_value > {self.alpha}, the null hypothesis cannot be rejected.')"
+ ],
+ "metadata": {
+ "id": "l1L5_GNiHaUN"
+ },
+ "execution_count": 9,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "data1 = np.array([61.1,58.2,62.3,64,59.7,66.2,57.8,61.4,62.2,63.6]) * 100\n",
+ "data2 = np.array([62.2,56.6,66.4,56.2,57.4,58.4,57.6,65.4]) * 100\n",
+ "data1.mean(), data2.mean()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "UKOU5aMAT23t",
+ "outputId": "eadf5b16-ae41-4f48-9d12-a8f75f742d50"
+ },
+ "execution_count": 10,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "(6165.0, 6002.5)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 10
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "equal_means_with_known_variances(diff_mean_null = 0, population_v1 = 4000**2, population_v2 = 6000**2, alpha = 0.05, type_t = 'two_tailed',\n",
+ "                                 k = 1, n1 = 10, n2 = 8, Sample_mean1 = 6165, Sample_mean2 = 6002.5);"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "OrYN40CmRS3V",
+ "outputId": "b5e6a3c4-24ba-4c03-ebd3-168b1f5517c7"
+ },
+ "execution_count": 11,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "test_statistic: 0.06579432682703693 \n",
+ "\n",
+ "P_value: 0.947541572987298 \n",
+ "\n",
+ "Since p_value > 0.05, the null hypothesis cannot be rejected.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "equal_means_with_known_variances(diff_mean_null = 0, population_v1 = 4000**2, population_v2 = 6000**2, alpha = 0.05,\n",
+ "                                 type_t = 'two_tailed', k = 1, data1 = data1, data2 = data2);"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "OAA0YmNPTFuf",
+ "outputId": "ab4e8374-81c7-43df-c77a-24d5193c5620"
+ },
+ "execution_count": 12,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "test_statistic: 0.06579432682703693 \n",
+ "\n",
+ "P_value: 0.947541572987298 \n",
+ "\n",
+ "Since p_value > 0.05, the null hypothesis cannot be rejected.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "\n",
+ "### **4.3.2. Unknown but Equal Variances:**"
+ ],
+ "metadata": {
+ "id": "rkSluAxdVb1M"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**A. Two Tailed Test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_{n_x}$ is a sample of size $n_x$ from a normal distribution having an unknown mean $\\mu_x$ and an unknown variance $\\sigma^2_x$. \n",
+ "\n",
+ "$Y_1, Y_2, ..., Y_{n_y}$ is a sample of size $n_y$ from a normal distribution having an unknown mean $\\mu_y$ and an unknown variance $\\sigma^2_y$.\n",
+ "\n",
+ "$X_1, X_2, ..., X_{n_x} \\sim N( \\mu_x, \\sigma^2_x)$\n",
+ "\n",
+ "$Y_1, Y_2, ..., Y_{n_y} \\sim N( \\mu_y, \\sigma^2_y)$\n",
+ "\n",
+ "$H_0:\\ \\mu_x = \\mu_y$\n",
+ "\n",
+ "$H_1:\\ \\mu_x \\neq \\mu_y$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$t_0 \\equiv \\frac{\\overline{X}-\\ \\overline{Y}\\ -\\ 0}{{S_p} \\sqrt{\\frac{1}{n_x} + \\frac{1}{n_y}}} \\sim t_{{n_x}+{n_y}-2}$\n",
+ "\n",
+ "$S_p^2 = \\frac{(n_x-1)S_x^2 + (n_y-1)S_y^2}{n_x+n_y-2}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $-\\ t_{\\frac{\\alpha}{2},{n_x} + {n_y}-2}\\ <\\ t_0 = \\frac{\\overline{X}-\\ \\overline{Y}\\ -\\ 0}{{S_p} \\sqrt{\\frac{1}{n_x} + \\frac{1}{n_y}}} <\\ t_{\\frac{\\alpha}{2},{n_x} + {n_y}-2}$\n",
+ "\n",
+ "2. $-\\ t_{\\frac{\\alpha}{2},{n_x} + {n_y}-2} {{S_p} \\sqrt{\\frac{1}{n_x} + \\frac{1}{n_y}}}\\ <\\ \\overline{X}\\ - \\overline{Y}\\ <\\ t_{\\frac{\\alpha}{2},{n_x} + {n_y}-2} {{S_p} \\sqrt{\\frac{1}{n_x} + \\frac{1}{n_y}}}$\n",
+ "\n",
+ "3. $|\\overline{X}-\\overline{Y}| < t_{\\frac{\\alpha}{2},{n_x} + {n_y}-2} {{S_p} \\sqrt{\\frac{1}{n_x} + \\frac{1}{n_y}}}$\n",
+ "\n",
+ "4. P_value $ = 2 \\times P(t_{{n_x} + {n_y}-2} \\geq |t_0|) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "fE5B1uXoVyjf"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**B. One-sided tests:**\n",
+ "\n",
+ "**Right-tailed test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_{n_x}$ is a sample of size $n_x$ from a normal distribution having an unknown mean $\\mu_x$ and an unknown variance $\\sigma^2_x$. \n",
+ "\n",
+ "$Y_1, Y_2, ..., Y_{n_y}$ is a sample of size $n_y$ from a normal distribution having an unknown mean $\\mu_y$ and an unknown variance $\\sigma^2_y$.\n",
+ "\n",
+ "$X_1, X_2, ..., X_{n_x} \\sim N( \\mu_x, \\sigma^2_x)$\n",
+ "\n",
+ "$Y_1, Y_2, ..., Y_{n_y} \\sim N( \\mu_y, \\sigma^2_y)$\n",
+ "\n",
+ "$H_0:\\ \\mu_x = \\mu_y\\ \\quad or \\quad H_0:\\ \\mu_x \\leq \\mu_y$\n",
+ "\n",
+ "$H_1:\\ \\mu_x > \\mu_y$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$t_0 \\equiv \\frac{\\overline{X}-\\ \\overline{Y}\\ -\\ 0}{{S_p} \\sqrt{\\frac{1}{n_x} + \\frac{1}{n_y}}} \\sim t_{{n_x}+{n_y}-2}$\n",
+ "\n",
+ "$S_p^2 = \\frac{(n_x-1)S_x^2 + (n_y-1)S_y^2}{n_x+n_y-2}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $ -\\infty <\\ t_0 = \\frac{\\overline{X}-\\ \\overline{Y}\\ -\\ 0}{{S_p} \\sqrt{\\frac{1}{n_x} + \\frac{1}{n_y}}}\\ <\\ t_{{\\alpha},{n_x} + {n_y}-2} $\n",
+ "\n",
+ "2. $- \\infty \\ <\\ \\overline{X}\\ - \\overline{Y}\\ <\\ t_{{\\alpha},{n_x} + {n_y}-2} {{S_p} \\sqrt{\\frac{1}{n_x} + \\frac{1}{n_y}}}$\n",
+ "\n",
+ "3. P_value $ = P(t_{{n_x} + {n_y}-2} \\geq t_0) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "h4g05pIyafSJ"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Left-tailed test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_{n_x}$ is a sample of size $n_x$ from a normal distribution having an unknown mean $\\mu_x$ and an unknown variance $\\sigma^2_x$. \n",
+ "\n",
+ "$Y_1, Y_2, ..., Y_{n_y}$ is a sample of size $n_y$ from a normal distribution having an unknown mean $\\mu_y$ and an unknown variance $\\sigma^2_y$.\n",
+ "\n",
+ "$X_1, X_2, ..., X_{n_x} \\sim N( \\mu_x, \\sigma^2_x)$\n",
+ "\n",
+ "$Y_1, Y_2, ..., Y_{n_y} \\sim N( \\mu_y, \\sigma^2_y)$\n",
+ "\n",
+ "$H_0:\\ \\mu_x = \\mu_y \\quad or \\quad H_0:\\ \\mu_x \\geq \\mu_y$\n",
+ "\n",
+ "$H_1:\\ \\mu_x < \\mu_y$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$t_0 \\equiv \\frac{\\overline{X}-\\ \\overline{Y}\\ -\\ 0}{{S_p} \\sqrt{\\frac{1}{n_x} + \\frac{1}{n_y}}} \\sim t_{{n_x}+{n_y}-2}$\n",
+ "\n",
+ "$S_p^2 = \\frac{(n_x-1)S_x^2 + (n_y-1)S_y^2}{n_x+n_y-2}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $-\\ t_{{\\alpha},{n_x} + {n_y}-2} <\\ t_0 = \\frac{\\overline{X}-\\ \\overline{Y}\\ -\\ 0}{{S_p} \\sqrt{\\frac{1}{n_x} + \\frac{1}{n_y}}} <\\ \\infty $\n",
+ "\n",
+ "2. $\\ -t_{{\\alpha},{n_x} + {n_y}-2} {{S_p} \\sqrt{\\frac{1}{n_x} + \\frac{1}{n_y}}}\\ <\\ \\overline{X} - \\overline{Y}\\ <\\ \\infty$\n",
+ "\n",
+ "3. P_value $ = P(t_{{n_x} + {n_y}-2} \\leq t_0) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "8CjR8DPccmoH"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "class equal_means_with_unknown_variances:\n",
+ "    \"\"\"\n",
+ "    Pooled-variance t-test for the difference between the means of two samples.\n",
+ "\n",
+ "    The test runs as soon as the object is created and prints the test statistic,\n",
+ "    the p-value and the decision at significance level alpha.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    diff_mean_null : difference between the two population means under the null hypothesis\n",
+ "    alpha : significance level\n",
+ "    type_t : 'two_tailed' or 'left_tailed'; any other value is treated as right-tailed\n",
+ "    n1, n2 : optional, sizes of sample1 and sample2\n",
+ "    S1, S2 : optional, standard deviations of sample1 and sample2\n",
+ "    Sample_mean1, Sample_mean2 : optional, means of sample1 and sample2\n",
+ "    data1 : optional, first sample; when given, Sample_mean1, S1 (ddof=1) and n1 are computed from it\n",
+ "    data2 : optional, second sample; when given, Sample_mean2, S2 (ddof=1) and n2 are computed from it\n",
+ "    \"\"\"\n",
+ "    def __init__(self, diff_mean_null, alpha, type_t, n1 = 0., n2 = 0., S1 = 0., S2 = 0.,\n",
+ "                 Sample_mean1 = 0., Sample_mean2 = 0., data1=None, data2=None):\n",
+ "        self.diff_mean_null = diff_mean_null\n",
+ "        self.alpha = alpha\n",
+ "        self.type_t = type_t\n",
+ "        self.Sample_mean1, self.Sample_mean2 = Sample_mean1, Sample_mean2\n",
+ "        self.S1, self.S2 = S1, S2\n",
+ "        self.n1, self.n2 = n1, n2\n",
+ "        self.data1, self.data2 = data1, data2\n",
+ "        self.SP2 = None\n",
+ "\n",
+ "        # Raw samples, when supplied, replace the summary arguments.\n",
+ "        if data1 is not None:\n",
+ "            self.Sample_mean1 = np.mean(list(data1))\n",
+ "            self.S1 = np.std(list(data1), ddof=1)\n",
+ "            self.n1 = len(list(data1))\n",
+ "        if data2 is not None:\n",
+ "            self.Sample_mean2 = np.mean(list(data2))\n",
+ "            self.S2 = np.std(data2, ddof=1)\n",
+ "            self.n2 = len(list(data2))\n",
+ "\n",
+ "        # Pooled variance: the two sample variances weighted by their degrees of freedom.\n",
+ "        weighted_sum = (self.n1-1)*(self.S1**2) + (self.n2-1)*(self.S2**2)\n",
+ "        self.SP2 = weighted_sum / (self.n1+self.n2-2)\n",
+ "\n",
+ "        self.__test()\n",
+ "\n",
+ "    def __test(self):\n",
+ "        \"\"\"Print the t statistic, its p-value for the chosen tail, and the decision.\"\"\"\n",
+ "        dof = self.n1+self.n2-2\n",
+ "        mean_gap = self.Sample_mean1-self.Sample_mean2-self.diff_mean_null\n",
+ "        test_statistic = mean_gap / (np.sqrt(self.SP2)*np.sqrt(1/self.n1+1/self.n2))\n",
+ "        print('test_statistic:', test_statistic, '\\n')\n",
+ "\n",
+ "        if self.type_t == 'two_tailed':\n",
+ "            p_value = 2*(1-t.cdf(abs(test_statistic), df = dof))\n",
+ "        elif self.type_t == 'left_tailed':\n",
+ "            p_value = (t.cdf(test_statistic, df = dof))\n",
+ "        else:\n",
+ "            p_value = 1-t.cdf(test_statistic, df = dof)\n",
+ "        print('P_value:', p_value, '\\n')\n",
+ "\n",
+ "        if p_value < self.alpha:\n",
+ "            message = f'Since p_value < {self.alpha}, reject null hypothesis.'\n",
+ "        else:\n",
+ "            message = f'Since p_value > {self.alpha}, the null hypothesis cannot be rejected.'\n",
+ "        print(message)"
+ ],
+ "metadata": {
+ "id": "IVC_vkmRaUZJ"
+ },
+ "execution_count": 13,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Left-tailed test from summary statistics; S1 and S2 are passed as square roots of 0.581 and 0.778.\n",
+ "equal_means_with_unknown_variances(diff_mean_null = 0, S1 = np.sqrt(0.581), S2 = np.sqrt(0.778), n1 = 10, n2 = 12,\n",
+ " alpha = 0.05, type_t = 'left_tailed', Sample_mean1 = 6.45, Sample_mean2 = 7.125);"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "6-VjkWzZswic",
+ "outputId": "64e96e74-d6b4-4f2c-9efd-2a9f085fae63"
+ },
+ "execution_count": 14,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "test_statistic: -1.8987297952365871 \n",
+ "\n",
+ "P_value: 0.03606193933099178 \n",
+ "\n",
+ "Since p_value < 0.05, reject null hypothesis.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Raw samples for the example; the last line displays their sample means.\n",
+ "data1 = [5.5,6,7,6,7.5,6,7.5,5.5,7,6.5]\n",
+ "data2 = [6.5,6,8.5,7,6.5,8,7.5,6.5,7.5,6,8.5,7]\n",
+ "np.mean(data1), np.mean(data2)"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "4GVzWRqD1e-g",
+ "outputId": "5538346f-fa85-4373-c67b-2db8b0fbbb27"
+ },
+ "execution_count": 15,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "(6.45, 7.125)"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 15
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Left-tailed test computed from the raw samples (means, standard deviations and sizes come from the data).\n",
+ "equal_means_with_unknown_variances(diff_mean_null = 0, alpha = 0.05, type_t = 'left_tailed', data1 = data1, data2 = data2);"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "46MLRnZBwKt4",
+ "outputId": "44eafe71-ea78-4856-b6b8-adc3b63ecaa6"
+ },
+ "execution_count": 16,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "test_statistic: -1.8986953664603443 \n",
+ "\n",
+ "P_value: 0.03606431976959166 \n",
+ "\n",
+ "Since p_value < 0.05, reject null hypothesis.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "You can also do this test using scipy.\n",
+ "\n",
+ "`scipy.stats.ttest_ind` calculates the T-test for the means of two independent samples.\n",
+ "\n",
+ "[Doc](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html)"
+ ],
+ "metadata": {
+ "id": "Er9iBrjkOvUE"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Cross-check with ttest_ind on the same data1/data2; alternative='less' reproduces the 'left_tailed' run above.\n",
+ "alpha = 0.05\n",
+ "Test_statistic, p_value = ttest_ind(data1, data2, alternative='less')\n",
+ "\n",
+ "print(f'Test_statistic = {Test_statistic}, p_value = {p_value}', '\\n')\n",
+ "\n",
+ "if p_value < alpha:\n",
+ " print(f'Since p_value < {alpha}, reject null hypothesis.')\n",
+ "else:\n",
+ " print(f'Since p_value > {alpha}, the null hypothesis cannot be rejected.')"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "a5AQocXrN7ix",
+ "outputId": "7109f06e-b519-43cd-eb1a-02e574d67db3"
+ },
+ "execution_count": 17,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Test_statistic = -1.8986953664603443, p_value = 0.03606431976959166 \n",
+ "\n",
+ "Since p_value < 0.05, reject null hypothesis.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "\n",
+ "## **4.4. Paired t-test:**"
+ ],
+ "metadata": {
+ "id": "Br8LlPdh4e9R"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**A. Two Tailed Test:**\n",
+ "\n",
+ "The data can be described by the $n$ pairs $(X_i, Y_i), i = 1, 2, ..., n$, where $X_i$ is the data before an action, and $Y_i$ is the respective data points after an action.\n",
+ "\n",
+ "It is important to note that we cannot treat $X_1, ... , X_n$ and $Y_1, ... , Y_n$ as being independent samples.\n",
+ "\n",
+ "$D_i = X_i − Y_i, \\quad i = 1, ... , n$\n",
+ "\n",
+ "$H_0 : \\mu_D = 0$\n",
+ "\n",
+ "$H_1 : \\mu_D \\neq 0$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$t_0 \\equiv \\frac{\\overline{D}\\ -\\ 0}{\\frac{S_D}{\\sqrt{n}}} \\sim t_{n-1}$\n",
+ "\n",
+ "$S_D = \\sqrt{\\frac{\\sum_{i=1}^n\\ (D_i\\ -\\ \\overline{D})^2}{n\\ -\\ 1}} \\quad \\quad \\overline{D} = \\frac{{\\sum_{i=1}^n\\ D_i}}{n}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $ -\\ t_{\\frac{{\\alpha}}{2},{n-1}} <\\ t_0 = \\frac{\\overline{D}\\ -\\ 0}{\\frac{S_D}{\\sqrt{n}}}\\ <\\ t_{\\frac{{\\alpha}}{2},{n-1}} $\n",
+ "\n",
+ "2. $- t_{\\frac{{\\alpha}}{2},{n-1}} \\frac{S_D}{\\sqrt{n}} \\ <\\ \\overline{D}\\ <\\ t_{\\frac{{\\alpha}}{2},{n-1}} \\frac{S_D}{\\sqrt{n}}$\n",
+ "\n",
+ "3. $| \\overline{D} |\\ <\\ t_{\\frac{{\\alpha}}{2},{n-1}} \\frac{S_D}{\\sqrt{n}}$\n",
+ "\n",
+ "4. P_value $ = 2 \\times P(t_{n-1} \\geq |t_0|) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "1mOZr_hk5i7m"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**B. One-sided tests:**\n",
+ "\n",
+ "**Right-tailed test:**\n",
+ "\n",
+ "The data can be described by the $n$ pairs $(X_i, Y_i), i = 1, 2, ..., n$, where $X_i$ is the data before an action, and $Y_i$ is the respective data points after an action.\n",
+ "\n",
+ "It is important to note that we cannot treat $X_1, ... , X_n$ and $Y_1, ... , Y_n$ as being independent samples.\n",
+ "\n",
+ "$D_i = X_i − Y_i, \\quad i = 1, ... , n$\n",
+ "\n",
+ "$H_0:\\ \\mu_D = 0\\ \\quad or \\quad H_0:\\ \\mu_D \\leq 0$\n",
+ "\n",
+ "$H_1 : \\mu_D > 0$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$t_0 \\equiv \\frac{\\overline{D}\\ -\\ 0}{\\frac{S_D}{\\sqrt{n}}} \\sim t_{n-1}$\n",
+ "\n",
+ "$S_D = \\sqrt{\\frac{\\sum_{i=1}^n\\ (D_i\\ -\\ \\overline{D})^2}{n\\ -\\ 1}} \\quad \\quad \\overline{D} = \\frac{{\\sum_{i=1}^n\\ D_i}}{n}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $ -\\infty <\\ t_0 = \\frac{\\overline{D}\\ -\\ 0}{\\frac{S_D}{\\sqrt{n}}}\\ <\\ t_{{\\alpha},{n-1}} $\n",
+ "\n",
+ "2. $- \\infty \\ <\\ \\overline{D}\\ <\\ t_{{\\alpha},{n-1}} {\\frac{S_D}{\\sqrt{n}}}$\n",
+ "\n",
+ "3. P_value $ = P(t_{n-1} \\geq t_0) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "Xmfb3znaIMuQ"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Left-tailed test:**\n",
+ "\n",
+ "The data can be described by the $n$ pairs $(X_i, Y_i), i = 1, 2, ..., n$, where $X_i$ is the data before an action, and $Y_i$ is the respective data points after an action.\n",
+ "\n",
+ "It is important to note that we cannot treat $X_1, ... , X_n$ and $Y_1, ... , Y_n$ as being independent samples.\n",
+ "\n",
+ "$D_i = X_i − Y_i, \\quad i = 1, ... , n$\n",
+ "\n",
+ "$H_0:\\ \\mu_D = 0\\ \\quad or \\quad H_0:\\ \\mu_D \\geq 0$\n",
+ "\n",
+ "$H_1 : \\mu_D < 0$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$t_0 \\equiv \\frac{\\overline{D}\\ -\\ 0}{\\frac{S_D}{\\sqrt{n}}} \\sim t_{n-1}$\n",
+ "\n",
+ "$S_D = \\sqrt{\\frac{\\sum_{i=1}^n\\ (D_i\\ -\\ \\overline{D})^2}{n\\ -\\ 1}} \\quad \\quad \\overline{D} = \\frac{{\\sum_{i=1}^n\\ D_i}}{n}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $-\\ t_{{\\alpha},{n-1}} <\\ t_0 = \\frac{\\overline{D}\\ -\\ 0}{\\frac{S_D}{\\sqrt{n}}} <\\ \\infty $\n",
+ "\n",
+ "2. $\\ -t_{{\\alpha},{n-1}} {\\frac{S_D}{\\sqrt{n}}}\\ <\\ \\overline{D}\\ <\\ \\infty$\n",
+ "\n",
+ "3. P_value $ = P(t_{n-1} \\leq t_0) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "5tBxt9m2JWKe"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "class paired_test:\n",
+ "    \"\"\"\n",
+ "    Paired t-test on the differences D = data1 - data2.\n",
+ "\n",
+ "    The test runs as soon as the object is created and prints the test statistic,\n",
+ "    the p-value and the decision at significance level alpha.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    null_mean : mean of the differences under the null hypothesis\n",
+ "    alpha : significance level\n",
+ "    type_t : 'two_tailed' or 'left_tailed'; any other value is treated as right-tailed\n",
+ "    n : optional, number of pairs (replaced by len(data1) when data1 is given)\n",
+ "    SD : optional, standard deviation of the differences\n",
+ "    D_bar : optional, mean of the differences\n",
+ "    data1 : optional, first measurement of each pair; when given, D_bar, SD (ddof=1) and n are computed\n",
+ "    data2 : optional, second measurement of each pair, subtracted element-wise from data1\n",
+ "    \"\"\"\n",
+ "    def __init__(self, null_mean, alpha, type_t, n = 0., SD = 0., D_bar = 0., data1=None, data2=None):\n",
+ "        self.null_mean = null_mean\n",
+ "        self.alpha = alpha\n",
+ "        self.type_t = type_t\n",
+ "        self.n = n\n",
+ "        self.SD = SD\n",
+ "        self.D_bar = D_bar\n",
+ "        self.data1, self.data2 = data1, data2\n",
+ "\n",
+ "        # Raw pairs, when supplied, replace the summary arguments.\n",
+ "        if data1 is not None:\n",
+ "            differences = list(np.subtract(data1, data2))\n",
+ "            self.D_bar = np.mean(differences)\n",
+ "            self.SD = np.std(differences, ddof=1)\n",
+ "            self.n = len(list(data1))\n",
+ "\n",
+ "        self.__test()\n",
+ "\n",
+ "    def __test(self):\n",
+ "        \"\"\"Print the t statistic, its p-value for the chosen tail, and the decision.\"\"\"\n",
+ "        dof = self.n-1\n",
+ "        test_statistic = (self.D_bar - self.null_mean) / (self.SD / np.sqrt(self.n))\n",
+ "        print('test_statistic:', test_statistic, '\\n')\n",
+ "\n",
+ "        if self.type_t == 'two_tailed':\n",
+ "            p_value = 2*(1-t.cdf(abs(test_statistic), df=dof))\n",
+ "        elif self.type_t == 'left_tailed':\n",
+ "            p_value = (t.cdf(test_statistic, df=dof))\n",
+ "        else:\n",
+ "            p_value = (1-t.cdf(test_statistic, df=dof))\n",
+ "        print('P_value:', p_value, '\\n')\n",
+ "\n",
+ "        if p_value < self.alpha:\n",
+ "            message = f'Since p_value < {self.alpha}, reject null hypothesis.'\n",
+ "        else:\n",
+ "            message = f'Since p_value > {self.alpha}, the null hypothesis cannot be rejected.'\n",
+ "        print(message)"
+ ],
+ "metadata": {
+ "id": "83eVKnVs4osA"
+ },
+ "execution_count": 18,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Two-tailed paired test on four pairs; n is recomputed from data1 inside the class.\n",
+ "data1 = [7,6,10,16]\n",
+ "data2 = [9,10,14,18]\n",
+ "paired_test(null_mean = 0, n = 4, alpha = 0.05, type_t = 'two_tailed', data1=data1, data2=data2);"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "L2HKLRMDAbat",
+ "outputId": "2d5f95a7-b8fe-4fa0-d044-0012b605a518"
+ },
+ "execution_count": 19,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "test_statistic: -5.196152422706632 \n",
+ "\n",
+ "P_value: 0.013846832988859026 \n",
+ "\n",
+ "Since p_value < 0.05, reject null hypothesis.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Right-tailed paired test on ten pairs (alternative: mean of data1 - data2 is greater than 0).\n",
+ "data1 = [30.5,18.5,24.5,32,16,15,23.5,25.5,28,18]\n",
+ "data2 = [23,21,22,28.5,14.5,15.5,24.5,21,23.5,16.5]\n",
+ "paired_test(null_mean = 0, n = 10, alpha = 0.05, type_t = 'right_tailed', data1=data1, data2=data2);"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "vOK3B_4aHDH-",
+ "outputId": "39336454-bb13-49f7-b03f-59461ad081b3"
+ },
+ "execution_count": 20,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "test_statistic: 2.2659493332258522 \n",
+ "\n",
+ "P_value: 0.02484551530199397 \n",
+ "\n",
+ "Since p_value < 0.05, reject null hypothesis.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "You can also do this test using scipy.\n",
+ "\n",
+ "`scipy.stats.ttest_rel` calculates the t-test on two related (paired) samples.\n",
+ "\n",
+ "[Doc](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html)"
+ ],
+ "metadata": {
+ "id": "1M5Wf-TFPzrU"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Cross-check with ttest_rel on the same data1/data2; alternative='greater' reproduces the 'right_tailed' run above.\n",
+ "alpha = 0.05\n",
+ "Test_statistic, p_value = ttest_rel(data1, data2, alternative='greater')\n",
+ "\n",
+ "print(f'Test_statistic = {Test_statistic}, p_value = {p_value}', '\\n')\n",
+ "\n",
+ "if p_value < alpha:\n",
+ " print(f'Since p_value < {alpha}, reject null hypothesis.')\n",
+ "else:\n",
+ " print(f'Since p_value > {alpha}, the null hypothesis cannot be rejected.')"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "P8Xf8chpP0Xx",
+ "outputId": "30801515-d83e-4421-db52-667f4a013213"
+ },
+ "execution_count": 21,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "Test_statistic = 2.2659493332258522, p_value = 0.024845515301993935 \n",
+ "\n",
+ "Since p_value < 0.05, reject null hypothesis.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "\n",
+ "## **4.5. Test Concerning the Variance of a Normal Population:**"
+ ],
+ "metadata": {
+ "id": "ZJbTadX1NnaL"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**A. Two Tailed Test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ from a normal distribution having an unknown mean $\\mu$ and an unknown variance $\\sigma^2$.\n",
+ "\n",
+ "$X_1, X_2, ..., X_n \\sim N( \\mu, \\sigma^2)$\n",
+ "\n",
+ "$H_0:\\ \\sigma^2 = \\sigma_0^2$\n",
+ "\n",
+ "$H_1:\\ \\sigma^2 \\neq \\sigma_0^2$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$ \\chi^2_0 \\equiv \\frac{(n-1)\\ S^2}{\\sigma_0^2}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $\\ \\chi^2_{1-\\frac{\\alpha}{2}, n-1}\\ <\\ \\chi^2_0 = \\frac{(n-1)\\ S^2}{\\sigma_0^2} \\ <\\chi^2_{\\frac{\\alpha}{2}, n-1} $\n",
+ "\n",
+ "2. $\\sigma_0^2\\ \\frac{\\chi^2_{1-\\frac{\\alpha}{2}, n-1}}{n-1} <\\ S^2 <\\ \\sigma_0^2\\ \\frac{\\chi^2_{\\frac{\\alpha}{2}, n-1}}{n-1}$\n",
+ "\n",
+ "3. P_value $ = 2 \\times Min\\ ([P(\\chi^2_{n-1} < \\chi^2_0)],\\ [1-P(\\chi^2_{n-1}< \\chi^2_0)]) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "Uk4LjdzUWEG2"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**B. One-sided tests:**\n",
+ "\n",
+ "**Right-tailed test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ from a normal distribution having an unknown mean $\\mu$ and an unknown variance $\\sigma^2$.\n",
+ "\n",
+ "$X_1, X_2, ..., X_n \\sim N( \\mu, \\sigma^2)$\n",
+ "\n",
+ "$H_0:\\ \\sigma^2 = \\sigma_0^2\\ \\quad or \\quad H_0:\\ \\sigma^2 \\leq \\sigma_0^2$\n",
+ "\n",
+ "$H_1:\\ \\sigma^2 > \\sigma_0^2$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$ \\chi^2_0 \\equiv \\frac{(n-1)\\ S^2}{\\sigma_0^2}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $ 0 <\\ \\chi^2_0 = \\frac{(n-1)\\ S^2}{\\sigma_0^2}\\ <\\ \\chi^2_{{\\alpha}, n-1}$\n",
+ "\n",
+ "2. $0 \\ <\\ S^2\\ <\\ \\frac{\\chi^2_{{\\alpha}, n-1}\\ \\sigma_0^2}{n-1} $\n",
+ "\n",
+ "3. P_value $ = P(\\chi^2_{n-1} \\geq \\chi^2_0) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "q4IQUnEmbd-A"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**B. One-sided tests:**\n",
+ "\n",
+ "**Left-tailed test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ from a normal distribution having an unknown mean $\\mu$ and an unknown variance $\\sigma^2$.\n",
+ "\n",
+ "$X_1, X_2, ..., X_n \\sim N( \\mu, \\sigma^2)$\n",
+ "\n",
+ "$H_0:\\ \\sigma^2 = \\sigma_0^2\\ \\quad or \\quad H_0:\\ \\sigma^2 \\geq \\sigma_0^2$\n",
+ "\n",
+ "$H_1:\\ \\sigma^2 < \\sigma_0^2$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$ \\chi^2_0 \\equiv \\frac{(n-1)\\ S^2}{\\sigma_0^2}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $ \\chi^2_{1-{\\alpha}, n-1} <\\ \\chi^2_0 = \\frac{(n-1)\\ S^2}{\\sigma_0^2} <\\ \\infty$\n",
+ "\n",
+ "2. $\\frac{\\chi^2_{1-{\\alpha}, n-1}\\ \\sigma_0^2}{n-1} \\ <\\ S^2\\ <\\ \\infty $\n",
+ "\n",
+ "3. P_value $ = P(\\chi^2_{n-1} \\leq \\chi^2_0) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "ROJhLnQMdknc"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "class variance_normal:\n",
+ "    \"\"\"\n",
+ "    Chi-square test for the variance of a single sample.\n",
+ "\n",
+ "    The test runs as soon as the object is created and prints the test statistic,\n",
+ "    the p-value and the decision at significance level alpha.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    null_v : v0, the variance under the null hypothesis\n",
+ "    alpha : significance level\n",
+ "    type_t : 'two_tailed' or 'left_tailed'; any other value is treated as right-tailed\n",
+ "    n : optional, number of sample members\n",
+ "    S2 : optional, sample variance\n",
+ "    data : optional, if you do not know S2, just pass the data; S2 (ddof=1) and n are computed from it\n",
+ "    \"\"\"\n",
+ "    def __init__(self, null_v, alpha, type_t, n = 0., S2 = 0., data=None):\n",
+ "        self.null_v = null_v\n",
+ "        self.alpha = alpha\n",
+ "        self.type_t = type_t\n",
+ "        self.n = n\n",
+ "        self.S2 = S2\n",
+ "        self.data = data\n",
+ "\n",
+ "        # A raw sample, when supplied, replaces the summary arguments.\n",
+ "        if data is not None:\n",
+ "            self.S2 = np.std(list(data), ddof=1)**2\n",
+ "            self.n = len(list(data))\n",
+ "\n",
+ "        self.__test()\n",
+ "\n",
+ "    def __test(self):\n",
+ "        \"\"\"Print the chi-square statistic, its p-value for the chosen tail, and the decision.\"\"\"\n",
+ "        dof = self.n-1\n",
+ "        test_statistic = (dof*(self.S2)) / self.null_v\n",
+ "        print('test_statistic:', test_statistic, '\\n')\n",
+ "\n",
+ "        if self.type_t == 'two_tailed':\n",
+ "            # Twice the smaller of the two tail areas.\n",
+ "            upper_tail = 1-chi2.cdf(test_statistic, df=dof)\n",
+ "            lower_tail = chi2.cdf(test_statistic, df=dof)\n",
+ "            p_value = 2*np.min([upper_tail, lower_tail])\n",
+ "        elif self.type_t == 'left_tailed':\n",
+ "            p_value = (chi2.cdf(test_statistic, df=dof))\n",
+ "        else:\n",
+ "            p_value = (1-chi2.cdf(test_statistic, df=dof))\n",
+ "        print('P_value:', p_value, '\\n')\n",
+ "\n",
+ "        if p_value < self.alpha:\n",
+ "            message = f'Since p_value < {self.alpha}, reject null hypothesis.'\n",
+ "        else:\n",
+ "            message = f'Since p_value > {self.alpha}, the null hypothesis cannot be rejected.'\n",
+ "        print(message)"
+ ],
+ "metadata": {
+ "id": "lectCJy-L6Op"
+ },
+ "execution_count": 22,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Right-tailed variance test from summary statistics (n and S2 are passed directly).\n",
+ "variance_normal(null_v = 0.15**2, n = 20, alpha = 0.05, type_t = 'right_tailed', S2 = 0.025);"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "TOr9beSdkOXv",
+ "outputId": "c4a14497-a9ce-4d5f-c1e3-5c3df2dd6fbc"
+ },
+ "execution_count": 23,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "test_statistic: 21.111111111111114 \n",
+ "\n",
+ "P_value: 0.33069403418551535 \n",
+ "\n",
+ "Since p_value > 0.05, the null hypothesis cannot be rejected.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Two-tailed variance test computed from the raw sample (S2 and n come from the data).\n",
+ "data = [11,10,9,10,10,11,11,10,12,9,7,9,11,10,11]\n",
+ "variance_normal(null_v = 2**2, alpha = 0.05, type_t = 'two_tailed', data = data);"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "7CqUa66upMdN",
+ "outputId": "639a3abf-ac5e-48f1-8518-bb00ee03db9b"
+ },
+ "execution_count": 24,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "test_statistic: 5.233333333333333 \n",
+ "\n",
+ "P_value: 0.035414043881992714 \n",
+ "\n",
+ "Since p_value < 0.05, reject null hypothesis.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "\n",
+ "## **4.6. Test Concerning the Equality of Variances of Two Normal Populations:**"
+ ],
+ "metadata": {
+ "id": "a6nq2C5NlRKo"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**A. Two Tailed Test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_{n_x}$ is a sample of size $n_x$ from a normal distribution having an unknown mean $\\mu_x$ and an unknown variance $\\sigma^2_x$. \n",
+ "\n",
+ "$Y_1, Y_2, ..., Y_{n_y}$ is a sample of size $n_y$ from a normal distribution having an unknown mean $\\mu_y$ and an unknown variance $\\sigma^2_y$.\n",
+ "\n",
+ "$X_1, X_2, ..., X_n \\sim N( \\mu_x, \\sigma^2_x)$\n",
+ "\n",
+ "$Y_1, Y_2, ..., Y_n \\sim N( \\mu_y, \\sigma^2_y)$\n",
+ "\n",
+ "$H_0:\\ \\sigma_x^2 / \\sigma_y^2 = 1$\n",
+ "\n",
+ "$H_1:\\ \\sigma_x^2 / \\sigma_y^2 \\neq 1$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$ F_0 \\equiv \\frac{S^2_x}{S^2_y}$\n",
+ "\n",
+ "$S^2_x = \\frac{\\sum_{i=1}^{n_x}\\ (x_i\\ -\\ \\overline{x})^2}{{n_x}-1}$\n",
+ "\n",
+ "$S^2_y = \\frac{\\sum_{i=1}^{n_y}\\ (y_i\\ -\\ \\overline{y})^2}{{n_y}-1}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $F_{{1-\\frac{\\alpha}{2}}, n_x-1, n_y-1}\\ <\\ F_0 = \\frac{S^2_x}{S^2_y}\\ < F_{{\\frac{\\alpha}{2}}, n_x-1, n_y-1}$\n",
+ "\n",
+ "2. P_value $ = 2 \\times Min\\ ([P(F_{n_x-1,n_y-1} < F_0)],\\ [1-P(F_{n_x-1,n_y-1}< F_0)]) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "pdlDGLQg6eYA"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**B. One-sided tests:**\n",
+ "\n",
+ "**Right-tailed test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_{n_x}$ is a sample of size $n_x$ from a normal distribution having an unknown mean $\\mu_x$ and an unknown variance $\\sigma^2_x$. \n",
+ "\n",
+ "$Y_1, Y_2, ..., Y_{n_y}$ is a sample of size $n_y$ from a normal distribution having an unknown mean $\\mu_y$ and an unknown variance $\\sigma^2_y$.\n",
+ "\n",
+ "$X_1, X_2, ..., X_n \\sim N( \\mu_x, \\sigma^2_x)$\n",
+ "\n",
+ "$Y_1, Y_2, ..., Y_n \\sim N( \\mu_y, \\sigma^2_y)$\n",
+ "\n",
+ "$H_0:\\ \\sigma_x^2 / \\sigma_y^2 = 1 \\quad or \\quad \\sigma_x^2 / \\sigma_y^2 \\leq 1$\n",
+ "\n",
+ "$H_1:\\ \\sigma_x^2 / \\sigma_y^2 > 1$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$ F_0 \\equiv \\frac{S^2_x}{S^2_y}$\n",
+ "\n",
+ "$S^2_x = \\frac{\\sum_{i=1}^{n_x}\\ (x_i\\ -\\ \\overline{x})^2}{{n_x}-1}$\n",
+ "\n",
+ "$S^2_y = \\frac{\\sum_{i=1}^{n_y}\\ (y_i\\ -\\ \\overline{y})^2}{{n_y}-1}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $0\\ <\\ F_0 = \\frac{S^2_x}{S^2_y}\\ < F_{{\\alpha}, n_x-1, n_y-1}$\n",
+ "\n",
+ "2. P_value $ = P(F_{n_x-1,n_y-1} \\geq F_0) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "wIkaIsU9-gBg"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Left-tailed test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_{n_x}$ is a sample of size $n_x$ from a normal distribution having an unknown mean $\\mu_x$ and an unknown variance $\\sigma^2_x$. \n",
+ "\n",
+ "$Y_1, Y_2, ..., Y_{n_y}$ is a sample of size $n_y$ from a normal distribution having an unknown mean $\\mu_y$ and an unknown variance $\\sigma^2_y$.\n",
+ "\n",
+ "$X_1, X_2, ..., X_n \\sim N( \\mu_x, \\sigma^2_x)$\n",
+ "\n",
+ "$Y_1, Y_2, ..., Y_n \\sim N( \\mu_y, \\sigma^2_y)$\n",
+ "\n",
+ "$H_0:\\ \\sigma_x^2 / \\sigma_y^2 = 1 \\quad or \\quad \\sigma_x^2 / \\sigma_y^2 \\geq 1$\n",
+ "\n",
+ "$H_1:\\ \\sigma_x^2 / \\sigma_y^2 < 1$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$ F_0 \\equiv \\frac{S^2_x}{S^2_y}$\n",
+ "\n",
+ "$S^2_x = \\frac{\\sum_{i=1}^{n_x}\\ (x_i\\ -\\ \\overline{x})^2}{{n_x}-1}$\n",
+ "\n",
+ "$S^2_y = \\frac{\\sum_{i=1}^{n_y}\\ (y_i\\ -\\ \\overline{y})^2}{{n_y}-1}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $F_{{1-\\alpha}, n_x-1, n_y-1}\\ <\\ F_0 = \\frac{S^2_x}{S^2_y}\\ < \\infty$\n",
+ "\n",
+ "2. P_value $ = P(F_{n_x-1,n_y-1} \\leq F_0) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "vCD8AO5_BDBR"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "class equal_variances:\n",
+ "    \"\"\"\n",
+ "    F-test comparing the variances of two samples.\n",
+ "\n",
+ "    The statistic S2X / S2Y is referred to the F distribution with (n1-1, n2-1)\n",
+ "    degrees of freedom. The test runs as soon as the object is created and prints\n",
+ "    the test statistic, the p-value and the decision at significance level alpha.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    null_v : v0; stored on the instance, the statistic itself does not use it\n",
+ "    alpha : significance level\n",
+ "    type_t : 'two_tailed' or 'left_tailed'; any other value is treated as right-tailed\n",
+ "    n1 : optional, number of data1 members\n",
+ "    n2 : optional, number of data2 members\n",
+ "    S2X : optional, sample1 variance\n",
+ "    S2Y : optional, sample2 variance\n",
+ "    data1 : optional, if you do not know S2X, just pass the data; S2X (ddof=1) and n1 are computed from it\n",
+ "    data2 : optional, if you do not know S2Y, just pass the data; S2Y (ddof=1) and n2 are computed from it\n",
+ "    \"\"\"\n",
+ "    def __init__(self, null_v, alpha, type_t, n1 = 0., n2 = 0., S2X = 0., S2Y = 0., data1=None, data2=None):\n",
+ "        self.S2X = S2X\n",
+ "        self.S2Y = S2Y\n",
+ "        self.null_v = null_v\n",
+ "        self.type_t = type_t\n",
+ "        self.n1 = n1\n",
+ "        self.n2 = n2\n",
+ "        self.alpha = alpha\n",
+ "        self.data1 = data1\n",
+ "        self.data2 = data2\n",
+ "\n",
+ "        # Each sample is materialized once so that one-shot iterables also work.\n",
+ "        if data1 is not None:\n",
+ "            sample1 = list(data1)\n",
+ "            # Fix: the result used to be stored under the misspelled name SX2,\n",
+ "            # so the statistic kept using the default S2X.\n",
+ "            self.S2X = np.std(sample1, ddof=1)**2\n",
+ "            self.n1 = len(sample1)\n",
+ "        if data2 is not None:\n",
+ "            sample2 = list(data2)\n",
+ "            self.S2Y = np.std(sample2, ddof=1)**2\n",
+ "            self.n2 = len(sample2)\n",
+ "\n",
+ "        equal_variances.__test(self)\n",
+ "\n",
+ "    def __test(self):\n",
+ "        \"\"\"Print the F statistic, its p-value for the chosen tail, and the decision.\"\"\"\n",
+ "        test_statistic = self.S2X / self.S2Y\n",
+ "        print('test_statistic:', test_statistic, '\\n')\n",
+ "\n",
+ "        # Fix: every branch now uses (n1-1, n2-1); the one-sided branches used n2-2.\n",
+ "        dfn = self.n1-1\n",
+ "        dfd = self.n2-1\n",
+ "        lower_tail = stats.f.cdf(test_statistic, dfn, dfd)\n",
+ "\n",
+ "        if self.type_t == 'two_tailed':\n",
+ "            # Twice the smaller of the two tail areas.\n",
+ "            p_value = 2*np.min([1-lower_tail, lower_tail])\n",
+ "        elif self.type_t == 'left_tailed':\n",
+ "            p_value = lower_tail\n",
+ "        else:\n",
+ "            p_value = 1-lower_tail\n",
+ "        print('P_value:', p_value, '\\n')\n",
+ "\n",
+ "        if p_value < self.alpha:\n",
+ "            print(f'Since p_value < {self.alpha}, reject null hypothesis')\n",
+ "        else:\n",
+ "            print(f'Since p_value > {self.alpha}, the null hypothesis cannot be rejected.')"
+ ],
+ "metadata": {
+ "id": "TPjrukBQlDVd"
+ },
+ "execution_count": 25,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Two-tailed F-test from summary statistics only (no raw data passed).\n",
+ "equal_variances(null_v = 1, alpha = 0.05, type_t = 'two_tailed', n1 = 10, n2 = 12, S2X = .14, S2Y = .28);"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "KQgyQ4LzDHKp",
+ "outputId": "6bafa54d-3128-4202-87ae-3951a6988d04"
+ },
+ "execution_count": 26,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "test_statistic: 0.5 \n",
+ "\n",
+ "P_value: 0.3075191907594375 \n",
+ "\n",
+ "Since p_value > 0.05, the null hypothesis cannot be rejected.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "\n",
+ "## **4.7. Test Concerning P in Bernoulli Populations:**"
+ ],
+ "metadata": {
+ "id": "I2zKIWZWFf4V"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**A. Two Tailed Test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ (large) from a Bernoulli distribution having an unknown parameter $P$. \n",
+ "\n",
+ "$X_1, X_2, ..., X_n \\sim Ber(P)$\n",
+ "\n",
+ "$H_0 : P = P_0$\n",
+ "\n",
+ "$H_1 : P \\neq P_0$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$ Z_0 \\equiv \\frac{\\overline{X}\\ -\\ P_0}{\\sqrt{\\frac{P_0(1-P_0)}{n}}}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $-\\ Z_{\\frac{\\alpha}{2}}\\ <\\ Z_0 = \\frac{\\overline{X}\\ -\\ P_0}{\\sqrt{\\frac{P_0(1-P_0)}{n}}}\\ < Z_{\\frac{\\alpha}{2}}$\n",
+ "\n",
+ "2. $P_0\\ -\\ Z_{\\frac{\\alpha}{2}} \\sqrt{\\frac{P_0(1-P_0)}{n}} \\ <\\ \\overline{X} \\ < P_0\\ +\\ Z_{\\frac{\\alpha}{2}} \\sqrt{\\frac{P_0(1-P_0)}{n}}$\n",
+ "\n",
+ "3. P_value $ = 2 \\times P(Z \\geq |Z_0|) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "0ba1PtjFHYji"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**B. One-sided tests:**\n",
+ "\n",
+ "**Right-tailed test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ (large) from a Bernoulli distribution having an unknown parameter $P$. \n",
+ "\n",
+ "$X_1, X_2, ..., X_n \\sim Ber(P)$\n",
+ "\n",
+ "$H_0 : P = P_0 \\quad or \\quad P \\leq P_0$\n",
+ "\n",
+ "$H_1 : P > P_0$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$ Z_0 \\equiv \\frac{\\overline{X}\\ -\\ P_0}{\\sqrt{\\frac{P_0(1-P_0)}{n}}}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $-\\ \\infty <\\ Z_0 = \\frac{\\overline{X}\\ -\\ P_0}{\\sqrt{\\frac{P_0(1-P_0)}{n}}}\\ < Z_{\\alpha}$\n",
+ "\n",
+ "2. $-\\ \\infty \\ <\\ \\overline{X} \\ < P_0\\ +\\ Z_{\\alpha} \\sqrt{\\frac{P_0(1-P_0)}{n}}$\n",
+ "\n",
+ "3. P_value $ = P(Z \\geq Z_0) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "D3iUqEh4K65A"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Left-tailed test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ (large) from a Bernoulli distribution having an unknown parameter $P$. \n",
+ "\n",
+ "$X_1, X_2, ..., X_n \\sim Ber(P)$\n",
+ "\n",
+ "$H_0 : P = P_0 \\quad or \\quad P \\geq P_0$\n",
+ "\n",
+ "$H_1 : P < P_0$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$ Z_0 \\equiv \\frac{\\overline{X}\\ -\\ P_0}{\\sqrt{\\frac{P_0(1-P_0)}{n}}}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $-\\ Z_{\\alpha} <\\ Z_0 = \\frac{\\overline{X}\\ -\\ P_0}{\\sqrt{\\frac{P_0(1-P_0)}{n}}}\\ < \\infty$\n",
+ "\n",
+ "2. $\\ P_0\\ -\\ Z_{\\alpha} \\sqrt{\\frac{P_0(1-P_0)}{n}} \\ <\\ \\overline{X} \\ < \\infty$\n",
+ "\n",
+ "3. P_value $ = P(Z \\leq Z_0) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "dxmzJiCbQ29D"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "class p_bernoulli:\n",
+ "    \"\"\"\n",
+ "    Large-sample z-test for the parameter P of a Bernoulli population.\n",
+ "\n",
+ "    The statistic Z0 = (Sample_mean - null_p) / sqrt(null_p * (1 - null_p) / n)\n",
+ "    is compared with the standard normal distribution. The statistic, the\n",
+ "    p-value and the decision are printed as soon as the object is created.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    null_p : P0, value of P under the null hypothesis\n",
+ "    alpha : significance level\n",
+ "    type_t : 'two_tailed', 'right_tailed', 'left_tailed'\n",
+ "             (any other value is handled as 'right_tailed')\n",
+ "    n : optional, number of sample members\n",
+ "    Sample_mean : optional, mean of the sample\n",
+ "    data : optional, if you do not know the sample mean, just pass the data\n",
+ "\n",
+ "    Attributes\n",
+ "    ----------\n",
+ "    test_statistic : value of Z0\n",
+ "    p_value : p-value of the selected test\n",
+ "    \"\"\"\n",
+ "    def __init__(self, null_p, alpha, type_t, n = 0., Sample_mean = 0., data=None):\n",
+ "        self.Sample_mean = Sample_mean\n",
+ "        self.null_p = null_p\n",
+ "        self.type_t = type_t\n",
+ "        self.n = n\n",
+ "        self.alpha = alpha\n",
+ "        self.data = data\n",
+ "        if data is not None:\n",
+ "            # Materialize once so a one-shot iterable is not exhausted\n",
+ "            # before its length is taken.\n",
+ "            sample = list(data)\n",
+ "            self.Sample_mean = np.mean(sample)\n",
+ "            self.n = len(sample)\n",
+ "\n",
+ "        self.__test()\n",
+ "\n",
+ "    def __test(self):\n",
+ "        # Standard error of the sample proportion under the null hypothesis.\n",
+ "        standard_error = np.sqrt(self.null_p * (1-self.null_p) / self.n)\n",
+ "        test_statistic = (self.Sample_mean - self.null_p) / standard_error\n",
+ "        print('test_statistic:', test_statistic, '\\n')\n",
+ "\n",
+ "        if self.type_t == 'two_tailed':\n",
+ "            p_value = 2*(1-norm.cdf(abs(test_statistic)))\n",
+ "        elif self.type_t == 'left_tailed':\n",
+ "            p_value = norm.cdf(test_statistic)\n",
+ "        else:\n",
+ "            p_value = 1-norm.cdf(test_statistic)\n",
+ "        print('P_value:', p_value, '\\n')\n",
+ "\n",
+ "        # Keep the results so they can be inspected after construction.\n",
+ "        self.test_statistic = test_statistic\n",
+ "        self.p_value = p_value\n",
+ "\n",
+ "        if p_value < self.alpha:\n",
+ "            print(f'Since p_value < {self.alpha}, reject null hypothesis.')\n",
+ "        else:\n",
+ "            print(f'Since p_value > {self.alpha}, the null hypothesis cannot be rejected.')"
+ ],
+ "metadata": {
+ "id": "6Dhc2QP2RwDe"
+ },
+ "execution_count": 27,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Right-tailed test of P0 = 0.2 from a sample proportion of 0.3 with n = 100.\n",
+ "p_bernoulli(null_p = 0.2, alpha = 0.05, type_t = 'right_tailed', n = 100, Sample_mean = 0.3);"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "pBQw3rpNTRdE",
+ "outputId": "494610d4-8594-49de-9ff6-3d281ed69355"
+ },
+ "execution_count": 28,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "test_statistic: 2.4999999999999996 \n",
+ "\n",
+ "P_value: 0.006209665325776159 \n",
+ "\n",
+ "Since p_value < 0.05, reject null hypothesis.\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "\n",
+ "\n",
+ "## **4.8. Test Concerning the Equality of P in Two Bernoulli Populations:**\n",
+ "\n"
+ ],
+ "metadata": {
+ "id": "ro9fDzmQU8IZ"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**A. Two Tailed Test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_{n_x}$ is a sample of size $n_x$ (large) from a Bernoulli distribution having an unknown parameter $P_x$ and $Y_1, Y_2, ..., Y_{n_y}$ is a sample of size $n_y$ (large) from a Bernoulli distribution having an unknown parameter $P_y$.\n",
+ "\n",
+ "$X_1, X_2, ..., X_{n_x} \\sim Ber(P_x)$\n",
+ "\n",
+ "$Y_1, Y_2, ..., Y_{n_y} \\sim Ber(P_y)$\n",
+ "\n",
+ "$H_0 : P_x = P_y$\n",
+ "\n",
+ "$H_1 : P_x \\neq P_y$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics:\n",
+ "\n",
+ "$ Z_0 \\equiv \\frac{\\overline{X}\\ -\\ \\overline{Y}\\ -\\ 0}{\\sqrt{\\frac{\\widehat{p}(1- \\widehat{p})}{n_x} + \\frac{\\widehat{p}(1-\\widehat{p})}{n_y}}}$\n",
+ "\n",
+ "$\\widehat{p} = \\frac{n_x \\overline{X}\\ +\\ n_y \\overline{Y}}{n_x\\ +\\ n_y}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $-\\ Z_{\\frac{\\alpha}{2}}\\ <\\ Z_0 = \\frac{\\overline{X}\\ -\\ \\overline{Y}\\ -\\ 0}{\\sqrt{\\frac{\\widehat{p}(1- \\widehat{p})}{n_x} + \\frac{\\widehat{p}(1-\\widehat{p})}{n_y}}}\\ < Z_{\\frac{\\alpha}{2}}$\n",
+ "\n",
+ "2. $-\\ Z_{\\frac{\\alpha}{2}} \\sqrt{\\frac{\\widehat{p}(1- \\widehat{p})}{n_x} + \\frac{\\widehat{p}(1-\\widehat{p})}{n_y}}\\ <\\ \\overline{X}\\ -\\ \\overline{Y} < Z_{\\frac{\\alpha}{2}} \\sqrt{\\frac{\\widehat{p}(1- \\widehat{p})}{n_x} + \\frac{\\widehat{p}(1-\\widehat{p})}{n_y}}$\n",
+ "\n",
+ "3. P_value $ = 2 \\times P(Z \\geq |Z_0|) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "vjQ7CTG8WJOq"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**B. One-sided tests:**\n",
+ "\n",
+ "**Right-tailed test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_{n_x}$ is a sample of size $n_x$ (large) from a Bernoulli distribution having an unknown parameter $P_x$ and $Y_1, Y_2, ..., Y_{n_y}$ is a sample of size $n_y$ (large) from a Bernoulli distribution having an unknown parameter $P_y$.\n",
+ "\n",
+ "$X_1, X_2, ..., X_{n_x} \\sim Ber(P_x)$\n",
+ "\n",
+ "$Y_1, Y_2, ..., Y_{n_y} \\sim Ber(P_y)$\n",
+ "\n",
+ "$H_0 : P_x = P_y \\quad or \\quad P_x \\leq P_y$\n",
+ "\n",
+ "$H_1 : P_x > P_y$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics: \n",
+ "\n",
+ "$ Z_0 \\equiv \\frac{\\overline{X}\\ -\\ \\overline{Y}\\ -\\ 0}{\\sqrt{\\frac{\\widehat{p}(1- \\widehat{p})}{n_x} + \\frac{\\widehat{p}(1-\\widehat{p})}{n_y}}}$\n",
+ "\n",
+ "$\\widehat{p} = \\frac{n_x \\overline{X}\\ +\\ n_y \\overline{Y}}{n_x\\ +\\ n_y}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $-\\ \\infty\\ <\\ Z_0 = \\frac{\\overline{X}\\ -\\ \\overline{Y}\\ -\\ 0}{\\sqrt{\\frac{\\widehat{p}(1- \\widehat{p})}{n_x} + \\frac{\\widehat{p}(1-\\widehat{p})}{n_y}}}\\ < Z_{\\alpha}$\n",
+ "\n",
+ "2. $-\\ \\infty\\ <\\ \\overline{X}\\ -\\ \\overline{Y} < Z_{\\alpha} \\sqrt{\\frac{\\widehat{p}(1- \\widehat{p})}{n_x} + \\frac{\\widehat{p}(1-\\widehat{p})}{n_y}}$\n",
+ "\n",
+ "3. P_value $ = P(Z \\geq Z_0) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "ETaJekx1ZkP6"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "**Left-tailed test:**\n",
+ "\n",
+ "Suppose that $X_1, X_2, ..., X_{n_x}$ is a sample of size $n_x$ (large) from a Bernoulli distribution having an unknown parameter $P_x$ and $Y_1, Y_2, ..., Y_{n_y}$ is a sample of size $n_y$ (large) from a Bernoulli distribution having an unknown parameter $P_y$.\n",
+ "\n",
+ "$X_1, X_2, ..., X_{n_x} \\sim Ber(P_x)$\n",
+ "\n",
+ "$Y_1, Y_2, ..., Y_{n_y} \\sim Ber(P_y)$\n",
+ "\n",
+ "$H_0 : P_x = P_y \\quad or \\quad P_x \\geq P_y$\n",
+ "\n",
+ "$H_1 : P_x < P_y$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Testing statistics:\n",
+ "\n",
+ "$ Z_0 \\equiv \\frac{\\overline{X}\\ -\\ \\overline{Y}\\ -\\ 0}{\\sqrt{\\frac{\\widehat{p}(1- \\widehat{p})}{n_x} + \\frac{\\widehat{p}(1-\\widehat{p})}{n_y}}}$\n",
+ "\n",
+ "$\\widehat{p} = \\frac{n_x \\overline{X}\\ +\\ n_y \\overline{Y}}{n_x\\ +\\ n_y}$\n",
+ "\n",
+ "$\\\\ $\n",
+ "\n",
+ "Significance level = $\\alpha$\n",
+ "\n",
+ "We accept $H_0$ if:\n",
+ "\n",
+ "1. $-\\ Z_{\\alpha}\\ <\\ Z_0 = \\frac{\\overline{X}\\ -\\ \\overline{Y}\\ -\\ 0}{\\sqrt{\\frac{\\widehat{p}(1- \\widehat{p})}{n_x} + \\frac{\\widehat{p}(1-\\widehat{p})}{n_y}}}\\ < \\infty$\n",
+ "\n",
+ "2. $-\\ Z_{\\alpha} \\sqrt{\\frac{\\widehat{p}(1- \\widehat{p})}{n_x} + \\frac{\\widehat{p}(1-\\widehat{p})}{n_y}} <\\ \\overline{X}\\ -\\ \\overline{Y} < \\infty$\n",
+ "\n",
+ "3. P_value $ = P(Z \\leq Z_0) > \\alpha$"
+ ],
+ "metadata": {
+ "id": "KICWomUlal8o"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "class p_bernoulli_two_populations:\n",
+ "    \"\"\"\n",
+ "    Large-sample z-test for the equality of P in two Bernoulli populations.\n",
+ "\n",
+ "    The pooled estimate p_hat = (n1*Sample_mean1 + n2*Sample_mean2) / (n1 + n2)\n",
+ "    is used in the statistic\n",
+ "    Z0 = (Sample_mean1 - Sample_mean2) / sqrt(p_hat*(1-p_hat)/n1 + p_hat*(1-p_hat)/n2),\n",
+ "    which is compared with the standard normal distribution. The statistic,\n",
+ "    the p-value and the decision are printed as soon as the object is created.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    null_p : stored on the instance but not used in the computation\n",
+ "    alpha : significance level\n",
+ "    type_t : 'two_tailed', 'right_tailed', 'left_tailed'\n",
+ "             (any other value is handled as 'right_tailed')\n",
+ "    n1 : optional, number of data1 members\n",
+ "    n2 : optional, number of data2 members\n",
+ "    Sample_mean1 : optional, mean of sample1\n",
+ "    Sample_mean2 : optional, mean of sample2\n",
+ "    data1 : optional, if you do not know Sample_mean1, just pass the data\n",
+ "    data2 : optional, if you do not know Sample_mean2, just pass the data\n",
+ "\n",
+ "    Attributes\n",
+ "    ----------\n",
+ "    test_statistic : value of Z0\n",
+ "    p_value : p-value of the selected test\n",
+ "    \"\"\"\n",
+ "    def __init__(self, null_p, alpha, type_t, n1 = 0., n2 = 0., Sample_mean1 = 0., Sample_mean2 = 0., data1=None, data2=None):\n",
+ "        self.Sample_mean1 = Sample_mean1\n",
+ "        self.Sample_mean2 = Sample_mean2\n",
+ "        self.null_p = null_p\n",
+ "        self.type_t = type_t\n",
+ "        self.n1 = n1\n",
+ "        self.n2 = n2\n",
+ "        self.alpha = alpha\n",
+ "        self.data1 = data1\n",
+ "        self.data2 = data2\n",
+ "        if data1 is not None:\n",
+ "            # Materialize once so a one-shot iterable is not exhausted\n",
+ "            # before its length is taken.\n",
+ "            sample1 = list(data1)\n",
+ "            self.Sample_mean1 = np.mean(sample1)\n",
+ "            self.n1 = len(sample1)\n",
+ "        if data2 is not None:\n",
+ "            sample2 = list(data2)\n",
+ "            self.Sample_mean2 = np.mean(sample2)\n",
+ "            self.n2 = len(sample2)\n",
+ "\n",
+ "        self.__test()\n",
+ "\n",
+ "    def __test(self):\n",
+ "        # Pooled proportion: both samples share one P under the null hypothesis.\n",
+ "        p_hat = ((self.n1 * self.Sample_mean1) + (self.n2 * self.Sample_mean2)) / (self.n1 + self.n2)\n",
+ "        standard_error = np.sqrt((p_hat*(1-p_hat)/self.n1) + (p_hat*(1-p_hat)/self.n2))\n",
+ "        test_statistic = (self.Sample_mean1 - self.Sample_mean2) / standard_error\n",
+ "        print('test_statistic:', test_statistic, '\\n')\n",
+ "\n",
+ "        if self.type_t == 'two_tailed':\n",
+ "            p_value = 2*(1-norm.cdf(abs(test_statistic)))\n",
+ "        elif self.type_t == 'left_tailed':\n",
+ "            p_value = norm.cdf(test_statistic)\n",
+ "        else:\n",
+ "            p_value = 1-norm.cdf(test_statistic)\n",
+ "        print('P_value:', p_value, '\\n')\n",
+ "\n",
+ "        # Keep the results so they can be inspected after construction.\n",
+ "        self.test_statistic = test_statistic\n",
+ "        self.p_value = p_value\n",
+ "\n",
+ "        if p_value < self.alpha:\n",
+ "            print(f'Since p_value < {self.alpha}, reject null hypothesis.')\n",
+ "        else:\n",
+ "            print(f'Since p_value > {self.alpha}, the null hypothesis cannot be rejected.')"
+ ],
+ "metadata": {
+ "id": "Q4xWEZaMa_vJ"
+ },
+ "execution_count": 29,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Two-tailed comparison of two sample proportions (150/200 vs 170/200).\n",
+ "p_bernoulli_two_populations(null_p = 0, alpha = 0.05, type_t = 'two_tailed', n1 = 200, n2 = 200,\n",
+ "                            Sample_mean1 = 150/200, Sample_mean2 = 170/200);"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "wI-ZscfYcrAo",
+ "outputId": "308d954a-ea99-4b4a-d65f-44801d19df58"
+ },
+ "execution_count": 30,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "test_statistic: -2.4999999999999996 \n",
+ "\n",
+ "P_value: 0.012419330651552318 \n",
+ "\n",
+ "Since p_value < 0.05, reject null hypothesis.\n"
+ ]
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file