From 3097287260ace8236c0acdbbec49f277d28f2135 Mon Sep 17 00:00:00 2001 From: Pegah-Ardehkhani <89220787+Pegah-Ardehkhani@users.noreply.github.com> Date: Wed, 15 Jun 2022 15:15:02 +0430 Subject: [PATCH] Add files via upload --- Chapter 4 Parametric Hypothesis Testing.ipynb | 2544 +++++++++++++++++ 1 file changed, 2544 insertions(+) create mode 100644 Chapter 4 Parametric Hypothesis Testing.ipynb diff --git a/Chapter 4 Parametric Hypothesis Testing.ipynb b/Chapter 4 Parametric Hypothesis Testing.ipynb new file mode 100644 index 0000000..111dccc --- /dev/null +++ b/Chapter 4 Parametric Hypothesis Testing.ipynb @@ -0,0 +1,2544 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "Chapter 4: Parametric Hypothesis Testing.ipynb", + "provenance": [], + "collapsed_sections": [ + "WJhmgDxsVHEO", + "VAqEz3tk15jW", + "Kc6Ospy5qm2f", + "L5FYq4KIr5tR", + "btrLaUKzN1PY", + "4V1_eXUkxJD4", + "58w24h7DxfUc", + "rkSluAxdVb1M", + "Br8LlPdh4e9R", + "ZJbTadX1NnaL", + "a6nq2C5NlRKo", + "I2zKIWZWFf4V", + "ro9fDzmQU8IZ" + ] + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# **Chapter 4: Parametric Hypothesis Testing**" + ], + "metadata": { + "id": "a2GtsO5MH734" + } + }, + { + "cell_type": "markdown", + "source": [ + "**Table of Content:**\n", + "\n", + "- [Import Libraries](#Import_Libraries)\n", + "- [4.1. Introduction](#Introduction)\n", + "- [4.2. Test Concerning the Mean of a Normal Population](#Test_Concerning_the_Mean_of_a_Normal_Population)\n", + " - [4.2.1. Known Standard Deviation](#Known_Standard_Deviation)\n", + " - [4.2.2. Unknown Standard Deviation](#Unknown_Standard_Deviation)\n", + "- [4.3. Test Concerning the Equality of Means of Two Normal Populations](#Test_Concerning_the_Equality_of_Means_of_Two_Normal_Populations)\n", + " - [4.3.1. Known Variances](#Known_Variances)\n", + " - [4.3.2. 
Unknown but Equal Variances](#Unknown_but_Equal_Variances)\n", + "- [4.4. Paired t-test](#Paired_t-test)\n", + "- [4.5. Test Concerning the Variance of a Normal Population](#Test_Concerning_the_Variance_of_a_Normal_Population)\n", + "- [4.6. Test Concerning the Equality of Variances of Two Normal Populations](#Test_Concerning_the_Equality_of_Variances_of_Two_Normal_Populations)\n", + "- [4.7. Test _Concerning P in Bernoulli Populations](#Test_Concerning_P_in_Bernoulli_Populations)\n", + "- [4.8. Test Concerning the Equality of P in Two Bernoulli Populations](#Test_Concerning_the_Equality_of_P_in_Two_Bernoulli_Populations)\n" + ], + "metadata": { + "id": "dGYw5uzq5aM7" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "## **Import Libraries**" + ], + "metadata": { + "id": "WJhmgDxsVHEO" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install --upgrade scipy" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "vXStgb2JU6c0", + "outputId": "c3be2276-b0e8-4feb-c137-f726380b0ac9" + }, + "execution_count": 1, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", + "Requirement already satisfied: scipy in /usr/local/lib/python3.7/dist-packages (1.4.1)\n", + "Collecting scipy\n", + " Downloading scipy-1.7.3-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (38.1 MB)\n", + "\u001b[K |████████████████████████████████| 38.1 MB 18.4 MB/s \n", + "\u001b[?25hRequirement already satisfied: numpy<1.23.0,>=1.16.5 in /usr/local/lib/python3.7/dist-packages (from scipy) (1.21.6)\n", + "Installing collected packages: scipy\n", + " Attempting uninstall: scipy\n", + " Found existing installation: scipy 1.4.1\n", + " Uninstalling scipy-1.4.1:\n", + " Successfully uninstalled scipy-1.4.1\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the 
packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.\u001b[0m\n", + "Successfully installed scipy-1.7.3\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.patches as mpatches\n", + "import seaborn as sns\n", + "import math\n", + "from scipy import stats\n", + "from scipy.stats import norm\n", + "from scipy.stats import chi2\n", + "from scipy.stats import t\n", + "from scipy.stats import f\n", + "from scipy.stats import bernoulli\n", + "from scipy.stats import binom\n", + "from scipy.stats import nbinom\n", + "from scipy.stats import geom\n", + "from scipy.stats import poisson\n", + "from scipy.stats import uniform\n", + "from scipy.stats import randint\n", + "from scipy.stats import expon\n", + "from scipy.stats import gamma\n", + "from scipy.stats import beta\n", + "from scipy.stats import weibull_min\n", + "from scipy.stats import hypergeom\n", + "from scipy.stats import shapiro\n", + "from scipy.stats import pearsonr\n", + "from scipy.stats import normaltest\n", + "from scipy.stats import anderson\n", + "from scipy.stats import spearmanr\n", + "from scipy.stats import kendalltau\n", + "from scipy.stats import chi2_contingency\n", + "from scipy.stats import ttest_ind\n", + "from scipy.stats import ttest_rel\n", + "from scipy.stats import mannwhitneyu\n", + "from scipy.stats import wilcoxon\n", + "from scipy.stats import kruskal\n", + "from scipy.stats import friedmanchisquare\n", + "from statsmodels.tsa.stattools import adfuller\n", + "from statsmodels.tsa.stattools import kpss\n", + "from statsmodels.stats.weightstats import ztest\n", + "from scipy.integrate import quad\n", + "from IPython.display import display, Latex\n", + "\n", + "import warnings\n", + 
"warnings.filterwarnings('ignore')\n", + "warnings.simplefilter(action='ignore', category=FutureWarning)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "ZPuphzTmU-P8", + "outputId": "16e994f6-8530-4d26-a3f4-34a299ddbd95" + }, + "execution_count": 2, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "/usr/local/lib/python3.7/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.\n", + " import pandas.util.testing as tm\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "## **4.1. Introduction:**" + ], + "metadata": { + "id": "VAqEz3tk15jW" + } + }, + { + "cell_type": "markdown", + "source": [ + "**Statistical Hypothesis:** A statistical hypothesis is usually a statement about a set of parameters of a population distribution.\n", + "\n", + "**$H_0$ (Null Hypothesis):** The null hypothesis is a statistical hypothesis to be tested and accepted or rejected in favor of an alternative.\n", + "\n", + "**$H_1$ (Alternative Hypothesis):** An alternative hypothesis is an opposing theory in relation to the null hypothesis.\n", + "\n", + "**Type $\\mathrm{I}$ Error:** The type $\\mathrm{I}$ error, is said to result if the test incorrectly calls for rejecting $H_0$ when it is indeed correct.\n", + "\n", + "$\\alpha = P(reject\\ H_0\\ |\\ H_0\\ is\\ true)$\n", + "\n", + "**Type $\\mathrm{II}$ Error:** The type $\\mathrm{II}$ error, results if the test calls for accepting $H_0$ when it is false.\n", + "\n", + "$\\beta = P(Accept\\ H_0\\ |\\ H_0\\ is\\ not\\ true)$\n", + "\n", + "**Significance Level:** Whenever $H_0$ is true, its probability of being rejected is never greater than $\\alpha$. 
The value $\\alpha$, called the level of significance of the test, is usually set in advance, with commonly chosen values being $\\alpha = 0.1, 0.05, 0.005$.\n", + "\n", + "**P_value:** The P value, or calculated probability, is the probability of finding the observed, or more extreme, results when the null hypothesis ($H_0$) of a study question is true — the definition of ‘extreme’ depends on how the hypothesis is being tested.\n", + "\n", + "If your P value is less than the chosen significance level then you reject the null hypothesis, i.e., accept that your sample gives reasonable evidence to support the alternative hypothesis." + ], + "metadata": { + "id": "ObVoGiq9mQ_D" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "\n", + "\n", + "
| . | $H_0$ is actually true | $H_0$ is not true |
|---|---|---|
| Accept $H_0$ | $1-\\alpha$ | $\\beta$ |
| Reject $H_0$ | $\\alpha$ | $1-\\beta$ |
" + ], + "metadata": { + "id": "YepvfOWIIcEk" + } + }, + { + "cell_type": "markdown", + "source": [ + "**Steps of a hypothesis testing:**\n", + "\n", + "1. Specify the null and alternative Hypothesis.\n", + "2. Collect a random sample from the population. \n", + "3. Calculate the Test Statistic and Corresponding P-Value\n", + "4. Decide whether to reject or fail to reject your null hypothesis" + ], + "metadata": { + "id": "UaFVzosSs5oP" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "## **4.2. Test Concerning the Mean of a Normal Population:**" + ], + "metadata": { + "id": "Kc6Ospy5qm2f" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "### **4.2.1. Known Standard Deviation:**" + ], + "metadata": { + "id": "L5FYq4KIr5tR" + } + }, + { + "cell_type": "markdown", + "source": [ + "**A. Two Tailed Test:**\n", + "\n", + "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ from a normal distribution having an unknown mean $\\mu$ and a known variance $\\sigma^2$.\n", + "\n", + "$X_1, X_2, ..., X_n \\sim N( \\mu, \\sigma^2)$\n", + "\n", + "$H_0:\\ \\mu = \\mu_0$\n", + "\n", + "$H_1:\\ \\mu \\neq \\mu_0$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$Z_0 \\equiv \\frac{\\overline{X}-\\mu_0}{\\frac{\\sigma}{\\sqrt{n}}}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $-\\ Z_{\\frac{\\alpha}{2}}\\ <\\ Z_0 = \\frac{\\overline{X}-\\mu_0}{\\frac{\\sigma}{\\sqrt{n}}}\\ <\\ Z_{\\frac{\\alpha}{2}}$\n", + "\n", + "2. $\\mu_0\\ -\\ Z_{\\frac{\\alpha}{2}} \\frac{\\sigma}{\\sqrt{n}}\\ <\\ \\overline{X}\\ <\\ \\mu_0\\ +\\ Z_{\\frac{\\alpha}{2}} \\frac{\\sigma}{\\sqrt{n}}$\n", + "\n", + "3. $|\\overline{X}-\\mu_0| < Z_{\\frac{\\alpha}{2}} \\frac{\\sigma}{\\sqrt{n}}$\n", + "\n", + "4. 
P_value $ = 2 \\times P(Z \\geq |Z_0|) > \\alpha$" + ], + "metadata": { + "id": "9PWV-JGeJyVN" + } + }, + { + "cell_type": "markdown", + "source": [ + "**B. One-sided tests:**\n", + "\n", + "**Right-tailed test:**\n", + "\n", + "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ from a normal distribution having an unknown mean $\\mu$ and a known variance $\\sigma^2$.\n", + "\n", + "$X_1, X_2, ..., X_n \\sim N( \\mu, \\sigma^2)$\n", + "\n", + "$H_0:\\ \\mu = \\mu_0\\ \\quad or \\quad H_0:\\ \\mu \\leq \\mu_0$\n", + "\n", + "$H_1:\\ \\mu > \\mu_0$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$Z_0 \\equiv \\frac{\\overline{X}-\\mu_0}{\\frac{\\sigma}{\\sqrt{n}}}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $ -\\infty <\\ Z_0 = \\frac{\\overline{X}-\\mu_0}{\\frac{\\sigma}{\\sqrt{n}}}\\ <\\ Z_{\\alpha} $\n", + "\n", + "2. $- \\infty \\ <\\ \\overline{X}\\ <\\ \\mu_0\\ +\\ Z_{\\alpha} \\frac{\\sigma}{\\sqrt{n}}$\n", + "\n", + "3. P_value $ = P(Z \\geq Z_0) > \\alpha$" + ], + "metadata": { + "id": "31PJg_ETxNMP" + } + }, + { + "cell_type": "markdown", + "source": [ + "**Left-tailed test:**\n", + "\n", + "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ from a normal distribution having an unknown mean $\\mu$ and a known variance $\\sigma^2$.\n", + "\n", + "$X_1, X_2, ..., X_n \\sim N( \\mu, \\sigma^2)$\n", + "\n", + "$H_0:\\ \\mu = \\mu_0 \\quad or \\quad H_0:\\ \\mu \\geq \\mu_0$\n", + "\n", + "$H_1:\\ \\mu < \\mu_0$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$Z_0 \\equiv \\frac{\\overline{X}-\\mu_0}{\\frac{\\sigma}{\\sqrt{n}}}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $-\\ Z_{\\alpha}\\ <\\ Z_0 = \\frac{\\overline{X}-\\mu_0}{\\frac{\\sigma}{\\sqrt{n}}}\\ <\\ \\infty $\n", + "\n", + "2. 
$\\ \\mu_0\\ -\\ Z_{\\alpha} \\frac{\\sigma}{\\sqrt{n}}\\ <\\ \\overline{X}\\ <\\ \\infty$\n", + "\n", + "3. P_value $ = P(Z \\leq Z_0) > \\alpha$" + ], + "metadata": { + "id": "rhL-meIV0GAB" + } + }, + { + "cell_type": "code", + "source": [ + "class mean_with_known_variance:\n", + " \"\"\"\n", + " Parameters\n", + " ----------\n", + " null_mean : u0\n", + " population_sd : known standrad deviation of the population\n", + " n : optional, number of sample members\n", + " alpha : significance level\n", + " type_t : 'two_tailed', 'right_tailed', 'left_tailed'\n", + " Sample_mean : optional, mean of the sample\n", + " data : optional, if you do not know the sample mean, just pass the data\n", + " \"\"\"\n", + " def __init__(self, null_mean, population_sd, alpha, type_t, n = 0., Sample_mean = 0., data=None):\n", + " self.Sample_mean = Sample_mean\n", + " self.null_mean = null_mean\n", + " self.population_sd = population_sd\n", + " self.type_t = type_t\n", + " self.n = n\n", + " self.alpha = alpha\n", + " self.data = data\n", + " if data is not None:\n", + " self.Sample_mean = np.mean(list(data))\n", + " self.n = len(list(data))\n", + "\n", + " mean_with_known_variance.__test(self)\n", + " \n", + " def __test(self):\n", + " test_statistic = (self.Sample_mean-self.null_mean) / (self.population_sd/np.sqrt(self.n))\n", + " print('test_statistic:', test_statistic, '\\n')\n", + "\n", + " if self.type_t == 'two_tailed':\n", + " p_value = 2*(1-norm.cdf(abs(test_statistic)))\n", + " print('P_value:', p_value, '\\n')\n", + " elif self.type_t == 'left_tailed':\n", + " p_value = (norm.cdf(test_statistic))\n", + " print('P_value:', p_value, '\\n')\n", + " else:\n", + " p_value = (1-norm.cdf(test_statistic))\n", + " print('P_value:', p_value, '\\n')\n", + "\n", + " if p_value < self.alpha:\n", + " print(f'Since p_value < {self.alpha}, reject null hypothesis.')\n", + " else:\n", + " print(f'Since p_value > {self.alpha}, the null hypothesis cannot be rejected.')" + ], + "metadata": { + 
"id": "lX6jag0Rlm0r" + }, + "execution_count": 3, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "mean_with_known_variance(type_t='two_tailed', Sample_mean=8.75, n=16, null_mean=10, population_sd=1, alpha=0.05);" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7X8RAhJ2lp86", + "outputId": "cb26aaa5-0684-467f-a5b5-d074a9a7c622" + }, + "execution_count": 4, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "test_statistic: -5.0 \n", + "\n", + "P_value: 5.733031438470704e-07 \n", + "\n", + "Since p_value < 0.05, reject null hypothesis.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "np.random.seed(1)\n", + "data = np.random.normal(0,1,100)\n", + "mean_with_known_variance(type_t='left_tailed', data=data, null_mean=0.1, population_sd=1, alpha=0.05);" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WiTfjm-Blw9Z", + "outputId": "59b6eb97-6fe6-4e80-a2c1-91152e4288a1" + }, + "execution_count": 5, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "test_statistic: -0.394171479243013 \n", + "\n", + "P_value: 0.34672722046703075 \n", + "\n", + "Since p_value > 0.05, the null hypothesis cannot be rejected.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "### **4.2.2. Unknown Standard Deviation:**\n" + ], + "metadata": { + "id": "btrLaUKzN1PY" + } + }, + { + "cell_type": "markdown", + "source": [ + "**A. 
Two Tailed Test:**\n", + "\n", + "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ from a normal distribution having an unknown mean $\\mu$ and an unknown variance $\\sigma^2$.\n", + "\n", + "$X_1, X_2, ..., X_n \\sim N( \\mu, \\sigma^2)$\n", + "\n", + "$H_0:\\ \\mu = \\mu_0$\n", + "\n", + "$H_1:\\ \\mu \\neq \\mu_0$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$t_0 \\equiv \\frac{\\overline{X}-\\mu_0}{\\frac{S}{\\sqrt{n}}} \\sim t_{n-1}$\n", + "\n", + "$S = \\sqrt{\\frac{\\sum_{i=1}^n\\ (x_i\\ -\\ \\overline{x})^2}{n-1}}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $-\\ t_{\\frac{\\alpha}{2},n-1}\\ <\\ t_0 = \\frac{\\overline{X}-\\mu_0}{\\frac{S}{\\sqrt{n}}} <\\ t_{\\frac{\\alpha}{2},n-1}$\n", + "\n", + "2. $\\mu_0\\ -\\ t_{\\frac{\\alpha}{2},n-1} \\frac{S}{\\sqrt{n}}\\ <\\ \\overline{X}\\ <\\ \\mu_0\\ +\\ t_{\\frac{\\alpha}{2},n-1} \\frac{S}{\\sqrt{n}}$\n", + "\n", + "3. $|\\overline{X}-\\mu_0| < t_{\\frac{\\alpha}{2},n-1} \\frac{S}{\\sqrt{n}}$\n", + "\n", + "4. P_value $ = 2 \\times P(t_{n-1} \\geq |t_0|) > \\alpha$" + ], + "metadata": { + "id": "2b3-4p0tQKdv" + } + }, + { + "cell_type": "markdown", + "source": [ + "**B. 
One-sided tests:**\n", + "\n", + "**Right-tailed test:**\n", + "\n", + "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ from a normal distribution having an unknown mean $\\mu$ and an unknown variance $\\sigma^2$.\n", + "\n", + "$X_1, X_2, ..., X_n \\sim N( \\mu, \\sigma^2)$\n", + "\n", + "$H_0:\\ \\mu = \\mu_0\\ \\quad or \\quad H_0:\\ \\mu \\leq \\mu_0$\n", + "\n", + "$H_1:\\ \\mu > \\mu_0$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$t_0 \\equiv \\frac{\\overline{X}-\\mu_0}{\\frac{S}{\\sqrt{n}}} \\sim t_{n-1}$\n", + "\n", + "$S = \\sqrt{\\frac{\\sum_{i=1}^n\\ (x_i\\ -\\ \\overline{x})^2}{n-1}}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $ -\\infty <\\ t_0 = \\frac{\\overline{X}-\\mu_0}{\\frac{S}{\\sqrt{n}}}\\ <\\ t_{\\alpha, n-1} $\n", + "\n", + "2. $- \\infty \\ <\\ \\overline{X}\\ <\\ \\mu_0\\ +\\ t_{\\alpha, n-1} \\frac{S}{\\sqrt{n}}$\n", + "\n", + "3. P_value $ = P(t_{n-1} \\geq t_0) > \\alpha$" + ], + "metadata": { + "id": "EDXY_X3im-39" + } + }, + { + "cell_type": "markdown", + "source": [ + "**Left-tailed test:**\n", + "\n", + "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ from a normal distribution having an unknown mean $\\mu$ and an unknown variance $\\sigma^2$.\n", + "\n", + "$X_1, X_2, ..., X_n \\sim N( \\mu, \\sigma^2)$\n", + "\n", + "$H_0:\\ \\mu = \\mu_0 \\quad or \\quad H_0:\\ \\mu \\geq \\mu_0$\n", + "\n", + "$H_1:\\ \\mu < \\mu_0$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$t_0 \\equiv \\frac{\\overline{X}-\\mu_0}{\\frac{S}{\\sqrt{n}}} \\sim t_{n-1}$\n", + "\n", + "$S = \\sqrt{\\frac{\\sum_{i=1}^n\\ (x_i\\ -\\ \\overline{x})^2}{n-1}}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $ -\\ t_{\\alpha, n-1} <\\ t_0 = \\frac{\\overline{X}-\\mu_0}{\\frac{S}{\\sqrt{n}}}\\ <\\ \\infty$\n", + "\n", + "2. 
$\\ \\mu_0\\ -\\ t_{\\alpha, n-1} \\frac{S}{\\sqrt{n}} <\\ \\overline{X}\\ <\\ \\infty $\n", + "\n", + "3. P_value $ = P(t_{n-1} \\leq t_0) > \\alpha$" + ], + "metadata": { + "id": "BjSK1-M-oaJU" + } + }, + { + "cell_type": "code", + "source": [ + "class mean_with_unknown_variance:\n", + " \"\"\"\n", + " Parameters\n", + " ----------\n", + " null_mean : u0\n", + " n : optional, number of sample members\n", + " S : optional, sample standard deviation\n", + " alpha : significance level\n", + " Sample_mean : optional, mean of the sample\n", + " data : optional, if you do not know the sample mean and S, just pass the data\n", + " \"\"\"\n", + " def __init__(self, null_mean, alpha, type_t, n = 0.,S = 0., Sample_mean = 0., data=None):\n", + " self.Sample_mean = Sample_mean\n", + " self.S = S\n", + " self.null_mean = null_mean\n", + " self.type_t = type_t\n", + " self.n = n\n", + " self.alpha = alpha\n", + " self.data = data\n", + " if data is not None:\n", + " self.Sample_mean = np.mean(list(data))\n", + " self.S = np.std(list(data), ddof=1)\n", + " self.n = len(list(data))\n", + "\n", + " mean_with_unknown_variance.__test(self)\n", + " \n", + " def __test(self):\n", + " test_statistic = (self.Sample_mean-self.null_mean) / (self.S/np.sqrt(self.n))\n", + " print('test_statistic:', test_statistic, '\\n')\n", + " \n", + " if self.type_t == 'two_tailed':\n", + " p_value = 2*(1-t.cdf(abs(test_statistic), df=self.n-1))\n", + " print('P_value:', p_value, '\\n')\n", + " elif self.type_t == 'left_tailed':\n", + " p_value = (t.cdf(test_statistic, df=self.n-1))\n", + " print('P_value:', p_value, '\\n')\n", + " else:\n", + " p_value = (1-t.cdf(test_statistic, df=self.n-1))\n", + " print('P_value:', p_value, '\\n')\n", + "\n", + " if p_value < self.alpha:\n", + " print(f'Since p_value < {self.alpha}, reject null hypothesis.')\n", + " else:\n", + " print(f'Since p_value > {self.alpha}, the null hypothesis cannot be rejected.')" + ], + "metadata": { + "id": "nt7giKLcfxtx" + }, + 
"execution_count": 6, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "mean_with_unknown_variance(type_t='two_tailed', Sample_mean=14.5, S=0.3, n=9, null_mean=14.2, alpha=0.05);" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "60yeQGM1k0cZ", + "outputId": "ebf8ef2b-0722-48cf-c024-a053823acad8" + }, + "execution_count": 7, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "test_statistic: 3.0000000000000075 \n", + "\n", + "P_value: 0.017071681233782554 \n", + "\n", + "Since p_value < 0.05, reject null hypothesis.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "np.random.seed(1)\n", + "data = np.random.normal(0,1,100)\n", + "mean_with_unknown_variance(type_t='two_tailed', data=data, null_mean=0.1, alpha=0.05);" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cIFNzvbIqov6", + "outputId": "11b26f64-0e67-4f9a-d316-542cdb0bfb90" + }, + "execution_count": 8, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "test_statistic: -0.4430807396299341 \n", + "\n", + "P_value: 0.6586740373655937 \n", + "\n", + "Since p_value > 0.05, the null hypothesis cannot be rejected.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "## **4.3. Test Concerning the Equality of Means of Two Normal Populations:**" + ], + "metadata": { + "id": "4V1_eXUkxJD4" + } + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "### **4.3.1. Known Variances:**" + ], + "metadata": { + "id": "58w24h7DxfUc" + } + }, + { + "cell_type": "markdown", + "source": [ + "**A. Two Tailed Test:**\n", + "\n", + "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n_x$ from a normal distribution having an unknown mean $\\mu_x$ and a known variance $\\sigma^2_x$. 
\n", + "\n", + "$Y_1, Y_2, ..., Y_n$ is a sample of size $n_y$ from a normal distribution having an unknown mean $\\mu_y$ and a known variance $\\sigma^2_y$.\n", + "\n", + "$X_1, X_2, ..., X_{n_x} \\sim N( \\mu_x, \\sigma^2_x)$\n", + "\n", + "$Y_1, Y_2, ..., Y_{n_y} \\sim N( \\mu_y, \\sigma^2_y)$\n", + "\n", + "$H_0:\\ \\mu_x = \\mu_y$\n", + "\n", + "$H_1:\\ \\mu_x \\neq \\mu_y$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$Z_0 \\equiv \\frac{\\overline{X}-\\overline{Y} -\\ 0}{\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + (\\frac{\\sigma^2_y}{n_y})}}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $-\\ Z_{\\frac{\\alpha}{2}}\\ <\\ Z_0 = \\frac{\\overline{X}-\\overline{Y} -\\ 0}{\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + (\\frac{\\sigma^2_y}{n_y})}}\\ <\\ Z_{\\frac{\\alpha}{2}}$\n", + "\n", + "2. $-\\ Z_{\\frac{\\alpha}{2}} {\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + (\\frac{\\sigma^2_y}{n_y})}}\\ <\\ \\overline{X}\\ - \\overline{Y}\\ <\\ Z_{\\frac{\\alpha}{2}} {\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + (\\frac{\\sigma^2_y}{n_y})}}$\n", + "\n", + "3. $|\\overline{X}-\\overline{Y}| < Z_{\\frac{\\alpha}{2}} {\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + (\\frac{\\sigma^2_y}{n_y})}}$\n", + "\n", + "4. P_value $ = 2 \\times P(Z \\geq |Z_0|) > \\alpha$" + ], + "metadata": { + "id": "CkDCKD4ZHYtP" + } + }, + { + "cell_type": "markdown", + "source": [ + "**B. One-sided tests:**\n", + "\n", + "**Right-tailed test:**\n", + "\n", + "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n_x$ from a normal distribution having an unknown mean $\\mu_x$ and a known variance $\\sigma^2_x$. 
\n", + "\n", + "$Y_1, Y_2, ..., Y_n$ is a sample of size $n_y$ from a normal distribution having an unknown mean $\\mu_y$ and a known variance $\\sigma^2_y$.\n", + "\n", + "$X_1, X_2, ..., X_{n_x} \\sim N( \\mu_x, \\sigma^2_x)$\n", + "\n", + "$Y_1, Y_2, ..., Y_{n_y} \\sim N( \\mu_y, \\sigma^2_y)$\n", + "\n", + "$H_0:\\ \\mu_x = \\mu_y\\ \\quad or \\quad H_0:\\ \\mu_x \\leq \\mu_y$\n", + "\n", + "$H_1:\\ \\mu_x > \\mu_y$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$Z_0 \\equiv \\frac{\\overline{X}-\\overline{Y} - 0}{\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + (\\frac{\\sigma^2_y}{n_y})}}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $ -\\infty <\\ Z_0 = \\frac{\\overline{X}-\\overline{Y} - 0}{\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + (\\frac{\\sigma^2_y}{n_y})}}\\ <\\ Z_{\\alpha} $\n", + "\n", + "2. $- \\infty \\ <\\ \\overline{X}\\ - \\overline{Y}\\ <\\ Z_{\\alpha} {\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + (\\frac{\\sigma^2_y}{n_y})}}$\n", + "\n", + "3. P_value $ = P(Z \\geq Z_0) > \\alpha$" + ], + "metadata": { + "id": "Yo2g8fAeHYtS" + } + }, + { + "cell_type": "markdown", + "source": [ + "**Left-tailed test:**\n", + "\n", + "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n_x$ from a normal distribution having an unknown mean $\\mu_x$ and a known variance $\\sigma^2_x$. 
\n", + "\n", + "$Y_1, Y_2, ..., Y_n$ is a sample of size $n_y$ from a normal distribution having an unknown mean $\\mu_y$ and a known variance $\\sigma^2_y$.\n", + "\n", + "$X_1, X_2, ..., X_{n_x} \\sim N( \\mu_x, \\sigma^2_x)$\n", + "\n", + "$Y_1, Y_2, ..., Y_{n_y} \\sim N( \\mu_y, \\sigma^2_y)$\n", + "\n", + "$H_0:\\ \\mu_x = \\mu_y \\quad or \\quad H_0:\\ \\mu_x \\geq \\mu_y$\n", + "\n", + "$H_1:\\ \\mu_x < \\mu_y$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$Z_0 \\equiv \\frac{\\overline{X}-\\overline{Y} - 0}{\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + (\\frac{\\sigma^2_y}{n_y})}}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $-\\ Z_{\\alpha}\\ <\\ Z_0 = \\frac{\\overline{X}-\\overline{Y} - 0}{\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + (\\frac{\\sigma^2_y}{n_y})}} <\\ \\infty $\n", + "\n", + "2. $\\ -Z_{\\alpha} {\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + (\\frac{\\sigma^2_y}{n_y})}}\\ <\\ \\overline{X} - \\overline{Y}\\ <\\ \\infty$\n", + "\n", + "3. P_value $ = P(Z \\leq Z_0) > \\alpha$" + ], + "metadata": { + "id": "nt2bxi_dHYtU" + } + }, + { + "cell_type": "markdown", + "source": [ + "If you want to test the below hypothesis, just the test statistics changes. The other equations are the same. 
\n", + "\n", + "$H_0:\\ \\mu_x - k\\ \\mu_y = d $\n", + "\n", + "$H_1:\\ \\mu_x - k\\ \\mu_y \\neq d$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$Z_0 \\equiv \\frac{\\overline{X}-\\ k\\ \\overline{Y} -\\ d}{\\sqrt{(\\frac{\\sigma^2_x}{n_x}) + k^2(\\frac{\\sigma^2_y}{n_y})}}$" + ], + "metadata": { + "id": "28hPv38GQIhB" + } + }, + { + "cell_type": "code", + "source": [ + "class equal_means_with_known_variances:\n", + " \"\"\"\n", + " Parameters\n", + " ----------\n", + " diff_mean_null : difference between the means of two population in the null hypothesis\n", + " population_v1 : known variance of the population1\n", + " population_v2 : known variance of the population2\n", + " n1 : optional, number of sample1 members\n", + " n2 : optional, number of sample2 members\n", + " alpha : significance level\n", + " type_t : 'two_tailed', 'right_tailed', 'left_tailed'\n", + " Sample_mean1 : mean of the sample1\n", + " Sample_mean2 : mean of the sample2\n", + " k : optional\n", + " data1 : optional, if you do not know the sample mean1, just pass the first sample\n", + " data2 : optional, if you do not know the sample mean2, just pass the second sample\n", + " \"\"\"\n", + " def __init__(self, diff_mean_null, population_v1, population_v2, alpha, type_t, k = 1,\n", + " n1 = 0., n2 = 0., Sample_mean1 = 0., Sample_mean2 = 0., data1=None, data2=None):\n", + " self.Sample_mean1 = Sample_mean1\n", + " self.Sample_mean2 = Sample_mean2 \n", + " self.diff_mean_null = diff_mean_null\n", + " self.population_v1 = population_v1\n", + " self.population_v2 = population_v2\n", + " self.type_t = type_t\n", + " self.n1 = n1\n", + " self.n2 = n2\n", + " self.k = k\n", + " self.alpha = alpha\n", + " self.data1 = data1\n", + " self.data2 = data2\n", + " if data1 is not None:\n", + " self.Sample_mean1 = np.mean(list(data1))\n", + " self.n1 = len(list(data1))\n", + " if data2 is not None:\n", + " self.Sample_mean2 = np.mean(list(data2))\n", + " self.n2 = 
len(list(data2)) \n", + "\n", + " equal_means_with_known_variances.__test(self)\n", + " \n", + " def __test(self):\n", + " test_statistic = (self.Sample_mean1-(self.k)*self.Sample_mean2-self.diff_mean_null) / (np.sqrt((self.population_v1/ self.n1) + (self.k**2)*(self.population_v2/ self.n2)))\n", + " print('test_statistic:', test_statistic, '\\n')\n", + "\n", + " if self.type_t == 'two_tailed':\n", + " p_value = 2*(1-norm.cdf(abs(test_statistic)))\n", + " print('P_value:', p_value, '\\n')\n", + " elif self.type_t == 'left_tailed':\n", + " p_value = (norm.cdf(test_statistic))\n", + " print('P_value:', p_value, '\\n')\n", + " else:\n", + " p_value = (1-norm.cdf(test_statistic))\n", + " print('P_value:', p_value, '\\n')\n", + "\n", + " if p_value < self.alpha:\n", + " print(f'Since p_value < {self.alpha}, reject null hypothesis.')\n", + " else:\n", + " print(f'Since p_value > {self.alpha}, the null hypothesis cannot be rejected.')" + ], + "metadata": { + "id": "l1L5_GNiHaUN" + }, + "execution_count": 9, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "data1 = np.multiply([61.1,58.2,62.3,64,59.7,66.2,57.8,61.4,62.2,63.6], 100)\n", + "data2 = np.multiply([62.2,56.6,66.4,56.2,57.4,58.4,57.6,65.4], 100)\n", + "np.mean(data1), np.mean(data2)" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "UKOU5aMAT23t", + "outputId": "eadf5b16-ae41-4f48-9d12-a8f75f742d50" + }, + "execution_count": 10, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "(6165.0, 6002.5)" + ] + }, + "metadata": {}, + "execution_count": 10 + } + ] + }, + { + "cell_type": "code", + "source": [ + "equal_means_with_known_variances(diff_mean_null = 0, population_v1 = 4000**2, population_v2 = 6000**2, n1 = 10, n2 = 8, \n", + " alpha = 0.05, type_t = 'two_tailed', k = 1, Sample_mean1 = 6165, Sample_mean2 = 6002.5);" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "OrYN40CmRS3V", + 
"outputId": "b5e6a3c4-24ba-4c03-ebd3-168b1f5517c7" + }, + "execution_count": 11, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "test_statistic: 0.06579432682703693 \n", + "\n", + "P_value: 0.947541572987298 \n", + "\n", + "Since p_value > 0.05, the null hypothesis cannot be rejected.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "equal_means_with_known_variances(diff_mean_null = 0, population_v1 = 4000**2, population_v2 = 6000**2, \n", + " alpha = 0.05, type_t = 'two_tailed', k = 1, data1 = data1, data2 = data2);" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "OAA0YmNPTFuf", + "outputId": "ab4e8374-81c7-43df-c77a-24d5193c5620" + }, + "execution_count": 12, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "test_statistic: 0.06579432682703693 \n", + "\n", + "P_value: 0.947541572987298 \n", + "\n", + "Since p_value > 0.05, the null hypothesis cannot be rejected.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "### **4.3.2. Unknown but Equal Variances:**" + ], + "metadata": { + "id": "rkSluAxdVb1M" + } + }, + { + "cell_type": "markdown", + "source": [ + "**A. Two Tailed Test:**\n", + "\n", + "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n_x$ from a normal distribution having an unknown mean $\\mu_x$ and a unknown variance $\\sigma^2_x$. 
\n", + "\n", + "$Y_1, Y_2, ..., Y_n$ is a sample of size $n_y$ from a normal distribution having an unknown mean $\\mu_y$ and a unknown variance $\\sigma^2_y$.\n", + "\n", + "$X_1, X_2, ..., X_{n_x} \\sim N( \\mu_x, \\sigma^2_x)$\n", + "\n", + "$Y_1, Y_2, ..., Y_{n_y} \\sim N( \\mu_y, \\sigma^2_y)$\n", + "\n", + "$H_0:\\ \\mu_x = \\mu_y$\n", + "\n", + "$H_1:\\ \\mu_x \\neq \\mu_y$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$t_0 \\equiv \\frac{\\overline{X}-\\ \\overline{Y}\\ -\\ 0}{{S_p} \\sqrt{\\frac{1}{n_x} + \\frac{1}{n_y}}} \\sim t_{{n_x}+{n_y}-2}$\n", + "\n", + "$S_p^2 = \\frac{(n_x-1)S_x^2 + (n_y-1)S_y^2}{n_x+n_y-2}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $-\\ t_{\\frac{\\alpha}{2},{n_x} + {n_y}-2}\\ <\\ t_0 = \\frac{\\overline{X}-\\ \\overline{Y}\\ -\\ 0}{{S_p} \\sqrt{\\frac{1}{n_x} + \\frac{1}{n_y}}} <\\ t_{\\frac{\\alpha}{2},{n_x} + {n_y}-2}$\n", + "\n", + "2. $-\\ t_{\\frac{\\alpha}{2},{n_x} + {n_y}-2} {{S_p} \\sqrt{\\frac{1}{n_x} + \\frac{1}{n_y}}}\\ <\\ \\overline{X}\\ - \\overline{Y}\\ <\\ t_{\\frac{\\alpha}{2},{n_x} + {n_y}-2} {{S_p} \\sqrt{\\frac{1}{n_x} + \\frac{1}{n_y}}}$\n", + "\n", + "3. $|\\overline{X}-\\overline{Y}| < t_{\\frac{\\alpha}{2},{n_x} + {n_y}-2} {{S_p} \\sqrt{\\frac{1}{n_x} + \\frac{1}{n_y}}}$\n", + "\n", + "4. P_value $ = 2 \\times P(t_{\\frac{\\alpha}{2},{n_x} + {n_y}-2} \\geq |t_0|) > \\alpha$" + ], + "metadata": { + "id": "fE5B1uXoVyjf" + } + }, + { + "cell_type": "markdown", + "source": [ + "**B. One-sided tests:**\n", + "\n", + "**Right-tailed test:**\n", + "\n", + "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n_x$ from a normal distribution having an unknown mean $\\mu_x$ and a unknown variance $\\sigma^2_x$. 
\n", + "\n", + "$Y_1, Y_2, ..., Y_n$ is a sample of size $n_y$ from a normal distribution having an unknown mean $\\mu_y$ and a unknown variance $\\sigma^2_y$.\n", + "\n", + "$X_1, X_2, ..., X_{n_x} \\sim N( \\mu_x, \\sigma^2_x)$\n", + "\n", + "$Y_1, Y_2, ..., Y_{n_y} \\sim N( \\mu_y, \\sigma^2_y)$\n", + "\n", + "$H_0:\\ \\mu_x = \\mu_y\\ \\quad or \\quad H_0:\\ \\mu_x \\leq \\mu_y$\n", + "\n", + "$H_1:\\ \\mu_x > \\mu_y$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$t_0 \\equiv \\frac{\\overline{X}-\\ \\overline{Y}\\ -\\ 0}{{S_p} \\sqrt{\\frac{1}{n_x} + \\frac{1}{n_y}}} \\sim t_{{n_x}+{n_y}-2}$\n", + "\n", + "$S_p^2 = \\frac{(n_x-1)S_x^2 + (n_y-1)S_y^2}{n_x+n_y-2}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $ -\\infty <\\ t_0 = \\frac{\\overline{X}-\\ \\overline{Y}\\ -\\ 0}{{S_p} \\sqrt{\\frac{1}{n_x} + \\frac{1}{n_y}}}\\ <\\ t_{{\\alpha},{n_x} + {n_y}-2} $\n", + "\n", + "2. $- \\infty \\ <\\ \\overline{X}\\ - \\overline{Y}\\ <\\ t_{{\\alpha},{n_x} + {n_y}-2} {{S_p} \\sqrt{\\frac{1}{n_x} + \\frac{1}{n_y}}}$\n", + "\n", + "3. P_value $ = P(t_{{n_x} + {n_y}-2} \\geq t_0) > \\alpha$" + ], + "metadata": { + "id": "h4g05pIyafSJ" + } + }, + { + "cell_type": "markdown", + "source": [ + "**Left-tailed test:**\n", + "\n", + "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n_x$ from a normal distribution having an unknown mean $\\mu_x$ and a known variance $\\sigma^2_x$. 
\n", + "\n", + "$Y_1, Y_2, ..., Y_n$ is a sample of size $n_y$ from a normal distribution having an unknown mean $\\mu_y$ and a known variance $\\sigma^2_y$.\n", + "\n", + "$X_1, X_2, ..., X_{n_x} \\sim N( \\mu_x, \\sigma^2_x)$\n", + "\n", + "$Y_1, Y_2, ..., Y_{n_y} \\sim N( \\mu_y, \\sigma^2_y)$\n", + "\n", + "$H_0:\\ \\mu_x = \\mu_y \\quad or \\quad H_0:\\ \\mu_x \\geq \\mu_y$\n", + "\n", + "$H_1:\\ \\mu_x < \\mu_y$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$t_0 \\equiv \\frac{\\overline{X}-\\ \\overline{Y}\\ -\\ 0}{{S_p} \\sqrt{\\frac{1}{n_x} + \\frac{1}{n_y}}} \\sim t_{{n_x}+{n_y}-2}$\n", + "\n", + "$S_p^2 = \\frac{(n_x-1)S_x^2 + (n_y-1)S_y^2}{n_x+n_y-2}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $-\\ t_{{\\alpha},{n_x} + {n_y}-2} <\\ t_0 = \\frac{\\overline{X}-\\ \\overline{Y}\\ -\\ 0}{{S_p} \\sqrt{\\frac{1}{n_x} + \\frac{1}{n_y}}} <\\ \\infty $\n", + "\n", + "2. $\\ -t_{{\\alpha},{n_x} + {n_y}-2} {{S_p} \\sqrt{\\frac{1}{n_x} + \\frac{1}{n_y}}}\\ <\\ \\overline{X} - \\overline{Y}\\ <\\ \\infty$\n", + "\n", + "3. 
class equal_means_with_unknown_variances:
    """
    Pooled-variance t-test on the difference between two sample means.

    The test is executed when the object is constructed: the test statistic,
    the p-value and the decision at level `alpha` are printed, nothing is
    returned.

    Parameters
    ----------
    diff_mean_null : difference between the two population means under the null hypothesis
    alpha : significance level
    type_t : 'two_tailed', 'right_tailed', 'left_tailed'
             (any other value is evaluated like 'right_tailed')
    n1, n2 : optional, number of members of sample1 / sample2
    S1, S2 : optional, standard deviation of sample1 / sample2
    Sample_mean1, Sample_mean2 : optional, mean of sample1 / sample2
    data1 : optional, first sample; when given, Sample_mean1, S1 and n1 are computed from it
    data2 : optional, second sample; when given, Sample_mean2, S2 and n2 are computed from it
    """
    def __init__(self, diff_mean_null, alpha, type_t, n1 = 0., n2 = 0., S1 = 0., S2 = 0.,
                 Sample_mean1 = 0., Sample_mean2 = 0., data1=None, data2=None):
        # Hypothesis settings.
        self.diff_mean_null = diff_mean_null
        self.alpha = alpha
        self.type_t = type_t

        # Summary statistics exactly as supplied; overwritten below when raw data is passed.
        self.Sample_mean1, self.Sample_mean2 = Sample_mean1, Sample_mean2
        self.S1, self.S2 = S1, S2
        self.n1, self.n2 = n1, n2
        self.data1, self.data2 = data1, data2
        self.SP2 = None

        if data1 is not None:
            self.n1 = len(list(data1))
            self.Sample_mean1 = np.mean(list(data1))
            self.S1 = np.std(list(data1), ddof=1)  # ddof=1 -> sample standard deviation

        if data2 is not None:
            self.n2 = len(list(data2))
            self.Sample_mean2 = np.mean(list(data2))
            self.S2 = np.std(data2, ddof=1)

        # Pooled variance: each sample variance weighted by (n - 1).
        weighted_squares = (self.n1-1)*(self.S1**2) + (self.n2-1)*(self.S2**2)
        self.SP2 = weighted_squares / (self.n1+self.n2-2)

        self.__test()

    def __test(self):
        """Print the t statistic, its p-value (n1+n2-2 degrees of freedom) and the decision."""
        dof = self.n1+self.n2-2
        mean_gap = self.Sample_mean1-self.Sample_mean2-self.diff_mean_null
        standard_error = np.sqrt(self.SP2)*np.sqrt(1/self.n1+1/self.n2)
        test_statistic = mean_gap / standard_error
        print('test_statistic:', test_statistic, '\n')

        if self.type_t == 'two_tailed':
            p_value = 2*(1-t.cdf(abs(test_statistic), df = dof))
        elif self.type_t == 'left_tailed':
            p_value = (t.cdf(test_statistic, df = dof))
        else:
            # Right-tailed test; also reached for any unrecognised type_t.
            p_value = 1-t.cdf(test_statistic, df = dof)
        print('P_value:', p_value, '\n')

        if p_value < self.alpha:
            verdict = f'Since p_value < {self.alpha}, reject null hypothesis.'
        else:
            verdict = f'Since p_value > {self.alpha}, the null hypothesis cannot be rejected.'
        print(verdict)
"execute_result", + "data": { + "text/plain": [ + "(6.45, 7.125)" + ] + }, + "metadata": {}, + "execution_count": 15 + } + ] + }, + { + "cell_type": "code", + "source": [ + "equal_means_with_unknown_variances(diff_mean_null = 0, alpha = 0.05, type_t = 'left_tailed', data1 = data1, data2 = data2);" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "46MLRnZBwKt4", + "outputId": "44eafe71-ea78-4856-b6b8-adc3b63ecaa6" + }, + "execution_count": 16, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "test_statistic: -1.8986953664603443 \n", + "\n", + "P_value: 0.03606431976959166 \n", + "\n", + "Since p_value < 0.05, reject null hypothesis.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "You can also do this test using scipy.\n", + "\n", + "`scipy.stats.ttest_ind` calculate the T-test for the means of two independent samples.\n", + "\n", + "[Doc](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_ind.html)" + ], + "metadata": { + "id": "Er9iBrjkOvUE" + } + }, + { + "cell_type": "code", + "source": [ + "alpha = 0.05\n", + "Test_statistic, p_value = ttest_ind(data1, data2, alternative='less')\n", + "\n", + "print(f'Test_statistic = {Test_statistic}, p_value = {p_value}', '\\n')\n", + "\n", + "if p_value < alpha:\n", + " print(f'Since p_value < {alpha}, reject null hypothesis.')\n", + "else:\n", + " print(f'Since p_value > {alpha}, the null hypothesis cannot be rejected.')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "a5AQocXrN7ix", + "outputId": "7109f06e-b519-43cd-eb1a-02e574d67db3" + }, + "execution_count": 17, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test_statistic = -1.8986953664603443, p_value = 0.03606431976959166 \n", + "\n", + "Since p_value < 0.05, reject null hypothesis.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "## **4.4. 
Paired t-test:**" + ], + "metadata": { + "id": "Br8LlPdh4e9R" + } + }, + { + "cell_type": "markdown", + "source": [ + "**A. Two Tailed Test:**\n", + "\n", + "The data can be described by the $n$ pairs $(X_i, Y_i), i = 1, 2, ..., n$, where $X_i$ is the data before an action, and $Y_i$ is the respective data points after an action.\n", + "\n", + "It is important to note that we cannot treat $X_1, ... , X_n$ and $Y_1, ... , Y_n$ as being independent samples.\n", + "\n", + "$D_i = X_i − Y_i, \\quad i = 1, ... , n$\n", + "\n", + "$H_0 : \\mu_D = 0$\n", + "\n", + "$H_1 : \\mu_D \\neq 0$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$t_0 \\equiv \\frac{\\overline{D}\\ -\\ 0}{\\frac{S_D}{\\sqrt{n}}} \\sim t_{n-1}$\n", + "\n", + "$S_D = \\sqrt{\\frac{\\sum_{i=1}^n\\ (D_i\\ -\\ \\overline{D})^2}{n\\ -\\ 1}} \\quad \\quad \\overline{D} = \\frac{\\sum_{i=1}^n\\ D_i}{n}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $ -\\ t_{\\frac{{\\alpha}}{2},{n-1}} <\\ t_0 = \\frac{\\overline{D}\\ -\\ 0}{\\frac{S_D}{\\sqrt{n}}}\\ <\\ t_{\\frac{{\\alpha}}{2},{n-1}} $\n", + "\n", + "2. $- t_{\\frac{{\\alpha}}{2},{n-1}} \\frac{S_D}{\\sqrt{n}} \\ <\\ \\overline{D}\\ <\\ t_{\\frac{{\\alpha}}{2},{n-1}} \\frac{S_D}{\\sqrt{n}}$\n", + "\n", + "3. $| \\overline{D} |\\ <\\ t_{\\frac{{\\alpha}}{2},{n-1}} \\frac{S_D}{\\sqrt{n}}$\n", + "\n", + "4. P_value $ = 2 \\times P(t_{n-1} \\geq |t_0|) > \\alpha$" + ], + "metadata": { + "id": "1mOZr_hk5i7m" + } + }, + { + "cell_type": "markdown", + "source": [ + "**B. One-sided tests:**\n", + "\n", + "**Right-tailed test:**\n", + "\n", + "The data can be described by the $n$ pairs $(X_i, Y_i), i = 1, 2, ..., n$, where $X_i$ is the data before an action, and $Y_i$ is the respective data points after an action.\n", + "\n", + "It is important to note that we cannot treat $X_1, ... , X_n$ and $Y_1, ... 
, Y_n$ as being independent samples.\n", + "\n", + "$D_i = X_i − Y_i, \\quad i = 1, ... , n$\n", + "\n", + "$H_0:\\ \\mu_D = 0\\ \\quad or \\quad H_0:\\ \\mu_D \\leq 0$\n", + "\n", + "$H_1 : \\mu_D > 0$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$t_0 \\equiv \\frac{\\overline{D}\\ -\\ 0}{\\frac{S_D}{\\sqrt{n}}} \\sim t_{n-1}$\n", + "\n", + "$S_D = \\sqrt{\\frac{\\sum_{i=1}^n\\ (D_i\\ -\\ \\overline{D})^2}{n\\ -\\ 1}} \\quad \\quad \\overline{D} = \\frac{\\sum_{i=1}^n\\ D_i}{n}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $ -\\infty <\\ t_0 = \\frac{\\overline{D}\\ -\\ 0}{\\frac{S_D}{\\sqrt{n}}}\\ <\\ t_{{\\alpha},{n-1}} $\n", + "\n", + "2. $- \\infty \\ <\\ \\overline{D}\\ <\\ t_{{\\alpha},{n-1}} {\\frac{S_D}{\\sqrt{n}}}$\n", + "\n", + "3. P_value $ = P(t_{n-1} \\geq t_0) > \\alpha$" + ], + "metadata": { + "id": "Xmfb3znaIMuQ" + } + }, + { + "cell_type": "markdown", + "source": [ + "**Left-tailed test:**\n", + "\n", + "The data can be described by the $n$ pairs $(X_i, Y_i), i = 1, 2, ..., n$, where $X_i$ is the data before an action, and $Y_i$ is the respective data points after an action.\n", + "\n", + "It is important to note that we cannot treat $X_1, ... , X_n$ and $Y_1, ... , Y_n$ as being independent samples.\n", + "\n", + "$D_i = X_i − Y_i, \\quad i = 1, ... , n$\n", + "\n", + "$H_0:\\ \\mu_D = 0\\ \\quad or \\quad H_0:\\ \\mu_D \\geq 0$\n", + "\n", + "$H_1 : \\mu_D < 0$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$t_0 \\equiv \\frac{\\overline{D}\\ -\\ 0}{\\frac{S_D}{\\sqrt{n}}} \\sim t_{n-1}$\n", + "\n", + "$S_D = \\sqrt{\\frac{\\sum_{i=1}^n\\ (D_i\\ -\\ \\overline{D})^2}{n\\ -\\ 1}} \\quad \\quad \\overline{D} = \\frac{\\sum_{i=1}^n\\ D_i}{n}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. 
class paired_test:
    """
    Paired t-test on the differences D = data1 - data2.

    The test is executed when the object is constructed: the test statistic,
    the p-value and the decision at level `alpha` are printed, nothing is
    returned.

    Parameters
    ----------
    null_mean : mean of the differences under the null hypothesis
    alpha : significance level
    type_t : 'two_tailed', 'right_tailed', 'left_tailed'
             (any other value is evaluated like 'right_tailed')
    n : optional, number of pairs
    SD : optional, standard deviation of the differences
    D_bar : optional, mean of the differences
    data1 : optional, first measurements; when given, D_bar, SD and n are
            computed from data1 and data2
    data2 : optional, second measurements, used together with data1
    """
    def __init__(self, null_mean, alpha, type_t, n = 0., SD = 0., D_bar = 0., data1=None, data2=None):
        # Hypothesis settings.
        self.null_mean = null_mean
        self.alpha = alpha
        self.type_t = type_t

        # Summary statistics exactly as supplied; overwritten below when raw data is passed.
        self.D_bar, self.SD, self.n = D_bar, SD, n
        self.data1, self.data2 = data1, data2

        if data1 is not None:
            differences = np.subtract(data1, data2)
            self.D_bar = np.mean(list(differences))
            self.SD = np.std(list(differences), ddof=1)  # ddof=1 -> sample standard deviation
            self.n = len(list(data1))

        self.__test()

    def __test(self):
        """Print the t statistic, its p-value (n-1 degrees of freedom) and the decision."""
        dof = self.n-1
        standard_error = self.SD / np.sqrt(self.n)
        test_statistic = (self.D_bar - self.null_mean) / standard_error
        print('test_statistic:', test_statistic, '\n')

        if self.type_t == 'two_tailed':
            p_value = 2*(1-t.cdf(abs(test_statistic), df=dof))
        elif self.type_t == 'left_tailed':
            p_value = (t.cdf(test_statistic, df=dof))
        else:
            # Right-tailed test; also reached for any unrecognised type_t.
            p_value = (1-t.cdf(test_statistic, df=dof))
        print('P_value:', p_value, '\n')

        if p_value < self.alpha:
            verdict = f'Since p_value < {self.alpha}, reject null hypothesis.'
        else:
            verdict = f'Since p_value > {self.alpha}, the null hypothesis cannot be rejected.'
        print(verdict)
"cell_type": "code", + "source": [ + "alpha = 0.05\n", + "Test_statistic, p_value = ttest_rel(data1, data2, alternative='greater')\n", + "\n", + "print(f'Test_statistic = {Test_statistic}, p_value = {p_value}', '\\n')\n", + "\n", + "if p_value < alpha:\n", + " print(f'Since p_value < {alpha}, reject null hypothesis.')\n", + "else:\n", + " print(f'Since p_value > {alpha}, the null hypothesis cannot be rejected.')" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "P8Xf8chpP0Xx", + "outputId": "30801515-d83e-4421-db52-667f4a013213" + }, + "execution_count": 21, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Test_statistic = 2.2659493332258522, p_value = 0.024845515301993935 \n", + "\n", + "Since p_value < 0.05, reject null hypothesis.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "## **4.5. Test Concerning the Variance of a Normal Population:**" + ], + "metadata": { + "id": "ZJbTadX1NnaL" + } + }, + { + "cell_type": "markdown", + "source": [ + "**A. Two Tailed Test:**\n", + "\n", + "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ from a normal distribution having an unknown mean $\\mu$ and a known variance $\\sigma^2$.\n", + "\n", + "$X_1, X_2, ..., X_n \\sim N( \\mu, \\sigma^2)$\n", + "\n", + "$H_0:\\ \\sigma^2 = \\sigma_0^2$\n", + "\n", + "$H_1:\\ \\sigma^2 \\neq \\sigma_0^2$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$ \\chi^2_0 \\equiv \\frac{(n-1)\\ S^2}{\\sigma_0^2}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $\\ \\chi^2_{1-\\frac{\\alpha}{2}, n-1}\\ <\\ \\chi^2_0 = \\frac{(n-1)\\ S^2}{\\sigma_0^2} \\ <\\chi^2_{\\frac{\\alpha}{2}, n-1} $\n", + "\n", + "2. $\\sigma_0^2\\ \\frac{\\chi^2_{1-\\frac{\\alpha}{2}, n-1}}{n-1} <\\ S^2 <\\ \\sigma_0^2\\ \\frac{\\chi^2_{\\frac{\\alpha}{2}, n-1}}{n-1}$\n", + "\n", + "3. 
P_value $ = 2 \\times Min\\ (P(\\chi^2_{n-1} < \\chi^2_0),\\ 1-P(\\chi^2_{n-1} < \\chi^2_0)) > \\alpha$" + ], + "metadata": { + "id": "Uk4LjdzUWEG2" + } + }, + { + "cell_type": "markdown", + "source": [ + "**B. One-sided tests:**\n", + "\n", + "**Right-tailed test:**\n", + "\n", + "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ from a normal distribution having an unknown mean $\\mu$ and an unknown variance $\\sigma^2$.\n", + "\n", + "$X_1, X_2, ..., X_n \\sim N( \\mu, \\sigma^2)$\n", + "\n", + "$H_0:\\ \\sigma^2 = \\sigma_0^2\\ \\quad or \\quad H_0:\\ \\sigma^2 \\leq \\sigma_0^2$\n", + "\n", + "$H_1:\\ \\sigma^2 > \\sigma_0^2$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$ \\chi^2_0 \\equiv \\frac{(n-1)\\ S^2}{\\sigma_0^2}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $ 0 <\\ \\chi^2_0 = \\frac{(n-1)\\ S^2}{\\sigma_0^2}\\ <\\ \\chi^2_{{\\alpha}, n-1}$\n", + "\n", + "2. $0 \\ <\\ S^2\\ <\\ \\frac{\\chi^2_{{\\alpha}, n-1}\\ \\sigma_0^2}{n-1} $\n", + "\n", + "3. P_value $ = P(\\chi^2_{n-1} \\geq \\chi^2_0) > \\alpha$" + ], + "metadata": { + "id": "q4IQUnEmbd-A" + } + }, + { + "cell_type": "markdown", + "source": [ + "**B. One-sided tests:**\n", + "\n", + "**Left-tailed test:**\n", + "\n", + "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ from a normal distribution having an unknown mean $\\mu$ and an unknown variance $\\sigma^2$.\n", + "\n", + "$X_1, X_2, ..., X_n \\sim N( \\mu, \\sigma^2)$\n", + "\n", + "$H_0:\\ \\sigma^2 = \\sigma_0^2\\ \\quad or \\quad H_0:\\ \\sigma^2 \\geq \\sigma_0^2$\n", + "\n", + "$H_1:\\ \\sigma^2 < \\sigma_0^2$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$ \\chi^2_0 \\equiv \\frac{(n-1)\\ S^2}{\\sigma_0^2}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. 
class variance_normal:
    """
    Chi-square test concerning the variance of a normal population.

    The test is executed when the object is constructed: the test statistic,
    the p-value and the decision at level `alpha` are printed, nothing is
    returned.

    Parameters
    ----------
    null_v : variance under the null hypothesis (v0)
    alpha : significance level
    type_t : 'two_tailed', 'right_tailed', 'left_tailed'
             (any other value is evaluated like 'right_tailed')
    n : optional, number of sample members
    S2 : optional, sample variance
    data : optional, the sample itself; when given, S2 and n are computed from it
    """
    def __init__(self, null_v, alpha, type_t, n = 0., S2 = 0., data=None):
        # Hypothesis settings.
        self.null_v = null_v
        self.alpha = alpha
        self.type_t = type_t

        # Summary statistics exactly as supplied; overwritten below when raw data is passed.
        self.S2, self.n = S2, n
        self.data = data

        if data is not None:
            self.S2 = np.std(list(data), ddof=1)**2  # ddof=1 -> sample variance
            self.n = len(list(data))

        self.__test()

    def __test(self):
        """Print the chi-square statistic, its p-value (n-1 degrees of freedom) and the decision."""
        dof = self.n-1
        test_statistic = ((self.n-1)*(self.S2)) / self.null_v
        print('test_statistic:', test_statistic, '\n')

        lower_tail = chi2.cdf(test_statistic, df=dof)
        if self.type_t == 'two_tailed':
            upper_tail = 1-lower_tail
            # Twice the smaller tail, because the chi-square distribution is not symmetric.
            p_value = 2*np.min([upper_tail, lower_tail])
        elif self.type_t == 'left_tailed':
            p_value = lower_tail
        else:
            # Right-tailed test; also reached for any unrecognised type_t.
            p_value = (1-lower_tail)
        print('P_value:', p_value, '\n')

        if p_value < self.alpha:
            verdict = f'Since p_value < {self.alpha}, reject null hypothesis.'
        else:
            verdict = f'Since p_value > {self.alpha}, the null hypothesis cannot be rejected.'
        print(verdict)
\n", + "\n", + "$Y_1, Y_2, ..., Y_n$ is a sample of size $n_y$ from a normal distribution having an unknown mean $\\mu_y$ and a known variance $\\sigma^2_y$.\n", + "\n", + "$X_1, X_2, ..., X_n \\sim N( \\mu_x, \\sigma^2_x)$\n", + "\n", + "$Y_1, Y_2, ..., Y_n \\sim N( \\mu_y, \\sigma^2_y)$\n", + "\n", + "$H_0:\\ \\sigma_x^2 / \\sigma_y^2 = 1$\n", + "\n", + "$H_1:\\ \\sigma_x^2 / \\sigma_y^2 \\neq 1$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$ F_0 \\equiv \\frac{S^2_x}{S^2_y}$\n", + "\n", + "$S^2_x = \\frac{\\sum_{i=1}^{n_x}\\ (x_i\\ -\\ \\overline{x})^2}{{n_x}-1}$\n", + "\n", + "$S^2_y = \\frac{\\sum_{i=1}^{n_y}\\ (y_i\\ -\\ \\overline{y})^2}{{n_y}-1}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $F_{{1-\\frac{\\alpha}{2}}, n_x-1, n_y-1}\\ <\\ F_0 = \\frac{S^2_x}{S^2_y}\\ < F_{{\\frac{\\alpha}{2}}, n_x-1, n_y-1}$\n", + "\n", + "2. P_value $ = 2 \\times Min\\ ([P(F_{n_x-1,n_y-1})< F_0],\\ [1-P(F_{n_x-1,n_y-1}< F_0]) > \\alpha$" + ], + "metadata": { + "id": "pdlDGLQg6eYA" + } + }, + { + "cell_type": "markdown", + "source": [ + "**B. One-sided tests:**\n", + "\n", + "**Right-tailed test:**\n", + "\n", + "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n_x$ from a normal distribution having an unknown mean $\\mu_x$ and a known variance $\\sigma^2_x$. 
\n", + "\n", + "$Y_1, Y_2, ..., Y_n$ is a sample of size $n_y$ from a normal distribution having an unknown mean $\\mu_y$ and a known variance $\\sigma^2_y$.\n", + "\n", + "$X_1, X_2, ..., X_n \\sim N( \\mu_x, \\sigma^2_x)$\n", + "\n", + "$Y_1, Y_2, ..., Y_n \\sim N( \\mu_y, \\sigma^2_y)$\n", + "\n", + "$H_0:\\ \\sigma_x^2 / \\sigma_y^2 = 1 \\quad or \\quad \\sigma_x^2 / \\sigma_y^2 \\leq 1$\n", + "\n", + "$H_1:\\ \\sigma_x^2 / \\sigma_y^2 > 1$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$ F_0 \\equiv \\frac{S^2_x}{S^2_y}$\n", + "\n", + "$S^2_x = \\frac{\\sum_{i=1}^{n_x}\\ (x_i\\ -\\ \\overline{x})^2}{{n_x}-1}$\n", + "\n", + "$S^2_y = \\frac{\\sum_{i=1}^{n_y}\\ (y_i\\ -\\ \\overline{y})^2}{{n_y}-1}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $0\\ <\\ F_0 = \\frac{S^2_x}{S^2_y}\\ < F_{{\\alpha}, n_x-1, n_y-1}$\n", + "\n", + "2. P_value $ = P(F_{n_x-1,n_y-1} \\geq F_0) > \\alpha$" + ], + "metadata": { + "id": "wIkaIsU9-gBg" + } + }, + { + "cell_type": "markdown", + "source": [ + "**Left-tailed test:**\n", + "\n", + "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n_x$ from a normal distribution having an unknown mean $\\mu_x$ and a known variance $\\sigma^2_x$. 
\n", + "\n", + "$Y_1, Y_2, ..., Y_n$ is a sample of size $n_y$ from a normal distribution having an unknown mean $\\mu_y$ and a known variance $\\sigma^2_y$.\n", + "\n", + "$X_1, X_2, ..., X_n \\sim N( \\mu_x, \\sigma^2_x)$\n", + "\n", + "$Y_1, Y_2, ..., Y_n \\sim N( \\mu_y, \\sigma^2_y)$\n", + "\n", + "$H_0:\\ \\sigma_x^2 / \\sigma_y^2 = 1 \\quad or \\quad \\sigma_x^2 / \\sigma_y^2 \\geq 1$\n", + "\n", + "$H_1:\\ \\sigma_x^2 / \\sigma_y^2 < 1$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$ F_0 \\equiv \\frac{S^2_x}{S^2_y}$\n", + "\n", + "$S^2_x = \\frac{\\sum_{i=1}^{n_x}\\ (x_i\\ -\\ \\overline{x})^2}{{n_x}-1}$\n", + "\n", + "$S^2_y = \\frac{\\sum_{i=1}^{n_y}\\ (y_i\\ -\\ \\overline{y})^2}{{n_x}-1}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $F_{{1-\\alpha}, n_x-1, n_y-1}\\ <\\ F_0 = \\frac{S^2_x}{S^2_y}\\ < \\infty$\n", + "\n", + "2. P_value $ = P(F_{n_x-1,n_y-1} \\leq F_0) > \\alpha$" + ], + "metadata": { + "id": "vCD8AO5_BDBR" + } + }, + { + "cell_type": "code", + "source": [ + "class equal_variances:\n", + " \"\"\"\n", + " Parameters\n", + " ----------\n", + " null_v : v0\n", + " n1 : optional, number of data1 members\n", + " n2 : optional, number of data2 members\n", + " S2X : sample1 variance\n", + " S2Y : sample2 variance\n", + " alpha : significance level\n", + " type_t : 'two_tailed', 'right_tailed', 'left_tailed'\n", + " data1 : optional, if you do not know the S2X, just pass the data\n", + " data2 : optional, if you do not know the S2Y, just pass the data\n", + " \"\"\"\n", + " def __init__(self, null_v, alpha, type_t, n1 = 0., n2 = 0., S2X = 0., S2Y = 0., data1=None, data2=None):\n", + " self.S2X = S2X\n", + " self.S2Y = S2Y\n", + " self.null_v = null_v\n", + " self.type_t = type_t\n", + " self.n1 = n1\n", + " self.n2 = n2\n", + " self.alpha = alpha\n", + " self.data1 = data1\n", + " self.data2 = data2\n", + " if data1 is not None:\n", + " 
self.SX2 = np.std(list(data1), ddof=1)**2\n", + " self.n1 = len(list(data1))\n", + " if data2 is not None:\n", + " self.S2Y = np.std(list(data2), ddof=1)**2\n", + " self.n2 = len(list(data2))\n", + "\n", + " equal_variances.__test(self)\n", + " \n", + " def __test(self):\n", + " test_statistic = self.S2X / self.S2Y\n", + " print('test_statistic:', test_statistic, '\\n')\n", + " \n", + " if self.type_t == 'two_tailed':\n", + " a = 1-stats.f.cdf(test_statistic, self.n1-1, self.n2-1)\n", + " b = stats.f.cdf(test_statistic, self.n1-1, self.n2-1)\n", + " p_value = 2*np.min([a, b])\n", + " print('P_value:', p_value, '\\n')\n", + " elif self.type_t == 'left_tailed':\n", + " p_value = (stats.f.cdf(test_statistic, self.n1-1, self.n2-2))\n", + " print('P_value:', p_value, '\\n')\n", + " else:\n", + " p_value = (1-stats.f.cdf(test_statistic, self.n1-1, self.n2-2))\n", + " print('P_value:', p_value, '\\n')\n", + "\n", + " if p_value < self.alpha:\n", + " print(f'Since p_value < {self.alpha}, reject null hypothesis')\n", + " else:\n", + " print(f'Since p_value > {self.alpha}, the null hypothesis cannot be rejected.')" + ], + "metadata": { + "id": "TPjrukBQlDVd" + }, + "execution_count": 25, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "equal_variances(null_v = 1, n1 = 10, n2 = 12, alpha = 0.05, type_t = 'two_tailed', S2X = .14, S2Y = .28);" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "KQgyQ4LzDHKp", + "outputId": "6bafa54d-3128-4202-87ae-3951a6988d04" + }, + "execution_count": 26, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "test_statistic: 0.5 \n", + "\n", + "P_value: 0.3075191907594375 \n", + "\n", + "Since p_value > 0.05, the null hypothesis cannot be rejected.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "## **4.7. 
Test Concerning P in Bernoulli Populations:**" + ], + "metadata": { + "id": "I2zKIWZWFf4V" + } + }, + { + "cell_type": "markdown", + "source": [ + "**A. Two Tailed Test:**\n", + "\n", + "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ (large) from a bernoulli distribution having an unknown parameter $P$. \n", + "\n", + "$X_1, X_2, ..., X_n \\sim Ber(P)$\n", + "\n", + "$H_0 : P = P_0$\n", + "\n", + "$H_1 : P \\neq P_0$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$ Z_0 \\equiv \\frac{\\overline{X}\\ -\\ P_0}{\\sqrt{\\frac{P_0(1-P_0)}{n}}}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $-\\ Z_{\\frac{\\alpha}{2}}\\ <\\ Z_0 = \\frac{\\overline{X}\\ -\\ P_0}{\\sqrt{\\frac{P_0(1-P_0)}{n}}}\\ < Z_{\\frac{\\alpha}{2}}$\n", + "\n", + "2. $P_0\\ -\\ Z_{\\frac{\\alpha}{2}} \\sqrt{\\frac{P_0(1-P_0)}{n}} \\ <\\ \\overline{X} \\ < P_0\\ +\\ Z_{\\frac{\\alpha}{2}} \\sqrt{\\frac{P_0(1-P_0)}{n}}$\n", + "\n", + "3. P_value $ = 2 \\times P(Z \\geq |Z_0|) > \\alpha$" + ], + "metadata": { + "id": "0ba1PtjFHYji" + } + }, + { + "cell_type": "markdown", + "source": [ + "**B. One-sided tests:**\n", + "\n", + "**Right-tailed test:**\n", + "\n", + "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ (large) from a bernoulli distribution having an unknown parameter $P$. \n", + "\n", + "$X_1, X_2, ..., X_n \\sim Ber(P)$\n", + "\n", + "$H_0 : P = P_0 \\quad or \\quad P \\leq P_0$\n", + "\n", + "$H_1 : P > P_0$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$ Z_0 \\equiv \\frac{\\overline{X}\\ -\\ P_0}{\\sqrt{\\frac{P_0(1-P_0)}{n}}}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $-\\ \\infty <\\ Z_0 = \\frac{\\overline{X}\\ -\\ P_0}{\\sqrt{\\frac{P_0(1-P_0)}{n}}}\\ < Z_{\\alpha}$\n", + "\n", + "2. 
$-\\ \\infty \\ <\\ \\overline{X} \\ < P_0\\ +\\ Z_{\\alpha} \\sqrt{\\frac{P_0(1-P_0)}{n}}$\n", + "\n", + "3. P_value $ = P(Z \\geq Z_0) > \\alpha$" + ], + "metadata": { + "id": "D3iUqEh4K65A" + } + }, + { + "cell_type": "markdown", + "source": [ + "**Left-tailed test:**\n", + "\n", + "Suppose that $X_1, X_2, ..., X_n$ is a sample of size $n$ (large) from a bernoulli distribution having an unknown parameter $P$. \n", + "\n", + "$X_1, X_2, ..., X_n \\sim Ber(P)$\n", + "\n", + "$H_0 : P = P_0 \\quad or \\quad P \\geq P_0$\n", + "\n", + "$H_1 : P < P_0$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$ Z_0 \\equiv \\frac{\\overline{X}\\ -\\ P_0}{\\sqrt{\\frac{P_0(1-P_0)}{n}}}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $-\\ Z_{\\alpha} <\\ Z_0 = \\frac{\\overline{X}\\ -\\ P_0}{\\sqrt{\\frac{P_0(1-P_0)}{n}}}\\ < \\infty$\n", + "\n", + "2. $\\ P_0\\ -\\ Z_{\\alpha} \\sqrt{\\frac{P_0(1-P_0)}{n}} \\ <\\ \\overline{X} \\ < \\infty$\n", + "\n", + "3. 
class p_bernoulli:
    """
    Large-sample z-test concerning the parameter P of a Bernoulli population.

    The test statistic is
    Z0 = (Sample_mean - null_p) / sqrt(null_p * (1 - null_p) / n),
    compared against the standard normal distribution. The statistic, the
    p-value and the decision are printed when the object is created; the two
    numbers are also kept on the instance as ``test_statistic`` and ``p_value``.

    Parameters
    ----------
    null_p : P0, the value of P under the null hypothesis (strictly between 0 and 1)
    n : optional, number of sample members (required when data is not given)
    alpha : significance level
    type_t : 'two_tailed', 'right_tailed', 'left_tailed'
    Sample_mean : optional, mean of the sample (the sample proportion)
    data : optional, if you do not know the sample mean, just pass the data

    Raises
    ------
    ValueError : if type_t is not one of the three supported values, if
                 null_p is not strictly between 0 and 1, or if the sample
                 size is not positive.
    """
    _TAILS = ('two_tailed', 'right_tailed', 'left_tailed')

    def __init__(self, null_p, alpha, type_t, n = 0., Sample_mean = 0., data=None):
        # An unknown tail name used to fall through to the right-tailed branch silently.
        if type_t not in self._TAILS:
            raise ValueError(f'type_t must be one of {self._TAILS}, got {type_t!r}')

        self.Sample_mean = Sample_mean
        self.null_p = null_p
        self.type_t = type_t
        self.n = n
        self.alpha = alpha
        self.data = data
        # Raw data, when given, overrides the summary statistics passed in.
        if data is not None:
            # Materialize once so one-shot iterators are not exhausted before len().
            sample = list(data)
            self.n = len(sample)
            if self.n > 0:
                self.Sample_mean = np.mean(sample)

        self.__test()

    def __test(self):
        if not 0 < self.null_p < 1:
            raise ValueError('null_p must be strictly between 0 and 1')
        if not self.n > 0:
            raise ValueError('the sample size n must be positive')

        standard_error = np.sqrt(self.null_p * (1 - self.null_p) / self.n)
        test_statistic = (self.Sample_mean - self.null_p) / standard_error
        print('test_statistic:', test_statistic, '\n')

        # norm.sf is the upper tail (1 - cdf) computed without cancellation error.
        if self.type_t == 'two_tailed':
            p_value = 2 * norm.sf(abs(test_statistic))
        elif self.type_t == 'left_tailed':
            p_value = norm.cdf(test_statistic)
        else:
            p_value = norm.sf(test_statistic)
        print('P_value:', p_value, '\n')

        # Keep the results so callers can use them, not only read the printout.
        self.test_statistic = test_statistic
        self.p_value = p_value

        if p_value < self.alpha:
            print(f'Since p_value < {self.alpha}, reject null hypothesis.')
        else:
            print(f'Since p_value > {self.alpha}, the null hypothesis cannot be rejected.')
Sample_mean = 0.3);" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pBQw3rpNTRdE", + "outputId": "494610d4-8594-49de-9ff6-3d281ed69355" + }, + "execution_count": 28, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "test_statistic: 2.4999999999999996 \n", + "\n", + "P_value: 0.006209665325776159 \n", + "\n", + "Since p_value < 0.05, reject null hypothesis.\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "\n", + "\n", + "## **4.8. Test Concerning the Equality of P in Two Bernoulli Populations:**\n", + "\n" + ], + "metadata": { + "id": "ro9fDzmQU8IZ" + } + }, + { + "cell_type": "markdown", + "source": [ + "**A. Two Tailed Test:**\n", + "\n", + "Suppose that $X_1, X_2, ..., X_{n_x}$ is a sample of size $n_x$ (large) from a bernoulli distribution having an unknown parameter $P_x$ and $Y_1, Y_2, ..., Y_{n_y}$ is a sample of size $n_y$ (large) from a bernoulli distribution having an unknown parameter $P_y$.\n", + "\n", + "$X_1, X_2, ..., X_{n_x} \\sim Ber(P_x)$\n", + "\n", + "$Y_1, Y_2, ..., Y_{n_y} \\sim Ber(P_y)$\n", + "\n", + "$H_0 : P_x = P_y$\n", + "\n", + "$H_1 : P_x \\neq P_y$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics:\n", + "\n", + "$ Z_0 \\equiv \\frac{\\overline{X}\\ -\\ \\overline{Y}\\ -\\ 0}{\\sqrt{\\frac{\\widehat{p}(1- \\widehat{p})}{n_x} + \\frac{\\widehat{p}(1-\\widehat{p})}{n_y}}}$\n", + "\n", + "$\\widehat{p} = \\frac{n_x \\overline{X}\\ +\\ n_y \\overline{Y}}{n_x\\ +\\ n_y}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $-\\ Z_{\\frac{\\alpha}{2}}\\ <\\ Z_0 = \\frac{\\overline{X}\\ -\\ \\overline{Y}\\ -\\ 0}{\\sqrt{\\frac{\\widehat{p}(1- \\widehat{p})}{n_x} + \\frac{\\widehat{p}(1-\\widehat{p})}{n_y}}}\\ < Z_{\\frac{\\alpha}{2}}$\n", + "\n", + "2. 
$-\\ Z_{\\frac{\\alpha}{2}} \\sqrt{\\frac{\\widehat{p}(1- \\widehat{p})}{n_x} + \\frac{\\widehat{p}(1-\\widehat{p})}{n_y}}\\ <\\ \\overline{X}\\ -\\ \\overline{Y} < Z_{\\frac{\\alpha}{2}} \\sqrt{\\frac{\\widehat{p}(1- \\widehat{p})}{n_x} + \\frac{\\widehat{p}(1-\\widehat{p})}{n_y}}$\n", + "\n", + "3. P_value $ = 2 \\times P(Z \\geq |Z_0|) > \\alpha$" + ], + "metadata": { + "id": "vjQ7CTG8WJOq" + } + }, + { + "cell_type": "markdown", + "source": [ + "**B. One-sided tests:**\n", + "\n", + "**Right-tailed test:**\n", + "\n", + "Suppose that $X_1, X_2, ..., X_{n_x}$ is a sample of size $n_x$ (large) from a bernoulli distribution having an unknown parameter $P_x$ and $Y_1, Y_2, ..., Y_{n_y}$ is a sample of size $n_y$ (large) from a bernoulli distribution having an unknown parameter $P_y$.\n", + "\n", + "$X_1, X_2, ..., X_{n_x} \\sim Ber(P_x)$\n", + "\n", + "$Y_1, Y_2, ..., Y_{n_y} \\sim Ber(P_y)$\n", + "\n", + "$H_0 : P_x = P_y \\quad or \\quad P_x \\leq P_y$\n", + "\n", + "$H_1 : P_x > P_y$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics: \n", + "\n", + "$ Z_0 \\equiv \\frac{\\overline{X}\\ -\\ \\overline{Y}\\ -\\ 0}{\\sqrt{\\frac{\\widehat{p}(1- \\widehat{p})}{n_x} + \\frac{\\widehat{p}(1-\\widehat{p})}{n_y}}}$\n", + "\n", + "$\\widehat{p} = \\frac{n_x \\overline{X}\\ +\\ n_y \\overline{Y}}{n_x\\ +\\ n_y}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $-\\ \\infty\\ <\\ Z_0 = \\frac{\\overline{X}\\ -\\ \\overline{Y}\\ -\\ 0}{\\sqrt{\\frac{\\widehat{p}(1- \\widehat{p})}{n_x} + \\frac{\\widehat{p}(1-\\widehat{p})}{n_y}}}\\ < Z_{\\alpha}$\n", + "\n", + "2. $-\\ \\infty\\ <\\ \\overline{X}\\ -\\ \\overline{Y} < Z_{\\alpha} \\sqrt{\\frac{\\widehat{p}(1- \\widehat{p})}{n_x} + \\frac{\\widehat{p}(1-\\widehat{p})}{n_y}}$\n", + "\n", + "3. 
P_value $ = P(Z \\geq Z_0) > \\alpha$" + ], + "metadata": { + "id": "ETaJekx1ZkP6" + } + }, + { + "cell_type": "markdown", + "source": [ + "**Left-tailed test:**\n", + "\n", + "Suppose that $X_1, X_2, ..., X_{n_x}$ is a sample of size $n_x$ (large) from a bernoulli distribution having an unknown parameter $P_x$ and $Y_1, Y_2, ..., Y_{n_y}$ is a sample of size $n_y$ (large) from a bernoulli distribution having an unknown parameter $P_y$.\n", + "\n", + "$X_1, X_2, ..., X_{n_x} \\sim Ber(P_x)$\n", + "\n", + "$Y_1, Y_2, ..., Y_{n_y} \\sim Ber(P_y)$\n", + "\n", + "$H_0 : P_x = P_y \\quad or \\quad P_x \\geq P_y$\n", + "\n", + "$H_1 : P_x < P_y$\n", + "\n", + "$\\\\ $\n", + "\n", + "Testing statistics:\n", + "\n", + "$ Z_0 \\equiv \\frac{\\overline{X}\\ -\\ \\overline{Y}\\ -\\ 0}{\\sqrt{\\frac{\\widehat{p}(1- \\widehat{p})}{n_x} + \\frac{\\widehat{p}(1-\\widehat{p})}{n_y}}}$\n", + "\n", + "$\\widehat{p} = \\frac{n_x \\overline{X}\\ +\\ n_y \\overline{Y}}{n_x\\ +\\ n_y}$\n", + "\n", + "$\\\\ $\n", + "\n", + "Significance level = $\\alpha$\n", + "\n", + "We accept $H_0$ if:\n", + "\n", + "1. $-\\ Z_{\\alpha}\\ <\\ Z_0 = \\frac{\\overline{X}\\ -\\ \\overline{Y}\\ -\\ 0}{\\sqrt{\\frac{\\widehat{p}(1- \\widehat{p})}{n_x} + \\frac{\\widehat{p}(1-\\widehat{p})}{n_y}}}\\ < \\infty$\n", + "\n", + "2. $-\\ Z_{\\alpha} \\sqrt{\\frac{\\widehat{p}(1- \\widehat{p})}{n_x} + \\frac{\\widehat{p}(1-\\widehat{p})}{n_y}} <\\ \\overline{X}\\ -\\ \\overline{Y} < \\infty$\n", + "\n", + "3. 
class p_bernoulli_two_populations:
    """
    Large-sample z-test concerning the equality of P in two Bernoulli populations.

    The pooled estimate is
    p_hat = (n1 * Sample_mean1 + n2 * Sample_mean2) / (n1 + n2)
    and the test statistic is
    Z0 = (Sample_mean1 - Sample_mean2) / sqrt(p_hat*(1-p_hat)/n1 + p_hat*(1-p_hat)/n2),
    compared against the standard normal distribution. The statistic, the
    p-value and the decision are printed when the object is created; the two
    numbers are also kept on the instance as ``test_statistic`` and ``p_value``.

    Parameters
    ----------
    null_p : stored on the instance but not used in the computation
             (the statistic always tests a difference of 0)
    n1 : optional, number of data1 members (required when data1 is not given)
    n2 : optional, number of data2 members (required when data2 is not given)
    alpha : significance level
    type_t : 'two_tailed', 'right_tailed', 'left_tailed'
    Sample_mean1 : optional, mean of sample 1 (its sample proportion)
    Sample_mean2 : optional, mean of sample 2 (its sample proportion)
    data1 : optional, if you do not know Sample_mean1, just pass the data
    data2 : optional, if you do not know Sample_mean2, just pass the data

    Raises
    ------
    ValueError : if type_t is not one of the three supported values, if a
                 sample size is not positive, or if the pooled proportion is
                 0 or 1 (the statistic is then undefined).
    """
    _TAILS = ('two_tailed', 'right_tailed', 'left_tailed')

    def __init__(self, null_p, alpha, type_t, n1 = 0., n2 = 0., Sample_mean1 = 0., Sample_mean2 = 0., data1=None, data2=None):
        # An unknown tail name used to fall through to the right-tailed branch silently.
        if type_t not in self._TAILS:
            raise ValueError(f'type_t must be one of {self._TAILS}, got {type_t!r}')

        self.Sample_mean1 = Sample_mean1
        self.Sample_mean2 = Sample_mean2
        self.null_p = null_p
        self.type_t = type_t
        self.n1 = n1
        self.n2 = n2
        self.alpha = alpha
        self.data1 = data1
        self.data2 = data2
        # Raw data, when given, overrides the summary statistics passed in.
        # Each sample is materialized once so one-shot iterators are not
        # exhausted before len() is taken.
        if data1 is not None:
            sample1 = list(data1)
            self.n1 = len(sample1)
            if self.n1 > 0:
                self.Sample_mean1 = np.mean(sample1)
        if data2 is not None:
            sample2 = list(data2)
            self.n2 = len(sample2)
            if self.n2 > 0:
                self.Sample_mean2 = np.mean(sample2)

        self.__test()

    def __test(self):
        if not (self.n1 > 0 and self.n2 > 0):
            raise ValueError('the sample sizes n1 and n2 must both be positive')

        p_hat = ((self.n1 * self.Sample_mean1) + (self.n2 * self.Sample_mean2)) / (self.n1 + self.n2)
        pooled_variance = p_hat * (1 - p_hat)
        if not pooled_variance > 0:
            raise ValueError('the pooled proportion must be strictly between 0 and 1')

        standard_error = np.sqrt(pooled_variance / self.n1 + pooled_variance / self.n2)
        test_statistic = (self.Sample_mean1 - self.Sample_mean2) / standard_error
        print('test_statistic:', test_statistic, '\n')

        # norm.sf is the upper tail (1 - cdf) computed without cancellation error.
        if self.type_t == 'two_tailed':
            p_value = 2 * norm.sf(abs(test_statistic))
        elif self.type_t == 'left_tailed':
            p_value = norm.cdf(test_statistic)
        else:
            p_value = norm.sf(test_statistic)
        print('P_value:', p_value, '\n')

        # Keep the results so callers can use them, not only read the printout.
        self.test_statistic = test_statistic
        self.p_value = p_value

        if p_value < self.alpha:
            print(f'Since p_value < {self.alpha}, reject null hypothesis.')
        else:
            print(f'Since p_value > {self.alpha}, the null hypothesis cannot be rejected.')