Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit baca951

Browse files
Add files via upload
1 parent 5f24b29 commit baca951

File tree

3 files changed

+283
-0
lines changed

3 files changed

+283
-0
lines changed
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
"""Chi-squared goodness-of-fit test.

Checks whether Minnesota's demographic distribution matches the
national distribution by comparing observed sample counts against
expected counts derived from national population ratios.
"""
import numpy as np
import pandas as pd
import scipy.stats as stats

# Synthetic populations: one label per person, so len() gives population size.
national = pd.DataFrame(["white"] * 100000 + ["hispanic"] * 60000 +
                        ["black"] * 50000 + ["asian"] * 15000 +
                        ["other"] * 35000)

minnesota = pd.DataFrame(["white"] * 600 + ["hispanic"] * 300 +
                         ["black"] * 250 + ["asian"] * 75 +
                         ["other"] * 150)

# Per-category counts for each population.
national_table = pd.crosstab(index=national[0], columns="count")
minnesota_table = pd.crosstab(index=minnesota[0], columns="count")

print("National")
print(national_table)
print(" ")
print("Minnesota")
print(minnesota_table)

observed = minnesota_table

national_ratios = national_table / len(national)  # Get population ratios

expected = national_ratios * len(minnesota)  # Get expected counts

# Chi-squared statistic: sum over categories of (O - E)^2 / E.
chi_squared_stat = (((observed - expected) ** 2) / expected).sum()

print(chi_squared_stat)

# Degrees of freedom = number of variable categories - 1.
# Derived from the data rather than hard-coded so the script stays
# correct if the category list changes.
dof = len(observed) - 1

crit = stats.chi2.ppf(q=0.95,  # Find the critical value for 95% confidence
                      df=dof)

print("Critical value")
print(crit)

p_value = 1 - stats.chi2.cdf(x=chi_squared_stat,  # Find the p-value
                             df=dof)
print("P value")
print(p_value)

# Cross-check with scipy's built-in test. The original computed this but
# discarded the result; print it so the comparison is actually visible.
result = stats.chisquare(f_obs=observed,   # Array of observed counts
                         f_exp=expected)   # Array of expected counts
print(result)
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
{
2+
"nbformat": 4,
3+
"nbformat_minor": 0,
4+
"metadata": {
5+
"colab": {
6+
"name": "Z-Test.ipynb",
7+
"version": "0.3.2",
8+
"provenance": []
9+
},
10+
"kernelspec": {
11+
"name": "python3",
12+
"display_name": "Python 3"
13+
}
14+
},
15+
"cells": [
16+
{
17+
"cell_type": "code",
18+
"metadata": {
19+
"id": "AfRng3KPuSBQ",
20+
"colab_type": "code",
21+
"colab": {
22+
"base_uri": "https://localhost:8080/",
23+
"height": 51
24+
},
25+
"outputId": "01b490f3-42b4-410f-ff53-bd9bf782e0d6"
26+
},
27+
"source": [
28+
"def twoSampZ(X1, X2, mudiff, sd1, sd2, n1, n2):\n",
29+
" from numpy import sqrt, abs, round\n",
30+
" from scipy.stats import norm\n",
31+
" pooledSE = sqrt(sd1**2/n1 + sd2**2/n2)\n",
32+
" z = ((X1 - X2) - mudiff)/pooledSE\n",
33+
" pval = 2*(1 - norm.cdf(abs(z)))\n",
34+
" return round(z, 3), round(pval, 4)\n",
35+
"\n",
36+
"\n",
37+
"\n",
38+
"z, p = twoSampZ(28, 33, 0, 14.1, 9.5, 75, 50)\n",
39+
"print(\"Z Score:\",z)\n",
40+
"print(\"P-Value:\",p)"
41+
],
42+
"execution_count": 2,
43+
"outputs": [
44+
{
45+
"output_type": "stream",
46+
"text": [
47+
"Z Score: -2.369\n",
48+
"P-Value: 0.0179\n"
49+
],
50+
"name": "stdout"
51+
}
52+
]
53+
}
54+
]
55+
}
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
{
2+
"nbformat": 4,
3+
"nbformat_minor": 0,
4+
"metadata": {
5+
"colab": {
6+
"name": "t-Test.ipynb",
7+
"version": "0.3.2",
8+
"provenance": []
9+
},
10+
"kernelspec": {
11+
"name": "python3",
12+
"display_name": "Python 3"
13+
}
14+
},
15+
"cells": [
16+
{
17+
"cell_type": "code",
18+
"metadata": {
19+
"id": "Y21N_2yv3Grl",
20+
"colab_type": "code",
21+
"colab": {}
22+
},
23+
"source": [
24+
"## Import the packages\n",
25+
"import numpy as np\n",
26+
"from scipy import stats"
27+
],
28+
"execution_count": 0,
29+
"outputs": []
30+
},
31+
{
32+
"cell_type": "code",
33+
"metadata": {
34+
"id": "Aga_0SM43OdO",
35+
"colab_type": "code",
36+
"colab": {
37+
"base_uri": "https://localhost:8080/",
38+
"height": 85
39+
},
40+
"outputId": "8d4b8f3d-a6df-4129-b3b6-867144288009"
41+
},
42+
"source": [
43+
"## Define 2 random distributions\n",
44+
"\n",
45+
"#Sample Size\n",
46+
"N = 10\n",
47+
"\n",
48+
"#Gaussian distributed data with mean = 2 and var = 1\n",
49+
"a = np.random.randn(N) + 2\n",
50+
"print(a)\n",
51+
"\n",
52+
"#Gaussian distributed data with with mean = 0 and var = 1\n",
53+
"b = np.random.randn(N)\n",
54+
"print(b)"
55+
],
56+
"execution_count": 6,
57+
"outputs": [
58+
{
59+
"output_type": "stream",
60+
"text": [
61+
"[3.41987841 2.4642942 1.3074381 1.88900262 1.5018451 2.08785958\n",
62+
" 4.18763608 2.76111147 1.25673154 1.22916177]\n",
63+
"[ 0.09625918 -0.426427 -0.81593085 -0.27386856 -0.19758738 0.71729565\n",
64+
" -0.44211666 0.07106772 -0.53144206 -0.21403634]\n"
65+
],
66+
"name": "stdout"
67+
}
68+
]
69+
},
70+
{
71+
"cell_type": "code",
72+
"metadata": {
73+
"id": "DGw_0SoQ2Uhj",
74+
"colab_type": "code",
75+
"colab": {
76+
"base_uri": "https://localhost:8080/",
77+
"height": 51
78+
},
79+
"outputId": "b6caa6b7-64df-44e7-b626-13fb8165baeb"
80+
},
81+
"source": [
82+
"## Calculate the Standard Deviation\n",
83+
"\n",
84+
"#Calculate the variance to get the standard deviation\n",
85+
"\n",
86+
"#For unbiased max likelihood estimate we have to divide the var by N-1, and therefore the parameter ddof = 1\n",
87+
"var_a = a.var(ddof=1)\n",
88+
"var_b = b.var(ddof=1)\n",
89+
"\n",
90+
"#std deviation\n",
91+
"s = np.sqrt((var_a + var_b)/2)\n",
92+
"\n",
93+
"print(\"Std Deviation:\", s)\n",
94+
"\n",
95+
"## Calculate the t-statistics\n",
96+
"t = (a.mean() - b.mean())/(s*np.sqrt(2/N))\n",
97+
"\n",
98+
"print(\"T-value:\", t)"
99+
],
100+
"execution_count": 8,
101+
"outputs": [
102+
{
103+
"output_type": "stream",
104+
"text": [
105+
"Std Deviation: 0.7693967525636721\n",
106+
"T-value: 7.0104093570005945\n"
107+
],
108+
"name": "stdout"
109+
}
110+
]
111+
},
112+
{
113+
"cell_type": "code",
114+
"metadata": {
115+
"id": "9atPC3HO3Z2U",
116+
"colab_type": "code",
117+
"colab": {
118+
"base_uri": "https://localhost:8080/",
119+
"height": 51
120+
},
121+
"outputId": "683ac7bd-8bd8-4e55-d3fc-7942748b66d1"
122+
},
123+
"source": [
124+
"## Compare with the critical t-value\n",
125+
"\n",
126+
"#Degrees of freedom\n",
127+
"df = 2*N - 2\n",
128+
"\n",
129+
"#p-value after comparison with the t\n",
130+
"p = 1 - stats.t.cdf(t,df=df)\n",
131+
"\n",
132+
"print(\"t-Score = \" + str(t))\n",
133+
"print(\"p-Value = \" + str(2*p))\n",
134+
"\n",
135+
"#Note that we multiply the p value by 2 because its a twp tail t-test\n",
136+
"\n",
137+
"### You can see that after comparing the t statistic with the critical t value (computed internally)\n",
138+
"# we get a very small p value (about 1.5e-06, per the output above) and thus we reject the null hypothesis, showing that the means\n",
139+
"# of the two distributions are different and statistically significant."
140+
],
141+
"execution_count": 9,
142+
"outputs": [
143+
{
144+
"output_type": "stream",
145+
"text": [
146+
"t-Score = 7.0104093570005945\n",
147+
"p-Value = 1.522899394812427e-06\n"
148+
],
149+
"name": "stdout"
150+
}
151+
]
152+
},
153+
{
154+
"cell_type": "code",
155+
"metadata": {
156+
"id": "I_ve3N6a3Mlo",
157+
"colab_type": "code",
158+
"colab": {
159+
"base_uri": "https://localhost:8080/",
160+
"height": 51
161+
},
162+
"outputId": "cc8bcc64-e1a2-4c05-98b9-db0cf91a0a01"
163+
},
164+
"source": [
165+
"## Cross Checking with the internal scipy function\n",
166+
"t2, p2 = stats.ttest_ind(a,b)\n",
167+
"print(\"t = \" + str(t2))\n",
168+
"print(\"p = \" + str(2*p2))"
169+
],
170+
"execution_count": 10,
171+
"outputs": [
172+
{
173+
"output_type": "stream",
174+
"text": [
175+
"t = 7.010409357000594\n",
176+
"p = 3.045798789679482e-06\n"
177+
],
178+
"name": "stdout"
179+
}
180+
]
181+
}
182+
]
183+
}

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /