Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit baca951

Browse files
Add files via upload
1 parent 5f24b29 commit baca951

File tree

3 files changed

+283
-0
lines changed

3 files changed

+283
-0
lines changed
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
"""Chi-squared goodness-of-fit test.

Checks whether Minnesota's demographic distribution matches the
national distribution by comparing observed sample counts against
expected counts derived from national population ratios.
"""
import numpy as np
import pandas as pd
import scipy.stats as stats

# Synthetic populations: one label per person, so len() gives population size.
national = pd.DataFrame(["white"] * 100000 + ["hispanic"] * 60000 +
                        ["black"] * 50000 + ["asian"] * 15000 +
                        ["other"] * 35000)

minnesota = pd.DataFrame(["white"] * 600 + ["hispanic"] * 300 +
                         ["black"] * 250 + ["asian"] * 75 +
                         ["other"] * 150)

# Per-category counts for each population.
national_table = pd.crosstab(index=national[0], columns="count")
minnesota_table = pd.crosstab(index=minnesota[0], columns="count")

print("National")
print(national_table)
print(" ")
print("Minnesota")
print(minnesota_table)

observed = minnesota_table

national_ratios = national_table / len(national)  # Get population ratios

expected = national_ratios * len(minnesota)  # Get expected counts

# Chi-squared statistic: sum over categories of (O - E)^2 / E.
chi_squared_stat = (((observed - expected) ** 2) / expected).sum()

print(chi_squared_stat)

# Degrees of freedom = number of variable categories - 1.
# Derived from the data rather than hard-coded so the script stays
# correct if the category list changes.
dof = len(observed) - 1

crit = stats.chi2.ppf(q=0.95,  # Find the critical value for 95% confidence
                      df=dof)

print("Critical value")
print(crit)

p_value = 1 - stats.chi2.cdf(x=chi_squared_stat,  # Find the p-value
                             df=dof)
print("P value")
print(p_value)

# Cross-check with scipy's built-in test. The original computed this but
# discarded the result; print it so the comparison is actually visible.
result = stats.chisquare(f_obs=observed,   # Array of observed counts
                         f_exp=expected)   # Array of expected counts
print(result)
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
{
2+
"nbformat": 4,
3+
"nbformat_minor": 0,
4+
"metadata": {
5+
"colab": {
6+
"name": "Z-Test.ipynb",
7+
"version": "0.3.2",
8+
"provenance": []
9+
},
10+
"kernelspec": {
11+
"name": "python3",
12+
"display_name": "Python 3"
13+
}
14+
},
15+
"cells": [
16+
{
17+
"cell_type": "code",
18+
"metadata": {
19+
"id": "AfRng3KPuSBQ",
20+
"colab_type": "code",
21+
"colab": {
22+
"base_uri": "https://localhost:8080/",
23+
"height": 51
24+
},
25+
"outputId": "01b490f3-42b4-410f-ff53-bd9bf782e0d6"
26+
},
27+
"source": [
28+
"def twoSampZ(X1, X2, mudiff, sd1, sd2, n1, n2):\n",
29+
" from numpy import sqrt, abs, round\n",
30+
" from scipy.stats import norm\n",
31+
" pooledSE = sqrt(sd1**2/n1 + sd2**2/n2)\n",
32+
" z = ((X1 - X2) - mudiff)/pooledSE\n",
33+
" pval = 2*(1 - norm.cdf(abs(z)))\n",
34+
" return round(z, 3), round(pval, 4)\n",
35+
"\n",
36+
"\n",
37+
"\n",
38+
"z, p = twoSampZ(28, 33, 0, 14.1, 9.5, 75, 50)\n",
39+
"print(\"Z Score:\",z)\n",
40+
"print(\"P-Value:\",p)"
41+
],
42+
"execution_count": 2,
43+
"outputs": [
44+
{
45+
"output_type": "stream",
46+
"text": [
47+
"Z Score: -2.369\n",
48+
"P-Value: 0.0179\n"
49+
],
50+
"name": "stdout"
51+
}
52+
]
53+
}
54+
]
55+
}
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
{
2+
"nbformat": 4,
3+
"nbformat_minor": 0,
4+
"metadata": {
5+
"colab": {
6+
"name": "t-Test.ipynb",
7+
"version": "0.3.2",
8+
"provenance": []
9+
},
10+
"kernelspec": {
11+
"name": "python3",
12+
"display_name": "Python 3"
13+
}
14+
},
15+
"cells": [
16+
{
17+
"cell_type": "code",
18+
"metadata": {
19+
"id": "Y21N_2yv3Grl",
20+
"colab_type": "code",
21+
"colab": {}
22+
},
23+
"source": [
24+
"## Import the packages\n",
25+
"import numpy as np\n",
26+
"from scipy import stats"
27+
],
28+
"execution_count": 0,
29+
"outputs": []
30+
},
31+
{
32+
"cell_type": "code",
33+
"metadata": {
34+
"id": "Aga_0SM43OdO",
35+
"colab_type": "code",
36+
"colab": {
37+
"base_uri": "https://localhost:8080/",
38+
"height": 85
39+
},
40+
"outputId": "8d4b8f3d-a6df-4129-b3b6-867144288009"
41+
},
42+
"source": [
43+
"## Define 2 random distributions\n",
44+
"\n",
45+
"#Sample Size\n",
46+
"N = 10\n",
47+
"\n",
48+
"#Gaussian distributed data with mean = 2 and var = 1\n",
49+
"a = np.random.randn(N) + 2\n",
50+
"print(a)\n",
51+
"\n",
52+
"#Gaussian distributed data with with mean = 0 and var = 1\n",
53+
"b = np.random.randn(N)\n",
54+
"print(b)"
55+
],
56+
"execution_count": 6,
57+
"outputs": [
58+
{
59+
"output_type": "stream",
60+
"text": [
61+
"[3.41987841 2.4642942 1.3074381 1.88900262 1.5018451 2.08785958\n",
62+
" 4.18763608 2.76111147 1.25673154 1.22916177]\n",
63+
"[ 0.09625918 -0.426427 -0.81593085 -0.27386856 -0.19758738 0.71729565\n",
64+
" -0.44211666 0.07106772 -0.53144206 -0.21403634]\n"
65+
],
66+
"name": "stdout"
67+
}
68+
]
69+
},
70+
{
71+
"cell_type": "code",
72+
"metadata": {
73+
"id": "DGw_0SoQ2Uhj",
74+
"colab_type": "code",
75+
"colab": {
76+
"base_uri": "https://localhost:8080/",
77+
"height": 51
78+
},
79+
"outputId": "b6caa6b7-64df-44e7-b626-13fb8165baeb"
80+
},
81+
"source": [
82+
"## Calculate the Standard Deviation\n",
83+
"\n",
84+
"#Calculate the variance to get the standard deviation\n",
85+
"\n",
86+
"#For unbiased max likelihood estimate we have to divide the var by N-1, and therefore the parameter ddof = 1\n",
87+
"var_a = a.var(ddof=1)\n",
88+
"var_b = b.var(ddof=1)\n",
89+
"\n",
90+
"#std deviation\n",
91+
"s = np.sqrt((var_a + var_b)/2)\n",
92+
"\n",
93+
"print(\"Std Deviation:\", s)\n",
94+
"\n",
95+
"## Calculate the t-statistics\n",
96+
"t = (a.mean() - b.mean())/(s*np.sqrt(2/N))\n",
97+
"\n",
98+
"print(\"T-value:\", t)"
99+
],
100+
"execution_count": 8,
101+
"outputs": [
102+
{
103+
"output_type": "stream",
104+
"text": [
105+
"Std Deviation: 0.7693967525636721\n",
106+
"T-value: 7.0104093570005945\n"
107+
],
108+
"name": "stdout"
109+
}
110+
]
111+
},
112+
{
113+
"cell_type": "code",
114+
"metadata": {
115+
"id": "9atPC3HO3Z2U",
116+
"colab_type": "code",
117+
"colab": {
118+
"base_uri": "https://localhost:8080/",
119+
"height": 51
120+
},
121+
"outputId": "683ac7bd-8bd8-4e55-d3fc-7942748b66d1"
122+
},
123+
"source": [
124+
"## Compare with the critical t-value\n",
125+
"\n",
126+
"#Degrees of freedom\n",
127+
"df = 2*N - 2\n",
128+
"\n",
129+
"#p-value after comparison with the t\n",
130+
"p = 1 - stats.t.cdf(t,df=df)\n",
131+
"\n",
132+
"print(\"t-Score = \" + str(t))\n",
133+
"print(\"p-Value = \" + str(2*p))\n",
134+
"\n",
135+
"#Note that we multiply the p value by 2 because its a twp tail t-test\n",
136+
"\n",
137+
"### You can see that after comparing the t statistic with the critical t value (computed internally)\n",
138+
"# we get a very small p value (about 1.5e-06, per the output above) and thus we reject the null hypothesis, showing that the means\n",
139+
"# of the two distributions are different and statistically significant."
140+
],
141+
"execution_count": 9,
142+
"outputs": [
143+
{
144+
"output_type": "stream",
145+
"text": [
146+
"t-Score = 7.0104093570005945\n",
147+
"p-Value = 1.522899394812427e-06\n"
148+
],
149+
"name": "stdout"
150+
}
151+
]
152+
},
153+
{
154+
"cell_type": "code",
155+
"metadata": {
156+
"id": "I_ve3N6a3Mlo",
157+
"colab_type": "code",
158+
"colab": {
159+
"base_uri": "https://localhost:8080/",
160+
"height": 51
161+
},
162+
"outputId": "cc8bcc64-e1a2-4c05-98b9-db0cf91a0a01"
163+
},
164+
"source": [
165+
"## Cross Checking with the internal scipy function\n",
166+
"t2, p2 = stats.ttest_ind(a,b)\n",
167+
"print(\"t = \" + str(t2))\n",
168+
"print(\"p = \" + str(2*p2))"
169+
],
170+
"execution_count": 10,
171+
"outputs": [
172+
{
173+
"output_type": "stream",
174+
"text": [
175+
"t = 7.010409357000594\n",
176+
"p = 3.045798789679482e-06\n"
177+
],
178+
"name": "stdout"
179+
}
180+
]
181+
}
182+
]
183+
}

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /