In [1]:
from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import warnings
warnings.simplefilter("ignore")

Review: Comparing Two Samples

In [2]:
def difference_of_means(table, numeric_label, group_label):
    """
    Compute the gap between the two group averages of a numerical column.

    Takes:
      table: table holding the data
      numeric_label: column label of the numerical variable
      group_label: column label of the group-label variable

    Returns: average of the second group minus average of the first
    group, taking the groups in the order the grouped table lists them
    """
    # Keep only the two relevant columns, average the numerical column
    # within each group, then pull out the array of group averages
    # (column index 1 of the grouped table).
    group_means = (
        table
        .select(numeric_label, group_label)
        .group(group_label, np.average)
        .column(1)
    )

    second_mean = group_means.item(1)
    first_mean = group_means.item(0)
    return second_mean - first_mean
In [3]:
def one_simulated_difference(table, numeric_label, group_label):
    """
    Simulate one difference of group means under shuffled group labels.

    Takes:
      table: table holding the data
      numeric_label: column label of the numerical variable
      group_label: column label of the group-label variable

    Returns: Difference of means of the two groups after shuffling labels
    """
    shuffled_column = 'Shuffled Label'

    # Draw the rows without replacement so the group labels come back
    # in shuffled order, and keep only those labels.
    permuted_rows = table.sample(with_replacement=False)
    permuted_labels = permuted_rows.column(group_label)

    # Pair the numerical values (still in their original order) with
    # the shuffled labels.
    numeric_only = table.select(numeric_label)
    relabeled = numeric_only.with_column(shuffled_column, permuted_labels)

    return difference_of_means(relabeled, numeric_label, shuffled_column)
In [4]:
# Read the births data from baby.csv into a table.
births = Table.read_table('baby.csv')
In [5]:
# Average of every other column within each 'Maternal Smoker' group.
births.group('Maternal Smoker', np.average)
Out[5]:
Maternal Smoker Birth Weight average Gestational Days average Maternal Age average Maternal Height average Maternal Pregnancy Weight average
False 123.085 279.874 27.5441 64.014 129.48
True 113.819 277.898 26.7364 64.1046 126.919

Randomized Controlled Experiment

In [6]:
# Read the trial data from bta.csv: one row per participant, with the
# assigned Group and a 0/1 Result.
botox = Table.read_table('bta.csv')
# Display the table's rows.
botox.show()
Group Result
Control 1
Control 1
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Control 0
Treatment 1
Treatment 1
Treatment 1
Treatment 1
Treatment 1
Treatment 1
Treatment 1
Treatment 1
Treatment 1
Treatment 0
Treatment 0
Treatment 0
Treatment 0
Treatment 0
Treatment 0
In [7]:
# Cross-tabulate counts: one row per Group, one column per Result value.
botox.pivot('Result', 'Group')
Out[7]:
Group 0.0 1.0
Control 14 2
Treatment 6 9
In [8]:
# Average Result within each Group; since Result is 0 or 1, each average
# is the proportion of 1s in that group.
botox.group('Group', np.average)
Out[8]:
Group Result average
Control 0.125
Treatment 0.6

Testing the Hypothesis

In [9]:
# Observed test statistic: difference between the two groups' average
# Result, as computed by difference_of_means on the original labels.
observed_diff = difference_of_means(botox, 'Result', 'Group')
observed_diff
Out[9]:
0.475
In [10]:
# One simulated value of the statistic after shuffling the group labels;
# the value changes from run to run.
one_simulated_difference(botox, 'Result', 'Group')
Out[10]:
0.08750000000000002
In [11]:
# Simulate the statistic many times under shuffled group labels.
# The values are collected in a list and converted to an array once:
# np.append allocates and copies a whole new array on every call, so
# growing the array inside the loop does quadratic work.
num_simulations = 10000
simulated_diffs = np.array(
    [
        one_simulated_difference(botox, 'Result', 'Group')
        for _ in range(num_simulations)
    ],
    dtype=float,
)
In [12]:
# Histogram of the simulated differences (the empirical distribution of
# the statistic under shuffled labels).
col_name = 'Distances between groups'
simulated_table = Table().with_column(col_name, simulated_diffs)
simulated_table.hist(col_name)
In [13]:
# p-value: proportion of simulated differences that are at least as
# large as the observed difference. Taking np.mean of the boolean mask
# computes that proportion in one vectorized call instead of iterating
# over the array element by element with the builtin sum.
np.mean(simulated_diffs >= observed_diff)
Out[13]:
0.0060000000000000001

AltStyle によって変換されたページ (->オリジナル) /