In [1]:

from datascience import *
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
import warnings
warnings.simplefilter("ignore")

Review: Comparing Two Samples¶

In [2]:

def difference_of_means(table, numeric_label, group_label):
    """
    Difference between the group means of a numerical column.

    Takes: table, column label of numerical variable,
    column label of group-label variable

    Returns: mean of the second group minus mean of the first group,
    taking the groups in the row order produced by ``table.group``

    Raises: ValueError if the group-label column does not contain
    exactly two distinct groups
    """

    # table with the two relevant columns
    reduced = table.select(numeric_label, group_label)

    # table with one row per group: group label, then the group's mean
    means_table = reduced.group(group_label, np.average)

    # array of group means (column 0 holds the group labels)
    means = means_table.column(1)

    # A "difference of two means" is only defined for exactly two groups;
    # without this check extra groups would be ignored silently and a
    # single group would fail with an unhelpful IndexError.
    if len(means) != 2:
        raise ValueError(
            "difference_of_means needs exactly 2 groups in column "
            + repr(group_label) + ", found " + str(len(means)))

    return means.item(1) - means.item(0)

In [3]:

def one_simulated_difference(table, numeric_label, group_label):
    """
    Simulate one value of the difference of group means with the group
    labels shuffled.

    Takes: table, column label of numerical variable,
    column label of group-label variable

    Returns: the result of difference_of_means on a table that pairs the
    original numerical column with the shuffled labels
    """
    shuffled_column = 'Shuffled Label'

    # sample the rows without replacement, then keep only their labels
    resampled_rows = table.sample(with_replacement=False)
    permuted_labels = resampled_rows.column(group_label)

    # numerical values in their original order, next to the shuffled labels
    values_only = table.select(numeric_label)
    relabeled = values_only.with_column(shuffled_column, permuted_labels)

    return difference_of_means(relabeled, numeric_label, shuffled_column)

In [4]:

# Load the births data from baby.csv (path is relative to the working directory)
births = Table.read_table('baby.csv')

In [5]:

# Average of every other column within each 'Maternal Smoker' group
births.group('Maternal Smoker', np.average)

Out[5]:

Maternal Smoker	Birth Weight average	Gestational Days average	Maternal Age average	Maternal Height average	Maternal Pregnancy Weight average
False	123.085	279.874	27.5441	64.014	129.48
True	113.819	277.898	26.7364	64.1046	126.919

Randomized Controlled Experiment¶

In [6]:

# Load the trial data from bta.csv: a Group label and a 0/1 Result per row
botox = Table.read_table('bta.csv')
# Display the full table
botox.show()

Group	Result
Control	1
Control	1
Control	0
Control	0
Control	0
Control	0
Control	0
Control	0
Control	0
Control	0
Control	0
Control	0
Control	0
Control	0
Control	0
Control	0
Treatment	1
Treatment	1
Treatment	1
Treatment	1
Treatment	1
Treatment	1
Treatment	1
Treatment	1
Treatment	1
Treatment	0
Treatment	0
Treatment	0
Treatment	0
Treatment	0
Treatment	0

In [7]:

# Counts of each Result value (columns) within each Group (rows)
botox.pivot('Result', 'Group')

Out[7]:

Group	0.0	1.0
Control	14	2
Treatment	6	9

In [8]:

# Mean Result per group; Result is 0/1, so this is the proportion of 1s
botox.group('Group', np.average)

Out[8]:

Group	Result average
Control	0.125
Treatment	0.6

Testing the Hypothesis¶

In [9]:

# Observed test statistic: difference in mean Result between the two groups
# (second group minus first, i.e. Treatment minus Control for this table)
observed_diff = difference_of_means(botox, 'Result', 'Group')
observed_diff

Out[9]:

0.475

In [10]:

# One value of the statistic after shuffling the group labels
one_simulated_difference(botox, 'Result', 'Group')

Out[10]:

0.08750000000000002

In [11]:

# Number of label shuffles to simulate
num_simulations = 10000

# Preallocate the results: np.append allocates and copies a whole new array
# on every call, so growing the array one element at a time is quadratic in
# the number of simulations.
simulated_diffs = np.zeros(num_simulations)
for i in np.arange(num_simulations):
    sim_diff = one_simulated_difference(botox, 'Result', 'Group')
    simulated_diffs[i] = sim_diff

In [12]:

# Histogram of the simulated differences, drawn from a one-column table
col_name = 'Distances between groups'
diffs_table = Table().with_column(col_name, simulated_diffs)
diffs_table.hist(col_name)

In [13]:

# Empirical p-value: fraction of simulated differences that are at least as
# large as the observed difference (one-sided, large positive values count
# as extreme).
# np.count_nonzero counts the True entries in one vectorized pass instead of
# iterating over the boolean array element by element with the builtin sum.
np.count_nonzero(simulated_diffs >= observed_diff) / len(simulated_diffs)

Out[13]:

0.0060000000000000001