from datascience import * import numpy as np %matplotlib inline import matplotlib.pyplot as plots plots.style.use('fivethirtyeight') import warnings warnings.simplefilter("ignore")
def difference_of_means(table, numeric_label, group_label):
    """
    Takes: a table, the column label of a numerical variable, and the
           column label of the variable that assigns each row to a group
    Returns: the mean of the numerical variable in the second group minus
             its mean in the first group, where "first" and "second" are
             the row order produced by `group` (for the botox table below
             this is Treatment minus Control)

    Only the first two group means are used, so the table is expected to
    contain exactly two groups.
    """
    # Keep just the two relevant columns, average the numeric column within
    # each group, and pull the averages out as an array (column 0 holds the
    # group labels, column 1 the per-group means).
    group_means = (
        table.select(numeric_label, group_label)
        .group(group_label, np.average)
        .column(1)
    )
    return group_means.item(1) - group_means.item(0)
def one_simulated_difference(table, numeric_label, group_label):
    """
    Takes: a table, the column label of a numerical variable, and the
           column label of the group-label variable
    Returns: the difference of the two group means after the group labels
             have been randomly reassigned to the rows (one draw of the
             test statistic for a permutation test)
    """
    shuffled_name = 'Shuffled Label'

    # Sampling without replacement and taking the label column yields the
    # original labels in a random order (presumably all rows are drawn when
    # no sample size is given -- datascience `sample` default).
    permuted_labels = table.sample(with_replacement=False).column(group_label)

    # Pair the untouched numeric values with the permuted labels.
    relabeled = table.select(numeric_label).with_column(
        shuffled_name, permuted_labels)

    return difference_of_means(relabeled, numeric_label, shuffled_name)
# Load the births data set from baby.csv.
births = Table.read_table('baby.csv')
# Average of every other column within each 'Maternal Smoker' group.
births.group('Maternal Smoker', np.average)
| Maternal Smoker | Birth Weight average | Gestational Days average | Maternal Age average | Maternal Height average | Maternal Pregnancy Weight average |
|---|---|---|---|---|---|
| False | 123.085 | 279.874 | 27.5441 | 64.014 | 129.48 |
| True | 113.819 | 277.898 | 26.7364 | 64.1046 | 126.919 |
# Load the botox trial data: one row per patient, with the patient's
# Group (Control / Treatment) and a 0/1 Result.
botox = Table.read_table('bta.csv')

# Display the rows of the table.
botox.show()
| Group | Result |
|---|---|
| Control | 1 |
| Control | 1 |
| Control | 0 |
| Control | 0 |
| Control | 0 |
| Control | 0 |
| Control | 0 |
| Control | 0 |
| Control | 0 |
| Control | 0 |
| Control | 0 |
| Control | 0 |
| Control | 0 |
| Control | 0 |
| Control | 0 |
| Control | 0 |
| Treatment | 1 |
| Treatment | 1 |
| Treatment | 1 |
| Treatment | 1 |
| Treatment | 1 |
| Treatment | 1 |
| Treatment | 1 |
| Treatment | 1 |
| Treatment | 1 |
| Treatment | 0 |
| Treatment | 0 |
| Treatment | 0 |
| Treatment | 0 |
| Treatment | 0 |
| Treatment | 0 |
# Contingency table: one row per Group, one column per Result value,
# each cell holding the number of patients with that combination.
botox.pivot('Result', 'Group')
| Group | 0.0 | 1.0 |
|---|---|---|
| Control | 14 | 2 |
| Treatment | 6 | 9 |
# Average Result within each group; because Result is 0/1 this is the
# proportion of patients in the group whose Result is 1.
botox.group('Group', np.average)
| Group | Result average |
|---|---|
| Control | 0.125 |
| Treatment | 0.6 |
# Observed test statistic: difference between the two groups' average
# Result in the data as actually collected.
observed_diff = difference_of_means(
    table=botox,
    numeric_label='Result',
    group_label='Group',
)
observed_diff
0.475
# One draw of the test statistic with the group labels shuffled.
one_simulated_difference(
    table=botox,
    numeric_label='Result',
    group_label='Group',
)
0.08750000000000002
# Empirical distribution of the test statistic under shuffled labels:
# repeat the label shuffle many times and record each difference of means.
#
# The array is built once from a list rather than grown with np.append
# inside the loop, because np.append copies the whole array on every call
# (quadratic work over 10,000 repetitions). The resulting values and dtype
# are the same as before.
repetitions = 10000
simulated_diffs = np.array([
    one_simulated_difference(botox, 'Result', 'Group')
    for _ in range(repetitions)
])
# Histogram of the simulated differences.
col_name = 'Distances between groups'
diffs_table = Table().with_column(col_name, simulated_diffs)
diffs_table.hist(col_name)
# p-value: the fraction of simulated differences that are at least as
# large as the observed difference. The mean of a boolean mask is the
# proportion of True entries, i.e. count / total.
np.mean(simulated_diffs >= observed_diff)
0.0060000000000000001