import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
def difference_of_means(table, numeric_label, group_label):
    """
    Compute the difference between the mean values of two groups.

    Takes: table (pandas DataFrame), column label of numerical variable,
           column label of group-label variable
    Returns: mean of the second group minus mean of the first group, with
             the groups taken in the order groupby produces them (sorted
             by group label by default)
    """
    reduced = table[[numeric_label, group_label]]
    group_means = reduced.groupby(by=group_label)[numeric_label].mean()
    # Select by position with .iloc: a bare [0]/[1] is treated as a *label*
    # when the group labels are integers (e.g. groups coded 1 and 2 raise
    # KeyError) and is a deprecated positional fallback for other labels.
    return group_means.iloc[1] - group_means.iloc[0]
def one_simulated_difference(table, numeric_label, group_label):
    """
    Simulate one difference of group means under randomly shuffled labels.

    Takes: table (pandas DataFrame), column label of numerical variable,
           column label of group-label variable
    Returns: Difference of means of the two groups after shuffling labels

    The input table is left unmodified.
    """
    # Sampling every row without replacement yields a random permutation
    # of the group labels.
    shuffled_labels = table[group_label].sample(len(table), replace=False)
    # Build a fresh frame instead of writing a 'Shuffled Label' column into
    # the caller's table. to_numpy() discards the shuffled index so the
    # labels are attached by position rather than re-aligned to the rows
    # they originally came from.
    shuffled_table = table[[numeric_label]].assign(
        **{'Shuffled Label': shuffled_labels.to_numpy()}
    )
    return difference_of_means(shuffled_table, numeric_label, 'Shuffled Label')
# Load baby.csv and average every column within each 'Maternal Smoker' group.
births = pd.read_csv('../data/baby.csv')
births.groupby('Maternal Smoker').mean()
Maternal Smoker | Birth Weight | Gestational Days | Maternal Age | Maternal Height | Maternal Pregnancy Weight |
---|---|---|---|---|---|
False | 123.085315 | 279.874126 | 27.544056 | 64.013986 | 129.47972 |
True | 113.819172 | 277.897603 | 26.736383 | 64.104575 | 126.91939 |
# Load bta.csv and preview its first ten rows.
botox = pd.read_csv('../data/bta.csv')
botox.head(10)
Group | Result | |
---|---|---|
0 | Control | 1.0 |
1 | Control | 1.0 |
2 | Control | 0.0 |
3 | Control | 0.0 |
4 | Control | 0.0 |
5 | Control | 0.0 |
6 | Control | 0.0 |
7 | Control | 0.0 |
8 | Control | 0.0 |
9 | Control | 0.0 |
# Inspect the Python type of the 'Result' value in the row labelled 0.
type(botox.loc[0, 'Result'])
numpy.float64
# Count the rows for every (Group, Result) combination.
ResultCount = botox.groupby(['Group', 'Result'])['Result'].count()
# data1 keeps the counts under both the 'Result' and 'ResultCount' columns;
# data2 keeps only the descriptively named 'ResultCount' column.
data1 = ResultCount.to_frame('Result')
data1['ResultCount'] = ResultCount
data2 = data1[['ResultCount']]
data2
Group | Result | ResultCount |
---|---|---|
Control | 0.0 | 14 |
 | 1.0 | 2 |
Treatment | 0.0 | 6 |
 | 1.0 | 9 |
# Pass data1 through the DataFrame constructor and confirm the resulting type.
data1 = pd.DataFrame(data=data1)
type(data1)
pandas.core.frame.DataFrame
# Spread the counts into a Group-by-Result grid; missing cells become 0.
(
    data2
    .pivot_table(index=['Group'], columns=['Result'], values=['ResultCount'])
    .fillna(0)
)
ResultCount | ||
---|---|---|
Result | 0.0 | 1.0 |
Group | ||
Control | 14 | 2 |
Treatment | 6 | 9 |
# Mean of 'Result' within each 'Group'.
botox[['Group', 'Result']].groupby('Group').mean()
Result | |
---|---|
Group | |
Control | 0.125 |
Treatment | 0.600 |
# Step-by-step version of difference_of_means applied to the botox data.
table = botox
numeric_label = 'Result'
group_label = 'Group'
reduced = table[[group_label, numeric_label]]
means_table = reduced.groupby(by=group_label).mean()
means = means_table
# Pick the groups by position with .iloc; a bare [1]/[0] on a Series with
# string labels relies on a deprecated positional fallback.
means[numeric_label].iloc[1] - means[numeric_label].iloc[0]
0.475
# Observed statistic: difference between the two groups' mean 'Result'.
observed_diff = difference_of_means(
    table=botox,
    numeric_label='Result',
    group_label='Group',
)
observed_diff
0.475
# One simulated difference of means with the group labels shuffled.
one_simulated_difference(
    table=botox,
    numeric_label='Result',
    group_label='Group',
)
-0.17083333333333334
# Repeat the label-shuffling simulation many times and collect the results.
# The values are gathered in a list and converted once, because np.append
# copies the whole array on every call.
num_simulations = 10000
simulated_diffs = np.array([
    one_simulated_difference(botox, 'Result', 'Group')
    for _ in range(num_simulations)
])
simulated_diffs
array([-0.04166667, -0.04166667, -0.17083333, ..., -0.17083333, 0.0875 , 0.0875 ])
# Histogram of the simulated differences.
col_name = 'Distances between groups'
table = pd.DataFrame({col_name: simulated_diffs})
table.hist(bins=8, ec='white');
# Proportion of simulated differences greater than or equal to the
# observed difference.
np.count_nonzero(simulated_diffs >= observed_diff) / len(simulated_diffs)
0.0074