import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')


def difference_of_means(table, numeric_label, group_label):
    """
    Takes: name of table, column label of numerical variable,
    column label of group-label variable

    Returns: Difference of means of the two groups, computed as
    (mean of the second group) - (mean of the first group), where the
    groups appear in the order produced by ``groupby`` (sorted group labels).
    """
    # Restrict to the two relevant columns so that unrelated columns never
    # take part in the aggregation.
    reduced = table[[numeric_label, group_label]]

    group_means = reduced.groupby(by=group_label)[numeric_label].mean()

    # Select by position with .iloc rather than group_means[1]: the result is
    # indexed by group label, so an integer key is either a deprecated
    # positional fallback (string/bool labels) or a label lookup that can
    # raise KeyError (integer labels such as 1 and 2).
    return group_means.iloc[1] - group_means.iloc[0]


def one_simulated_difference(table, numeric_label, group_label):
    """
    Takes: name of table, column label of numerical variable,
    column label of group-label variable

    Returns: Difference of means of the two groups after shuffling labels

    The input table is left unmodified; the shuffled labels live only in a
    temporary table built inside this function.
    """
    # Sampling every row without replacement yields a random permutation of
    # the group labels. Converting to a plain array drops the shuffled index,
    # so the labels are attached by position rather than re-aligned by index
    # (which would undo the shuffle).
    shuffled_labels = table[group_label].sample(len(table), replace=False).to_numpy()

    # assign() returns a new DataFrame, so the caller's table does not gain a
    # 'Shuffled Label' column as a side effect of running a simulation.
    shuffled_table = table[[numeric_label]].assign(**{'Shuffled Label': shuffled_labels})

    return difference_of_means(shuffled_table, numeric_label, 'Shuffled Label')


# Load the births data set from CSV.
births = pd.read_csv('../data/baby.csv')


# Column means for each value of 'Maternal Smoker'.
births.groupby(by=(['Maternal Smoker'])).mean()


# Load the second data set (bta.csv) into `botox` and show its first ten rows.
botox = pd.read_csv('../data/bta.csv')
botox[:10]


# Inspect the type of the 'Result' entry stored under index label 0.
type(botox['Result'][0])

numpy.float64


# Count the non-null 'Result' entries for every (Group, Result) combination.
data1 = botox.groupby(['Group','Result'])[['Result']].count()
# Copy the counts into a column named 'ResultCount' ...
ResultCount = data1['Result']
data1['ResultCount'] = ResultCount
# ... and keep only that column in data2, then display it.
data2 = data1.drop(columns=['Result'])
data2


# Pass data1 through the DataFrame constructor and display the resulting type.
data1 = pd.DataFrame(data1)
type(data1)

pandas.core.frame.DataFrame


# Reshape the counts: one row per Group, one column per Result value.
# Missing entries in the pivoted table are filled with 0.
pd.pivot_table(data2, index=['Group'], columns=['Result'], values=['ResultCount']).fillna(0)


# Mean of 'Result' within each Group.
botox.groupby(by=['Group'])[['Result']].mean()


# Step-by-step walk-through of the difference-of-means computation on the
# botox data, using the same variable names as the function parameters.
table = botox
numeric_label = 'Result'
group_label = 'Group'

# Keep only the grouping column and the numeric column.
reduced = table[[group_label, numeric_label]]

# One row of means per group, indexed by group label.
means_table = reduced.groupby(by=group_label).mean()

means = means_table

# Second group's mean minus the first group's mean. The Series is indexed by
# group label, so pick rows by position with .iloc instead of relying on the
# deprecated integer-key fallback of means[numeric_label][1].
means[numeric_label].iloc[1] - means[numeric_label].iloc[0]

0.475


# Difference of group means of 'Result' across 'Group' in the unshuffled data.
observed_diff = difference_of_means(botox, 'Result', 'Group')
observed_diff

0.475


# Run a single shuffled-label simulation to see one simulated difference.
one_simulated_difference(botox, 'Result', 'Group')

-0.17083333333333334


# Repeat the shuffled-label simulation 10000 times and collect the simulated
# differences. Building the array once from a list avoids np.append inside
# the loop, which copies the whole array on every iteration.
simulated_diffs = np.array(
    [one_simulated_difference(botox, 'Result', 'Group') for _ in range(10000)]
)

simulated_diffs

array([-0.04166667, -0.04166667, -0.17083333, ..., -0.17083333,
        0.0875    ,  0.0875    ])


# Histogram of the simulated differences.
col_name = 'Distances between groups'

# Use col_name as the column label so the name is defined in one place only.
table = pd.DataFrame({col_name: simulated_diffs})

# The trailing semicolon suppresses the notebook's echo of the return value.
table.hist(bins=8, ec='white');


# Proportion of simulated differences greater than or equal to the observed
# difference. The mean of the boolean mask is that proportion, computed by
# NumPy instead of iterating the array with the builtin sum().
np.mean(simulated_diffs >= observed_diff)

0.0074

	Birth Weight	Gestational Days	Maternal Age	Maternal Height	Maternal Pregnancy Weight
Maternal Smoker
False	123.085315	279.874126	27.544056	64.013986	129.47972
True	113.819172	277.897603	26.736383	64.104575	126.91939

	Group	Result
0	Control	1.0
1	Control	1.0
2	Control	0.0
3	Control	0.0
4	Control	0.0
5	Control	0.0
6	Control	0.0
7	Control	0.0
8	Control	0.0
9	Control	0.0

		ResultCount
Group	Result
Control	0.0	14
Control	1.0	2
Treatment	0.0	6
Treatment	1.0	9

	Result
Group
Control	0.125
Treatment	0.600

Randomized Controlled Experiment