import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
# Load the baby dataset from a path relative to the working directory and
# preview its first rows.
births = pd.read_csv('../data/baby.csv')
births.head()
Birth Weight | Gestational Days | Maternal Age | Maternal Height | Maternal Pregnancy Weight | Maternal Smoker | |
---|---|---|---|---|---|---|
0 | 120 | 284 | 27 | 62 | 100 | False |
1 | 113 | 282 | 33 | 64 | 135 | False |
2 | 128 | 279 | 28 | 64 | 115 | True |
3 | 108 | 282 | 23 | 67 | 125 | True |
4 | 136 | 286 | 25 | 62 | 93 | False |
DataFrame.filter(items=None, like=None, regex=None, axis=None)
Subset the dataframe rows or columns according to the specified index labels.
Note that this routine does not filter a dataframe on its contents. The filter is applied to the labels of the index.
Parameters
items: list-like
Keep labels from axis which are in items.
like: str
Keep labels from axis for which “like in label == True”.
regex: str (regular expression)
Keep labels from axis for which re.search(regex, label) == True.
axis: {0 or ‘index’, 1 or ‘columns’, None}, default None
The axis to filter on, expressed either as an index (int) or axis name (str). By default this is the info axis, ‘index’ for Series, ‘columns’ for DataFrame.
# Keep only the two columns needed for the test. Given a list, filter()
# selects by label (column labels for a DataFrame), not by cell contents.
smoking_and_birthweight = births.filter(['Maternal Smoker', 'Birth Weight'])
smoking_and_birthweight
Maternal Smoker | Birth Weight | |
---|---|---|
0 | False | 120 |
1 | False | 113 |
2 | True | 128 |
3 | True | 108 |
4 | False | 136 |
... | ... | ... |
1169 | False | 113 |
1170 | False | 128 |
1171 | True | 130 |
1172 | False | 125 |
1173 | False | 117 |
1174 rows × 2 columns
# Count the recorded birth weights in each group (non-smokers vs. smokers).
SB = (
    smoking_and_birthweight
    .groupby(['Maternal Smoker'])
    .count()
)
SB
Birth Weight | |
---|---|
Maternal Smoker | |
False | 715 |
True | 459 |
# Overlay the birth-weight histograms of the two groups on one set of axes.
# N.B. passing 'ax=ax' plots both groups' data on the shared axis.
fig, ax = plt.subplots(figsize=(8,6))
smoking_and_birthweight.groupby(by='Maternal Smoker').hist('Birth Weight', bins = 20, alpha=0.5, ax=ax)
# NOTE(review): the legend order assumes the groups are drawn in groupby key
# order (False, then True) -- confirm against the rendered plot.
ax.legend(['Non Smoker', 'Smoker']);
[Question] What values of our statistic are in favor of the alternative: positive or negative?
# Mean birth weight per group; the result is indexed by the group label.
means_table = (
    smoking_and_birthweight
    .groupby(by='Maternal Smoker')
    .mean()
)
means_table
Birth Weight | |
---|---|
Maternal Smoker | |
False | 123.085315 |
True | 113.819172 |
# First row of the grouped means (Maternal Smoker == False). .iloc makes the
# positional lookup explicit; plain [0] on this boolean-labelled index only
# works through a positional fallback that pandas has deprecated.
means_table['Birth Weight'].iloc[0]
123.08531468531469
type(means_table)
pandas.core.frame.DataFrame
# mean infant birthweight for maternal smoker == False
# The grouped table lists False first, so this is row position 0. .iloc is
# used because integer [] lookup on a non-integer index is deprecated.
means_table.loc[:, 'Birth Weight'].iloc[0]
# or
# means_table.iloc[0]['Birth Weight']
123.08531468531469
# mean infant birthweight for maternal smoker == True
# The grouped table lists True second, so this is row position 1. .iloc is
# used because integer [] lookup on a non-integer index is deprecated.
means_table.loc[:, 'Birth Weight'].iloc[1]
113.81917211328977
# Selecting one column label yields a Series indexed by the group label.
means = means_table['Birth Weight'] # single column
means
Maternal Smoker False 123.085315 True 113.819172 Name: Birth Weight, dtype: float64
# Observed test statistic: mean birth weight of the second group (smokers)
# minus that of the first group (non-smokers).
means = pd.DataFrame(means)  # convert to df
group_means = means['Birth Weight']
# .iloc selects by position explicitly; integer [] lookup on this
# boolean-labelled index relies on a deprecated positional fallback.
observed_difference = group_means.iloc[1] - group_means.iloc[0]
observed_difference
-9.266142572024918
def difference_of_means(df, col_label, group_label):
    """Return the difference between the two group means of a column.

    Takes: DataFrame, column label of numerical variable, column label of
    group-label variable
    Returns: Mean of the second group minus mean of the first group, with
    the groups in the sorted key order produced by groupby (for boolean
    labels: True minus False)

    The column averaged is the one named by ``col_label``; it is not assumed
    to be 'Birth Weight'. Fewer than two groups raises IndexError.
    """
    group_means = df.groupby(by=group_label)[col_label].mean()
    # .iloc selects by position explicitly; integer [] lookup on a
    # non-integer index is a deprecated positional fallback.
    return group_means.iloc[1] - group_means.iloc[0]
# Same two-column selection that difference_of_means performs internally,
# shown step by step.
reduced = births[['Birth Weight', 'Maternal Smoker']]
reduced
Birth Weight | Maternal Smoker | |
---|---|---|
0 | 120 | False |
1 | 113 | False |
2 | 128 | True |
3 | 108 | True |
4 | 136 | False |
... | ... | ... |
1169 | 113 | False |
1170 | 128 | False |
1171 | 130 | True |
1172 | 125 | False |
1173 | 117 | False |
1174 rows × 2 columns
# Group the reduced table by the label column and average the remaining
# numerical column within each group.
group_label = 'Maternal Smoker'
means_table = reduced.groupby(by=group_label).mean()
means_table
Birth Weight | |
---|---|
Maternal Smoker | |
False | 123.085315 |
True | 113.819172 |
difference_of_means(births, 'Birth Weight', 'Maternal Smoker')
-9.266142572024918
# Small five-row example table used to demonstrate DataFrame.sample.
letters = pd.DataFrame({'Letter':('a', 'b', 'c', 'd', 'e')})
letters
Letter | |
---|---|
0 | a |
1 | b |
2 | c |
3 | d |
4 | e |
# N.B. sample() returns a random sample of items from an axis of the object.
# Called with no arguments it returns a single randomly chosen row.
letters.sample()
Letter | |
---|---|
1 | b |
# Unless the number of rows is specified, the pandas 'sample' method returns
# a single row of the DataFrame. To sample several rows, pass an integer as
# the first argument (here 5; len(df) would also work), or instead pass
# 'frac' (frac=1 means 100% of the rows). The integer count and 'frac'
# cannot be used together.
# Setting 'replace=True' allows the same row to be drawn more than once.
letters.sample(5, replace=True)
Letter | |
---|---|
3 | d |
0 | a |
0 | a |
1 | b |
4 | e |
# Leaving 'replace' unset uses its default, replace=False, i.e. sampling
# without replacement. Drawing len(letters) rows without replacement returns
# every row exactly once in random order, which is a shuffle.
letter_s = letters.sample(len(letters))
letter_s
Letter | |
---|---|
4 | e |
3 | d |
2 | c |
1 | b |
0 | a |
# A column accessed as an attribute is a Series. Indexing it with [0] on this
# integer-labelled index looks up the row labelled 0, not the first row of
# the shuffled frame.
print(type(letter_s.Letter))
print(type(letter_s.Letter[0]))
<class 'pandas.core.series.Series'> <class 'str'>
# Extract the shuffled values as a plain array, without the row labels.
#let = list(letter_s['Letter'])
# or
let = np.array(letter_s['Letter'])
let
array(['e', 'd', 'c', 'b', 'a'], dtype=object)
# NOTE(review): 'shuffle' is not used again in the visible part of this file.
shuffle = pd.DataFrame({'Shuffled': let})
# Assigning a plain array pairs values with rows by position, so the
# shuffled order is kept in the new column.
letters['Shuffled'] = let
letters
Letter | Shuffled | |
---|---|---|
0 | a | e |
1 | b | d |
2 | c | c |
3 | d | b |
4 | e | a |
smoking_and_birthweight.head()
Maternal Smoker | Birth Weight | |
---|---|---|
0 | False | 120 |
1 | False | 113 |
2 | True | 128 |
3 | True | 108 |
4 | False | 136 |
# Shuffle the smoker labels (a full-length sample without replacement) and
# renumber the rows 0..n-1 so the labels can be attached by position.
shuffled_labels = (
    smoking_and_birthweight[['Maternal Smoker']]
    .sample(len(smoking_and_birthweight))
    .reset_index(drop=True)
)
shuffled_labels
Maternal Smoker | |
---|---|
0 | False |
1 | True |
2 | True |
3 | False |
4 | False |
... | ... |
1169 | False |
1170 | True |
1171 | False |
1172 | False |
1173 | False |
1174 rows × 1 columns
# Attach the shuffled labels as a third column beside the original labels.
# Working on a copy leaves smoking_and_birthweight itself unchanged.
# (The commented-out assign() variant would name the column 'Shuffled_Label'.)
#original_and_shuffled = smoking_and_birthweight.assign(Shuffled_Label = shuffled_labels)
# or
original_and_shuffled = smoking_and_birthweight.copy()
original_and_shuffled['Shuffled Label'] = shuffled_labels
original_and_shuffled
Maternal Smoker | Birth Weight | Shuffled Label | |
---|---|---|---|
0 | False | 120 | False |
1 | False | 113 | True |
2 | True | 128 | True |
3 | True | 108 | False |
4 | False | 136 | False |
... | ... | ... | ... |
1169 | False | 113 | False |
1170 | False | 128 | True |
1171 | True | 130 | False |
1172 | False | 125 | False |
1173 | False | 117 | False |
1174 rows × 3 columns
difference_of_means(original_and_shuffled, 'Birth Weight', 'Shuffled Label')
1.0506482624129632
difference_of_means(original_and_shuffled, 'Birth Weight', 'Maternal Smoker')
-9.266142572024918
def one_simulated_difference(df, label, group_label):
    """Simulate one value of the test statistic with shuffled group labels.

    Takes: DataFrame, column label of numerical variable,
    column label of group-label variable
    Returns: Difference of means of the two groups after shuffling labels

    The caller's DataFrame is not modified: the shuffled labels are placed in
    a new two-column table rather than added to ``df``.
    """
    # A full-length sample without replacement is a random permutation.
    # .to_numpy() drops the sampled row labels so the values are attached by
    # position; index alignment would otherwise undo the shuffle.
    shuffled_labels = df[group_label].sample(len(df)).to_numpy()
    # table of numerical variable and shuffled labels
    shuffled_df = df[[label]].assign(**{'Shuffled Label': shuffled_labels})
    return difference_of_means(shuffled_df, label, 'Shuffled Label')
one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')
1.0327620092325986
# Simulate the statistic 2500 times with shuffled labels. The values are
# collected in a list and converted once, because np.append inside the loop
# would copy the whole array on every iteration.
differences = np.array([
    one_simulated_difference(births, 'Birth Weight', 'Maternal Smoker')
    for _ in range(2500)
])
differences
array([ 1.39048707, 0.83959047, -0.35163399, ..., 0.87178573, -0.36594299, 1.2366653 ])
len(differences)
2500
# Recompute the observed statistic: mean birth weight of the second group
# (smokers) minus that of the first group (non-smokers).
means = pd.DataFrame(means)  # convert to df
group_means = means['Birth Weight']
# .iloc selects by position explicitly; integer [] lookup on this
# boolean-labelled index relies on a deprecated positional fallback.
observed_difference = group_means.iloc[1] - group_means.iloc[0]
observed_difference
-9.266142572024918
# Histogram of the simulated differences, with the observed difference
# printed and marked as a red point at height 0.
df = pd.DataFrame({'Difference Between Group Means' : differences})
print('Observed Difference:', observed_difference)
df.hist(ec='yellow');
# Clipping is turned off on the marker so it is not cut off at the axes edge.
plt.scatter(observed_difference, 0, color = 'red', s=40, zorder=10).set_clip_on(False);
Observed Difference: -9.266142572024918
# Same plot as before with 25 bins and white bar edges.
df = pd.DataFrame({'Difference Between Group Means' : differences})
df.hist(bins=25, ec='white')
# Observed difference as a red point at height 0, with clipping turned off.
plt.scatter(observed_difference, 0, color = 'red', s=40, zorder=10).set_clip_on(False);