import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
united = pd.read_csv('../data/united.csv')
united.head()
Date | Flight Number | Destination | Delay | |
---|---|---|---|---|
0 | 6/1/15 | 73 | HNL | 257 |
1 | 6/1/15 | 217 | EWR | 28 |
2 | 6/1/15 | 237 | STL | -3 |
3 | 6/1/15 | 250 | SAN | 0 |
4 | 6/1/15 | 267 | PHL | 64 |
# Add an explicit 0-based row-number column (0 .. len(united) - 1).
united['Row'] = np.arange(len(united))
united.head()
Date | Flight Number | Destination | Delay | Row | |
---|---|---|---|---|---|
0 | 6/1/15 | 73 | HNL | 257 | 0 |
1 | 6/1/15 | 217 | EWR | 28 | 1 |
2 | 6/1/15 | 237 | STL | -3 | 2 |
3 | 6/1/15 | 250 | SAN | 0 | 3 |
4 | 6/1/15 | 267 | PHL | 64 | 4 |
# Move the 'Row' column to the first position.
col_name = 'Row'
# N.B. '.pop()' is used instead of '.drop()': pop removes the column from the
# frame in place AND returns it, so the same Series can be re-inserted at
# position 0.
first_col = united.pop(col_name)
united.insert(0, col_name, first_col)
united
Row | Date | Flight Number | Destination | Delay | |
---|---|---|---|---|---|
0 | 0 | 6/1/15 | 73 | HNL | 257 |
1 | 1 | 6/1/15 | 217 | EWR | 28 |
2 | 2 | 6/1/15 | 237 | STL | -3 |
3 | 3 | 6/1/15 | 250 | SAN | 0 |
4 | 4 | 6/1/15 | 267 | PHL | 64 |
... | ... | ... | ... | ... | ... |
13820 | 13820 | 8/31/15 | 1978 | LAS | -4 |
13821 | 13821 | 8/31/15 | 1993 | IAD | 8 |
13822 | 13822 | 8/31/15 | 1994 | ORD | 3 |
13823 | 13823 | 8/31/15 | 2000 | PHX | -1 |
13824 | 13824 | 8/31/15 | 2013 | EWR | -2 |
13825 rows × 5 columns
# Histogram bin edges: width-5 bins running from -20 up to 200.
united_bins = np.arange(-20, 201, 5)
# (POPULATION - not sample) Probability Distribution
# Drawn from every row of `united`.
united.hist('Delay', bins = united_bins, ec='white');
# (SAMPLE) Empirical Distribution
# Same bins, but only 10 randomly drawn rows, so the shape changes run to run.
united.sample(10).hist('Delay', bins = united_bins);
# (Sample) Empirical Distribution
# Same bins with 1000 randomly drawn rows.
united.sample(1000).hist('Delay', bins = united_bins);
numpy.median
(a, axis=None, out=None, overwrite_input=False, keepdims=False)
Compute the median along the specified axis. The median is the positional middle value of the sorted data, and it can be more descriptive of a list than the mean.
Returns the median of the array elements.
# (Population) PARAMETER
np.median(united['Delay'])
2.0
# (Sample) STATISTIC
np.median(united['Delay'].sample(10))
3.0
# (Sample) STATISTIC
np.median(united['Delay'].sample(100))
1.0
# What has been 'hard coded' into this function?
def sample_median(size):
    """Return the median delay of one random sample of `size` flights.

    The table (`united`) and the column ('Delay') are fixed inside the
    function; only the sample size can be varied by the caller.
    """
    delays = united[['Delay']]
    drawn = delays.sample(size)
    return np.median(drawn)
sample_median(10)
5.0
sample_median(10)
-2.0
num_simulations = 4000
# Simulate the statistic `num_simulations` times with samples of size 10.
# The medians are collected in a single pass: calling np.append inside a loop
# copies the whole array on every iteration, which is quadratic work.
sample_medians = np.array(
    [sample_median(10) for _ in range(num_simulations)], dtype=float
)
sample_medians
array([ 7. , -1.5, 13. , ..., 8.5, -1. , 0. ])
# How many elements should the array 'sample_medians' contain?
len(sample_medians)
4000
pd.DataFrame({'Sample medians (size=10)':sample_medians}).hist(bins = 20, ec='white');
# Repeat the simulation with samples of size 1000 (reuses num_simulations).
# Built in one pass rather than with np.append in a loop, which would copy
# the growing array on every iteration.
sample_medians = np.array(
    [sample_median(1000) for _ in range(num_simulations)], dtype=float
)
pd.DataFrame({'Sample medians (size=1k)':sample_medians}).hist(bins = 20, ec='white');
# Simulate the sample median for three sample sizes side by side.
num_simulations = 2000
sample_sizes = (10, 100, 1000)

# Accumulate into plain lists (amortised O(1) append) instead of np.append,
# which copies the entire array on every call. Within each repetition the
# samples are still drawn in the order 10, 100, 1000.
medians_by_size = {size: [] for size in sample_sizes}
for _ in range(num_simulations):
    for size in sample_sizes:
        medians_by_size[size].append(sample_median(size))

# Keep the per-size arrays available under their original names.
sample_medians_10 = np.array(medians_by_size[10], dtype=float)
sample_medians_100 = np.array(medians_by_size[100], dtype=float)
sample_medians_1000 = np.array(medians_by_size[1000], dtype=float)

# One column per sample size, one row per repetition.
sample_medians = pd.DataFrame({'Size 10':sample_medians_10,
                               'Size 100':sample_medians_100,
                               'Size 1000':sample_medians_1000})
sample_medians
Size 10 | Size 100 | Size 1000 | |
---|---|---|---|
0 | 8.0 | 1.0 | 3.0 |
1 | 4.5 | 0.0 | 3.0 |
2 | 8.5 | 1.0 | 3.0 |
3 | -0.5 | 1.5 | 2.0 |
4 | 5.0 | 2.5 | 2.0 |
... | ... | ... | ... |
1995 | 5.5 | 2.0 | 2.0 |
1996 | 0.0 | 2.5 | 3.0 |
1997 | 9.0 | 4.0 | 2.0 |
1998 | 0.0 | 8.0 | 2.0 |
1999 | 1.5 | 3.0 | 3.0 |
2000 rows × 3 columns
# Overlay the three distributions of sample medians on shared unit-width bins;
# the partial transparency keeps all three visible where they overlap.
median_bins = np.arange(-5, 30)
for column, colour in (('Size 10', 'green'),
                       ('Size 100', 'blue'),
                       ('Size 1000', 'yellow')):
    sample_medians[column].hist(bins=median_bins, color=colour, alpha=0.5)
plt.show()
# Two-category model proportions (they sum to 1).
# NOTE(review): the first entry (.26) appears to be the category that the
# histogram below labels 'Black Men' — confirm against the data source.
population_proportions = np.array([.26, .74])
population_proportions
array([0.26, 0.74])
Draw samples from a multinomial distribution
def sample_proportions(sample_size, probabilities):
    """Simulate one multinomial sample and return the category proportions.

    Parameters
    ----------
    sample_size : int
        Number of individuals drawn in the sample; must be positive.
    probabilities : array-like of float
        Chance of each category, passed straight to np.random.multinomial.

    Returns
    -------
    numpy.ndarray
        Proportion of the sample that fell in each category, in the same
        order as `probabilities`.

    Raises
    ------
    ValueError
        If `sample_size` is not positive. A size of zero would otherwise
        produce NaN proportions from 0 / 0 instead of a clear error.
    """
    if sample_size <= 0:
        raise ValueError("sample_size must be a positive integer")
    counts = np.random.multinomial(sample_size, probabilities)
    return counts / sample_size
def panel_proportion():
    """Simulate one panel of 100 and return the first category's proportion.

    Draws from `population_proportions` via `sample_proportions` and returns
    the first entry as a plain Python float.
    """
    simulated = sample_proportions(100, population_proportions)
    return simulated.item(0)
panel_proportion()
0.32
# Simulate 10,000 panels of 100 and record each as a count out of 100
# (proportion * 100). Built in one pass: np.append in a loop would copy the
# growing array on every iteration.
panels = np.array([panel_proportion() * 100 for _ in range(10000)], dtype=float)
# Histogram of the simulated counts. Bin edges sit at half-integers
# (5.5, 6.5, ..., 39.5) so each whole-number count lands inside its own bin.
NBM = pd.DataFrame({'Number of Black Men on Panel of 100':panels}).hist(bins=np.arange(5.5,40.), ec='white')
# Red marker on the horizontal axis at 8, with clipping disabled so the dot
# is not cut off by the axes boundary.
# NOTE(review): 8 is presumably the observed count being compared with the
# simulation — confirm.
plt.scatter(8, 0, color='red', s=50, zorder=10).set_clip_on(False);
## Mendel had 929 plants, of which 709 had purple flowers
observed_purples = 709 / 929
observed_purples
0.7631862217438106
predicted_proportions = np.array([.75, .25])
def sample_proportions(sample_size, probabilities):
    """Simulate one multinomial sample and return the category proportions.

    Parameters
    ----------
    sample_size : int
        Number of individuals drawn in the sample; must be positive.
    probabilities : array-like of float
        Chance of each category, passed straight to np.random.multinomial.

    Returns
    -------
    numpy.ndarray
        Proportion of the sample that fell in each category, in the same
        order as `probabilities`.

    Raises
    ------
    ValueError
        If `sample_size` is not positive. A size of zero would otherwise
        produce NaN proportions from 0 / 0 instead of a clear error.
    """
    if sample_size <= 0:
        raise ValueError("sample_size must be a positive integer")
    counts = np.random.multinomial(sample_size, probabilities)
    return counts / sample_size
sample_proportions(929, predicted_proportions)
array([0.72766416, 0.27233584])
def purple_flowers():
    """Simulate 929 plants under the model and return the first category as a percent.

    Draws one sample from `predicted_proportions` via `sample_proportions`
    and scales the first proportion by 100.
    """
    simulated = sample_proportions(929, predicted_proportions)
    return simulated.item(0) * 100
purple_flowers()
72.22820236813779
# Simulate 10,000 samples of 929 plants under the model and record the
# percent of purple flowers in each. Built in one pass: np.append in a loop
# would copy the growing array on every iteration.
purples = np.array([purple_flowers() for _ in range(10000)], dtype=float)
# Distribution of the simulated percent of purple flowers.
PercentPurple = pd.DataFrame({'Percent of purple flowers in sample of 929':purples}).hist(ec='white');
# Test statistic: absolute distance of each simulated percent from the
# model's 75%.
Discrepancy = pd.DataFrame({'Discrepancy in sample of 929 if the model is true':abs(purples- 75)}).hist(ec='white');
# Same histogram again, drawn semi-transparent so the observed value can be
# overlaid on it.
Discrepancy = pd.DataFrame({'Discrepancy in sample of 929 if the model is true':abs(purples- 75)})\
.hist(alpha=0.5, ec='white', zorder=5)
# Observed discrepancy: distance of the observed percent from 75.
obs = abs((observed_purples * 100) - 75)
print("Obs =", obs)
# Mark the observed value on the horizontal axis; clipping is disabled so the
# dot is not cut off by the axes boundary.
plt.scatter(obs, 0, color='red', s=50, zorder=10).set_clip_on(False);
Obs = 1.318622174381062