import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')


# Load the flight table from CSV into a DataFrame.
united = pd.read_csv('../data/united.csv')

# Preview the first rows of the table.
united.head()


# Add a 'Row' column numbering the rows 0, 1, ..., len(united) - 1.
# A newly assigned column is appended as the last column.
united['Row'] = np.arange(len(united))
united.head()


# Move the 'Row' column to the first position.
col_name = 'Row'

# N.B. '.pop()' is used instead of '.drop()': it removes the column from the
# frame in place AND returns it, so it can be re-inserted straight away.
first_col = united.pop(col_name)

# Re-insert the popped column at position 0 (the left-most column).
united.insert(0, col_name, first_col)

united


# Bin edges shared by the delay histograms: -20 up to 200 in steps of 5.
united_bins = np.arange(-20, 201, 5)


# (POPULATION - not sample) Probability Distribution
# Histogram of 'Delay' over every row of `united`.
united.hist('Delay', bins = united_bins, ec='white');


# (SAMPLE) Empirical Distribution
# Histogram of 'Delay' for 10 rows drawn at random from `united`.

united.sample(10).hist('Delay', bins = united_bins);


# (Sample) Empirical Distribution
# Same plot for a random sample of 1000 rows.

united.sample(1000).hist('Delay', bins = united_bins);


# (Population) PARAMETER

np.median(united['Delay'])

2.0


# (Sample) STATISTIC

np.median(united['Delay'].sample(10))

3.0


# (Sample) STATISTIC

np.median(united['Delay'].sample(100))

1.0


# What has been 'hard coded' into this function?

def sample_median(size):
    """Return the median 'Delay' of `size` rows drawn at random from `united`.

    The table (`united`) and the column ('Delay') are fixed inside the
    function; the caller chooses only the sample size.
    """
    delay_column = united[['Delay']]
    random_rows = delay_column.sample(size)
    return np.median(random_rows)

sample_median(10)

5.0


sample_median(10)

-2.0


num_simulations = 4000


# Simulate `num_simulations` samples of size 10 and keep each sample's median.
# The medians are gathered in one pass and converted to an array once:
# np.append copies the whole array on every call, so growing an array
# inside a loop does quadratic work.
sample_medians = np.array([sample_median(10) for _ in range(num_simulations)])
    
sample_medians

array([ 7. , -1.5, 13. , ...,  8.5, -1. ,  0. ])


# How many elements should the array 'sample_medians' contain?
len(sample_medians)

4000


pd.DataFrame({'Sample medians (size=10)':sample_medians}).hist(bins = 20, ec='white');


# Repeat the simulation with samples of size 1000, one median per sample.
# The medians are gathered in one pass and converted to an array once:
# np.append copies the whole array on every call, so growing an array
# inside a loop does quadratic work.
sample_medians = np.array([sample_median(1000) for _ in range(num_simulations)])


pd.DataFrame({'Sample medians (size=1k)':sample_medians}).hist(bins = 20, ec='white');


num_simulations = 2000

# One plain list per sample size. Lists grow cheaply, whereas np.append
# copies the whole array on every call; each list is converted to an
# array once, after the loop.
medians_10, medians_100, medians_1000 = [], [], []

for _ in range(num_simulations):
    # Each repetition draws the three samples in the same order:
    # size 10, then size 100, then size 1000.
    medians_10.append(sample_median(10))
    medians_100.append(sample_median(100))
    medians_1000.append(sample_median(1000))

sample_medians_10 = np.array(medians_10)
sample_medians_100 = np.array(medians_100)
sample_medians_1000 = np.array(medians_1000)


# Collect the three simulated arrays as the columns of a single table,
# one row per simulation.
sample_medians = pd.DataFrame({
    'Size 10': sample_medians_10,
    'Size 100': sample_medians_100,
    'Size 1000': sample_medians_1000,
})
sample_medians


# Overlay the three empirical distributions on the same axes, each drawn
# semi-transparent in its own colour and with the same unit-width bins.
overlay_bins = np.arange(-5, 30)
for column, colour in (('Size 10', 'green'),
                       ('Size 100', 'blue'),
                       ('Size 1000', 'yellow')):
    sample_medians[column].hist(bins=overlay_bins, color=colour, alpha=0.5)
plt.show()


population_proportions = np.array([.26, .74])
population_proportions

array([0.26, 0.74])


def sample_proportions(sample_size, probabilities):
    """Simulate one multinomial sample and return the share in each category.

    sample_size   -- number of draws in the simulated sample
    probabilities -- probability of each category

    Returns an array with one entry per category: the category's count in
    the simulated sample divided by `sample_size`.
    """
    category_counts = np.random.multinomial(sample_size, probabilities)
    return category_counts / sample_size


def panel_proportion():
    """Simulate one panel of 100 and return the share in the first category.

    The panel is drawn using `population_proportions`; only the proportion
    for its first entry is returned, as a plain Python float.
    """
    simulated_panel = sample_proportions(100, population_proportions)
    return simulated_panel.item(0)


panel_proportion()

0.32


# Simulate 10,000 panels. Each proportion is scaled by 100, which turns it
# into a count out of the panel size of 100.
# The values are gathered in one pass and converted to an array once:
# np.append copies the whole array on every call, so growing an array
# inside a loop does quadratic work.
panels = np.array([panel_proportion() * 100 for _ in range(10000)])


# Histogram of the simulated panel counts, with unit-width bins whose edges
# sit on the half-integers (5.5, 6.5, ...) so each whole count gets its own bar.
NBM = pd.DataFrame({'Number of Black Men on Panel of 100':panels}).hist(bins=np.arange(5.5,40.), ec='white')
# Red dot on the horizontal axis at 8, drawn with clipping disabled.
# NOTE(review): 8 is presumably the observed count being compared with the
# simulation - confirm against the lecture material.
plt.scatter(8, 0, color='red', s=50, zorder=10).set_clip_on(False);


## Mendel had 929 plants, of which 709 had purple flowers
observed_purples = 709 / 929
observed_purples

0.7631862217438106


predicted_proportions = np.array([.75, .25])

def sample_proportions(sample_size, probabilities):
    """Draw one multinomial sample and return each category's proportion.

    sample_size   -- number of draws in the simulated sample
    probabilities -- probability of each category

    Returns an array holding, for every category, its simulated count
    divided by `sample_size`.
    """
    simulated_counts = np.random.multinomial(sample_size, probabilities)
    return simulated_counts / sample_size

sample_proportions(929, predicted_proportions)

array([0.72766416, 0.27233584])


def purple_flowers():
    """Simulate 929 plants under the model and return the percent that are purple.

    The sample is drawn using `predicted_proportions`; the proportion for its
    first entry is converted to a percentage (0-100).
    """
    simulated_sample = sample_proportions(929, predicted_proportions)
    return simulated_sample.item(0) * 100


purple_flowers()

72.22820236813779


# Simulate 10,000 samples of 929 plants and keep the percent of purple
# flowers in each one.
# The values are gathered in one pass and converted to an array once:
# np.append copies the whole array on every call, so growing an array
# inside a loop does quadratic work.
purples = np.array([purple_flowers() for _ in range(10000)])


PercentPurple = pd.DataFrame({'Percent of purple flowers in sample of 929':purples}).hist(ec='white');


Discrepancy = pd.DataFrame({'Discrepancy in sample of 929 if the model is true':abs(purples- 75)}).hist(ec='white');


# Histogram of the simulated discrepancies |percent purple - 75|, drawn
# semi-transparent so the observed-value marker added below stays visible.
Discrepancy = pd.DataFrame({'Discrepancy in sample of 929 if the model is true':abs(purples- 75)})\
.hist(alpha=0.5, ec='white', zorder=5)

# Observed discrepancy: distance between the observed percent of purple
# flowers (observed_purples * 100) and the 75 used for the simulated values.
obs = abs((observed_purples * 100) - 75)
print("Obs =", obs)
# Mark the observed discrepancy as a red dot on the horizontal axis, with
# clipping disabled on the marker.
plt.scatter(obs, 0, color='red', s=50, zorder=10).set_clip_on(False);

Obs = 1.318622174381062

	Date	Flight Number	Destination	Delay
0	6/1/15	73	HNL	257
1	6/1/15	217	EWR	28
2	6/1/15	237	STL	-3
3	6/1/15	250	SAN	0
4	6/1/15	267	PHL	64

	Date	Flight Number	Destination	Delay	Row
0	6/1/15	73	HNL	257	0
1	6/1/15	217	EWR	28	1
2	6/1/15	237	STL	-3	2
3	6/1/15	250	SAN	0	3
4	6/1/15	267	PHL	64	4

	Row	Date	Flight Number	Destination	Delay
0	0	6/1/15	73	HNL	257
1	1	6/1/15	217	EWR	28
2	2	6/1/15	237	STL	-3
3	3	6/1/15	250	SAN	0
4	4	6/1/15	267	PHL	64
...	...	...	...	...	...
13820	13820	8/31/15	1978	LAS	-4
13821	13821	8/31/15	1993	IAD	8
13822	13822	8/31/15	1994	ORD	3
13823	13823	8/31/15	2000	PHX	-1
13824	13824	8/31/15	2013	EWR	-2

Lecture - Large Random Samples

Statistics - the fun starts here!

numpy.median

Probability & Empirical Distributions of a Statistic

Empirical Distributions Overlaid

np.random.multinomial()

Mendel and Pea Flowers

A Model

Choosing a Statistic

	Size 10	Size 100	Size 1000
0	8.0	1.0	3.0
1	4.5	0.0	3.0
2	8.5	1.0	3.0
3	-0.5	1.5	2.0
4	5.0	2.5	2.0
...	...	...	...
1995	5.5	2.0	2.0
1996	0.0	2.5	3.0
1997	9.0	4.0	2.0
1998	0.0	8.0	2.0
1999	1.5	3.0	3.0