#from datascience import *
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
# Apply the 'fivethirtyeight' matplotlib style to the figures drawn below
plt.style.use('fivethirtyeight')
# Load the scores table; the rest of the notebook uses its 'Section' and 'Midterm' columns
scores = pd.read_csv('../data/scores_by_section.csv')
# Display the table
scores
Section | Midterm | |
---|---|---|
0 | 1 | 22 |
1 | 2 | 12 |
2 | 2 | 23 |
3 | 2 | 14 |
4 | 1 | 20 |
... | ... | ... |
354 | 5 | 24 |
355 | 2 | 16 |
356 | 2 | 17 |
357 | 12 | 16 |
358 | 10 | 14 |
359 rows × 2 columns
# Count the non-null 'Midterm' entries in each section
scores1 = scores.groupby('Section').count()
scores1
Midterm | |
---|---|
Section | |
1 | 32 |
2 | 32 |
3 | 27 |
4 | 30 |
5 | 33 |
6 | 32 |
7 | 24 |
8 | 29 |
9 | 30 |
10 | 34 |
11 | 26 |
12 | 30 |
# Average midterm score within each section
scores.groupby('Section').mean()
Midterm | |
---|---|
Section | |
1 | 15.593750 |
2 | 15.125000 |
3 | 13.666667 |
4 | 14.766667 |
5 | 17.454545 |
6 | 15.031250 |
7 | 16.625000 |
8 | 16.310345 |
9 | 14.566667 |
10 | 15.235294 |
11 | 15.807692 |
12 | 15.733333 |
# Observed test statistic, entered by hand; it matches the Section 3 mean
# (13.666667) in the per-section averages, rounded to four decimals
observed_average = 13.6667
# Draw 27 rows without replacement (Section 3 has 27 students) and
# compute the average midterm score of that sample
random_sample = scores.sample(n=27, replace=False)
random_sample['Midterm'].mean()
16.62962962962963
# Same sample average, computed with NumPy instead of the pandas Series method
np.average(random_sample['Midterm'])
16.62962962962963
# Simulate one value of the test statistic
# under the hypothesis that the section is like a random sample from the class
# by default this function uses the 'scores' data set and a sample of 27 rows
def random_sample_midterm_avg(sample_size=27, data=None):
    """Return the average 'Midterm' score of one random sample of rows.

    Args:
        sample_size: Number of rows to draw without replacement (default 27).
        data: DataFrame with a 'Midterm' column to sample from. When None,
            the module-level ``scores`` table is used.

    Returns:
        The mean of the sampled 'Midterm' values.
    """
    if data is None:
        # Fall back to the notebook's global table so existing no-argument
        # calls keep working.
        data = scores
    random_sample = data.sample(sample_size, replace=False)
    return np.average(random_sample['Midterm'])
# Simulate 50,000 copies of the test statistic - may take a moment or two
# Collect the draws in a list and convert to an array once: calling np.append
# inside the loop would copy the whole array on every iteration.
sample_averages = np.array([random_sample_midterm_avg() for _ in range(50000)])
# Peek at the first ten simulated averages
sample_averages[:10]
array([15.55555556, 15.44444444, 15.18518519, 15.18518519, 15.33333333, 13.14814815, 15.59259259, 16.77777778, 14.51851852, 14.96296296])
# Compare the simulated distribution of the statistic
# and the actual observed statistic
averages_tbl = pd.DataFrame({'Random Sample Average': sample_averages})
averages_tbl.hist(bins=20, alpha=0.5, ec='white', zorder=5)
# Red dot on the horizontal axis at the observed average; clipping is turned
# off so the marker is not cut by the axes boundary
observed_dot = plt.scatter(x=observed_average, y=0, color='red', s=40, zorder=10)
observed_dot.set_clip_on(False)
plt.show()
averages_tbl
Random Sample Average | |
---|---|
0 | 15.555556 |
1 | 15.444444 |
2 | 15.185185 |
3 | 15.185185 |
4 | 15.333333 |
... | ... |
49995 | 16.407407 |
49996 | 17.222222 |
49997 | 14.851852 |
49998 | 16.000000 |
49999 | 16.925926 |
50000 rows × 1 columns
# what proportion of the results were less than or equal to the observed average?
# Count with NumPy and divide by the actual number of simulations, so the
# result stays correct if the simulation count is changed.
np.count_nonzero(sample_averages <= observed_average) / len(sample_averages)
0.05676
# Order the simulated averages from smallest to largest; the original row
# labels are kept, so the index is no longer in positional order
averages_tbl = averages_tbl.sort_values('Random Sample Average')
averages_tbl
Random Sample Average | |
---|---|
28119 | 10.629630 |
27278 | 10.814815 |
3750 | 10.851852 |
26693 | 10.962963 |
47400 | 10.962963 |
... | ... |
11862 | 19.333333 |
14309 | 19.444444 |
514 | 19.592593 |
26762 | 19.703704 |
17336 | 19.740741 |
50000 rows × 1 columns
# Value at sorted position 2500 (5% of the 50,000 simulated averages).
# Use .iloc (position) rather than .loc (label): after sorting, the label 2500
# still refers to the 2500th simulation, not the 2500th smallest value.
five_percent_point = averages_tbl['Random Sample Average'].sort_values().iloc[2500]
five_percent_point
17.22222222222222
# Row at sorted position 2500 (5% of the 50,000 simulated averages).
# .iloc selects by position; .loc[2500] would instead return the row whose
# original label is 2500, wherever it landed after sorting.
five_percent_point = averages_tbl.sort_values(['Random Sample Average']).iloc[2500]
five_percent_point
Random Sample Average 17.222222 Name: 2500, dtype: float64
# Histogram of the simulated averages with a gold vertical line drawn at
# five_percent_point, from y=0 up to y=8000
averages_tbl.hist(bins=20, alpha=0.5)
plt.vlines(x=five_percent_point, ymin=0, ymax=8000, colors='gold', linestyles='solid', lw=2)
plt.title('Area to the left of the gold line: contains the 5% cut-off');
# Sorted copy of the simulated averages with a fresh 0..n-1 index, so that
# row label k is also the k-th smallest value
averages_tbl1 = (
    averages_tbl
    .sort_values(by='Random Sample Average')
    .reset_index(drop=True)
)
averages_tbl1
Random Sample Average | |
---|---|
0 | 10.629630 |
1 | 10.814815 |
2 | 10.851852 |
3 | 10.962963 |
4 | 10.962963 |
... | ... |
49995 | 19.333333 |
49996 | 19.444444 |
49997 | 19.592593 |
49998 | 19.703704 |
49999 | 19.740741 |
50000 rows × 1 columns
# With the index reset, label 2500 is the row at sorted position 2500,
# i.e. the 5% point of the 50,000 simulated averages
five_percent_point1 = averages_tbl1.loc[2500, 'Random Sample Average']
five_percent_point1
13.592592592592593
# Histogram of the simulated averages, a gold vertical line at
# five_percent_point1, and a red dot at the observed average
averages_tbl.hist(bins=20, alpha=0.5)
plt.vlines(x=five_percent_point1, ymin=0, ymax=8000, colors='gold', linestyles='solid', lw=2)
observed_dot = plt.scatter(x=observed_average, y=0, color='red', s=40, zorder=10)
# Turn clipping off so the marker on the axis is not cut off
observed_dot.set_clip_on(False)
plt.title('Area to the left of the gold line: contains the 5% cut-off');