import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
# A six-sided die as a one-column table: one row per face, values 1 through 6.
die = pd.DataFrame(np.arange(1, 7), columns=['Face'])
die
Face | |
---|---|
0 | 1 |
1 | 2 |
2 | 3 |
3 | 4 |
4 | 5 |
5 | 6 |
# Histogram of the six faces, using the default bins.
die.hist();
# Draw 10 rolls with replacement; random_state=1 pins the draw for reproducibility.
# '.reset_index()' can be chained on to replace the sampled row labels with a fresh 0..9 index.
die[['Face']].sample(n=10, random_state=1, replace=True)
Face | |
---|---|
5 | 6 |
3 | 4 |
4 | 5 |
0 | 1 |
1 | 2 |
3 | 4 |
5 | 6 |
0 | 1 |
0 | 1 |
1 | 2 |
The RandomState provides access to legacy generators. This generator is considered frozen and will have no further improvements. It is guaranteed to produce the same values as the final point release of NumPy v1.16. These all depend on Box-Muller normals or inverse CDF exponentials or gammas. This class should only be used if it is essential to have randoms that are identical to what would have been produced by previous versions of NumPy.
# Plot a histogram of the same seeded 10-roll sample.
# NOTE(review): die_random is bound to the value returned by .hist() (the plot object),
# not to the sampled rows; the bare `die_random;` below therefore displays nothing.
die_random = die[['Face']].sample(n=10, random_state=1, replace=True).hist();
die_random;
# '.reset_index()' can be chained onto the sample
# random_state is set for reproducibility
# Bin edges at 0.5, 1.5, ..., 6.5 so each integer face sits in the middle of its own bar.
roll_bins = np.arange(0.5, 6.6, 1)

# The die itself: every face appears exactly once.
die.hist(bins=roll_bins, ec='black');

# Empirical histograms for increasingly many rolls, each drawn with replacement.
for n_rolls, edge_color in ((10, 'red'), (1000, 'black'), (100000, 'white')):
    die.sample(n_rolls, replace=True).hist(bins=roll_bins, ec=edge_color)
numpy.arange([start, ]stop, [step, ]dtype=None, *, like=None)
Return evenly spaced values within a given interval.
Values are generated within the half-open interval [start, stop), in other words, the interval including start but excluding stop (the lower end is closed, the upper end is open). For integer arguments the function is equivalent to the Python built-in range function, but returns an ndarray rather than a list.
# Load united.csv (path is relative to the notebook's working directory).
united = pd.read_csv('../data/united.csv')
# Add a 0-based row counter as a new column named 'Row'.
united['Row'] = np.arange(united.shape[0])
united.head()
Date | Flight Number | Destination | Delay | Row | |
---|---|---|---|---|---|
0 | 6/1/15 | 73 | HNL | 257 | 0 |
1 | 6/1/15 | 217 | EWR | 28 | 1 |
2 | 6/1/15 | 237 | STL | -3 | 2 |
3 | 6/1/15 | 250 | SAN | 0 | 3 |
4 | 6/1/15 | 267 | PHL | 64 | 4 |
# Build the column order with the last column ('Row', added above) rotated to the front.
cols = united.columns.tolist()
cols = cols[-1:] + cols[:-1] # last name first, followed by all the others in their existing order
cols
['Row', 'Date', 'Flight Number', 'Destination', 'Delay']
# Re-select the columns in the new order so 'Row' is the first column, then display the table.
united = united[cols]
united
Row | Date | Flight Number | Destination | Delay | |
---|---|---|---|---|---|
0 | 0 | 6/1/15 | 73 | HNL | 257 |
1 | 1 | 6/1/15 | 217 | EWR | 28 |
2 | 2 | 6/1/15 | 237 | STL | -3 |
3 | 3 | 6/1/15 | 250 | SAN | 0 |
4 | 4 | 6/1/15 | 267 | PHL | 64 |
... | ... | ... | ... | ... | ... |
13820 | 13820 | 8/31/15 | 1978 | LAS | -4 |
13821 | 13821 | 8/31/15 | 1993 | IAD | 8 |
13822 | 13822 | 8/31/15 | 1994 | ORD | 3 |
13823 | 13823 | 8/31/15 | 2000 | PHX | -1 |
13824 | 13824 | 8/31/15 | 2013 | EWR | -2 |
13825 rows × 5 columns
# Boolean-mask filter: keep only rows whose Destination is 'JFK' and show the first five.
united[united['Destination']=='JFK'].head()
Row | Date | Flight Number | Destination | Delay | |
---|---|---|---|---|---|
26 | 26 | 6/1/15 | 502 | JFK | -4 |
33 | 33 | 6/1/15 | 637 | JFK | 141 |
39 | 39 | 6/1/15 | 704 | JFK | -8 |
50 | 50 | 6/1/15 | 758 | JFK | -5 |
51 | 51 | 6/1/15 | 760 | JFK | 352 |
# Every 1000th row by position (rows 0, 1000, 2000, ...).
# .iloc is positional, so this keeps selecting the intended rows even when the
# index is not the default 0..n-1 labels (.loc would look the numbers up as labels).
united.iloc[np.arange(0, len(united), 1000)]
Row | Date | Flight Number | Destination | Delay | |
---|---|---|---|---|---|
0 | 0 | 6/1/15 | 73 | HNL | 257 |
1000 | 1000 | 6/7/15 | 1692 | EWR | 7 |
2000 | 2000 | 6/14/15 | 824 | JFK | -1 |
3000 | 3000 | 6/21/15 | 217 | EWR | 5 |
4000 | 4000 | 6/27/15 | 1401 | SAN | -4 |
5000 | 5000 | 7/4/15 | 1614 | LAS | -2 |
6000 | 6000 | 7/11/15 | 401 | CLE | 4 |
7000 | 7000 | 7/17/15 | 1645 | IAD | 0 |
8000 | 8000 | 7/24/15 | 624 | ORD | 61 |
9000 | 9000 | 7/30/15 | 1728 | ORD | 209 |
10000 | 10000 | 8/6/15 | 620 | ORD | -4 |
11000 | 11000 | 8/12/15 | 1608 | EWR | -4 |
12000 | 12000 | 8/19/15 | 677 | RNO | -1 |
13000 | 13000 | 8/26/15 | 637 | JFK | 2 |
# Pick out three specific rows by their index labels.
united.loc[[34, 6321, 10040]]
Row | Date | Flight Number | Destination | Delay | |
---|---|---|---|---|---|
34 | 34 | 6/1/15 | 650 | AUS | 44 |
6321 | 6321 | 7/13/15 | 1124 | SEA | -7 |
10040 | 10040 | 8/6/15 | 1216 | LAS | 2 |
# Systematic sample: choose a random starting position in 0..999,
# then take every 1000th row from there to the end of the table.
start = np.random.choice(np.arange(1000))
systematic_sample = united.iloc[(np.arange(start, len(united), 1000))]
systematic_sample
Row | Date | Flight Number | Destination | Delay | |
---|---|---|---|---|---|
22 | 22 | 6/1/15 | 477 | ORD | 10 |
1022 | 1022 | 6/7/15 | 1946 | BOS | 0 |
2022 | 2022 | 6/14/15 | 1216 | IAH | 9 |
3022 | 3022 | 6/21/15 | 391 | SEA | 0 |
4022 | 4022 | 6/27/15 | 1717 | ORD | -2 |
5022 | 5022 | 7/4/15 | 1922 | EWR | -6 |
6022 | 6022 | 7/11/15 | 710 | DEN | -6 |
7022 | 7022 | 7/17/15 | 1754 | EWR | -4 |
8022 | 8022 | 7/24/15 | 1052 | EWR | 19 |
9022 | 9022 | 7/30/15 | 1960 | LAX | 8 |
10022 | 10022 | 8/6/15 | 1062 | MSP | 1 |
11022 | 11022 | 8/12/15 | 1731 | BOS | 1 |
12022 | 12022 | 8/19/15 | 1120 | BOS | -5 |
13022 | 13022 | 8/26/15 | 1073 | BOS | 8 |
# Show the table with 'Row' used as the index.
# The result is not assigned, so `united` itself is left unchanged.
united.set_index('Row')
Date | Flight Number | Destination | Delay | |
---|---|---|---|---|
Row | ||||
0 | 6/1/15 | 73 | HNL | 257 |
1 | 6/1/15 | 217 | EWR | 28 |
2 | 6/1/15 | 237 | STL | -3 |
3 | 6/1/15 | 250 | SAN | 0 |
4 | 6/1/15 | 267 | PHL | 64 |
... | ... | ... | ... | ... |
13820 | 8/31/15 | 1978 | LAS | -4 |
13821 | 8/31/15 | 1993 | IAD | 8 |
13822 | 8/31/15 | 1994 | ORD | 3 |
13823 | 8/31/15 | 2000 | PHX | -1 |
13824 | 8/31/15 | 2013 | EWR | -2 |
13825 rows × 4 columns
# Bin edges every 5 units from -20 up to 200 (inclusive) for the Delay histograms.
united_bins = np.arange(-20, 201, 5)
# Histogram of the Delay column over those bins.
united.hist('Delay', bins = united_bins, ec='white');
# Smallest value in the Delay column.
min(united['Delay'])
-16
# Largest value in the Delay column.
max(united['Delay'])
580
# Arithmetic mean of the Delay column.
np.average(united['Delay'])
16.658155515370705
# Delay histograms for a small and a large random sample of flights,
# drawn on the same bins so the two shapes can be compared directly.
for sample_size in (10, 1000):
    united.sample(sample_size).hist('Delay', bins=united_bins, ec='white')
    plt.show()
# Median of the Delay column over the whole table.
np.median(united['Delay'])
2.0
# Median Delay of one random sample of 10 rows; the value changes from run to run.
np.median((united[['Delay']]).sample(10))
4.5
def sample_median(size):
    """Return the median Delay of a random sample of `size` rows of `united`.

    Reads the notebook-level `united` table. Rows are drawn with the
    default settings of `DataFrame.sample`, so each call gives a new draw.
    """
    delays = united[['Delay']]
    drawn = delays.sample(size)
    return np.median(drawn)
# Try the helper on one random sample of 10 rows.
sample_median(10)
19.5
# Simulate 1000 random samples of 1000 rows each and record every sample's median.
# The array is built once from a list: calling np.append inside the loop would
# copy the whole array on every iteration.
sample_medians = np.array([sample_median(1000) for _ in range(1000)])
# Histogram of the simulated sample medians with unit-width bins from -10 to 30.
SampleMedians = (
    pd.DataFrame({'Sample medians': sample_medians})
    .hist(bins=np.arange(-10, 31), ec='white')
)

# The same data zoomed in: unit-width bins from -1 to 9.
SampleMedians = (
    pd.DataFrame({'Sample medians': sample_medians})
    .hist(bins=np.arange(-1, 10), ec='white')
)