import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
3 > 1
True
type(3 > 1)
bool
True
True
true
--------------------------------------------------------------------------- NameError Traceback (most recent call last) Input In [5], in <cell line: 1>() ----> 1 true NameError: name 'true' is not defined
3 = 3
Input In [6] 3 = 3 ^ SyntaxError: cannot assign to literal here. Maybe you meant '==' instead of '='?
3 == 3.0
True
10 != 2
True
x = 14
y = 3
x > 15
False
12 < x
True
x < 20
True
12 < x < 20
True
10 < x-y < 13
True
x > 13 and y < 3.14159
True
pets = np.array(['cat', 'cat', 'dog', 'cat', 'dog', 'rabbit'])
pets == 'cat'
array([ True, True, False, True, False, False])
1 + 1 + 0 + 1 + 0 + 0
3
#sum(make_array(True, True, False, True, False, False))
np.sum(np.array([True, True, False, True, False, False]))
3
sum(pets == 'dog')
2
np.count_nonzero(pets == 'dog')
2
x = np.arange(20, 31)
x > 28
array([False, False, False, False, False, False, False, False, False, True, True])
Let's play a game: we each roll a die.
If my number is bigger: you pay me a dollar.
If they're the same: we do nothing.
If your number is bigger: I pay you a dollar.
Steps:
# Work in progress
def one_round(my_roll, your_roll):
    """Score one round of the dice game (unfinished draft).

    Only the case where my roll beats yours is handled: it returns 1.
    Losses and ties are not scored yet, so they yield None.
    """
    if not my_roll > your_roll:
        # Nothing is decided for a loss or a tie in this draft.
        return None
    return 1
one_round(4, 3)
1
one_round(2, 6)
# Final correct version
def one_round(my_roll, your_roll):
    """Return my winnings, in dollars, for one round of the dice game.

    The result is 1 when my roll is higher, -1 when your roll is higher,
    and 0 when the two rolls are equal.
    """
    winnings = None
    if my_roll > your_roll:
        # You pay me a dollar.
        winnings = 1
    elif your_roll > my_roll:
        # I pay you a dollar.
        winnings = -1
    elif your_roll == my_roll:
        # Same number: nobody pays.
        winnings = 0
    return winnings
one_round(1, 1)
0
one_round(6, 5)
1
one_round(7, -1)
1
mornings = np.array(['wake up', 'sleep in'])
mornings
array(['wake up', 'sleep in'], dtype='<U8')
np.random.choice(mornings)
'wake up'
np.random.choice(mornings)
'sleep in'
np.random.choice(mornings)
'wake up'
np.random.choice(mornings, 7)
array(['sleep in', 'wake up', 'sleep in', 'wake up', 'wake up', 'wake up', 'wake up'], dtype='<U8')
sum(np.random.choice(mornings, 7) == 'wake up')
5
sum(np.random.choice(mornings, 7) == 'sleep in')
5
morning_week = np.random.choice(mornings, 7)
morning_week
array(['wake up', 'sleep in', 'sleep in', 'sleep in', 'sleep in', 'sleep in', 'sleep in'], dtype='<U8')
sum(morning_week == 'wake up')
1
sum(morning_week == 'sleep in')
6
die_faces = np.arange(1, 7)
die_faces
array([1, 2, 3, 4, 5, 6])
np.random.choice(die_faces)
3
def simulate_one_round():
    """Roll a die for each player and return the result of one_round.

    Each roll is an independent np.random.choice draw from die_faces;
    my roll is drawn first, then yours.
    """
    return one_round(
        np.random.choice(die_faces),  # my roll
        np.random.choice(die_faces),  # your roll
    )
simulate_one_round()
-1
first = np.arange(4)
second = np.arange(10, 17)
np.append(first, 6)
array([0, 1, 2, 3, 6])
plus = np.append(first, 6)
plus
array([0, 1, 2, 3, 6])
first
array([0, 1, 2, 3])
plus
array([0, 1, 2, 3, 6])
np.append(first, second)
array([ 0, 1, 2, 3, 10, 11, 12, 13, 14, 15, 16])
first
array([0, 1, 2, 3])
second
array([10, 11, 12, 13, 14, 15, 16])
results = np.array([])
type(results)
results = np.append(results, simulate_one_round())
results
# For Statements
for pet in np.array(['cat', 'dog', 'rabbit']):
print('I love my ' + pet)
I love my cat I love my dog I love my rabbit
pet = np.array(['cat', 'dog', 'rabbit']).item(0)
print('I love my ' + pet)
pet = np.array(['cat', 'dog', 'rabbit']).item(1)
print('I love my ' + pet)
pet = np.array(['cat', 'dog', 'rabbit']).item(2)
print('I love my ' + pet)
I love my cat I love my dog I love my rabbit
# Play five rounds and collect my winnings from each one.
# The array is built in a single step: np.append returns a new copy of
# the whole array on every call, so growing it inside a loop repeats
# that copy each round. dtype=float keeps the same element type that
# appending to an empty np.array([]) produces.
game_outcomes = np.array([simulate_one_round() for _ in range(5)], dtype=float)
game_outcomes
array([-1., 1., 1., -1., 1.])
# Play 10,000 rounds and collect my winnings from each one.
# The array is built in a single step: np.append returns a new copy of
# the whole array on every call, so growing it inside a loop copies
# ever-larger arrays 10,000 times. dtype=float keeps the same element
# type that appending to an empty np.array([]) produces.
game_outcomes = np.array(
    [simulate_one_round() for _ in range(10000)], dtype=float
)
game_outcomes
array([ 1., 1., 1., ..., 1., 1., -1.])
len(game_outcomes)
10000
results = pd.DataFrame({'My winnings':game_outcomes})
results
My winnings | |
---|---|
0 | 1.0 |
1 | 1.0 |
2 | 1.0 |
3 | -1.0 |
4 | 1.0 |
... | ... |
9995 | 1.0 |
9996 | 1.0 |
9997 | 1.0 |
9998 | 1.0 |
9999 | -1.0 |
10000 rows × 1 columns
# Count the rounds that ended with each winnings value and draw those
# counts as a horizontal bar chart.
chart = (
    results
    .groupby(by='My winnings')[['My winnings']]
    .count()
    .plot.barh()
)
# Bonus question: This simulation is relatively simple.
# Can you find a way to run it without using a 'For' loop?
coin = np.array(['heads', 'tails'])
sum(np.random.choice(coin, 100) == 'heads')
47
# Simulate one outcome
def num_heads(num_flips=100):
    """Simulate coin flips and return how many of them came up heads.

    Parameters
    ----------
    num_flips : int, optional
        How many draws to take from ``coin``. Defaults to 100, the
        number of flips this experiment has always used.

    Returns
    -------
    int
        The number of draws equal to 'heads'.
    """
    flips = np.random.choice(coin, num_flips)
    # count_nonzero tallies the True entries inside NumPy; the built-in
    # sum would walk the array one element at a time in Python.
    return np.count_nonzero(flips == 'heads')
# Decide how many times you want to repeat the experiment
repetitions = 10000
# Simulate that many outcomes
# Run the experiment `repetitions` times and keep each head count.
# The array is built in a single step: np.append returns a new copy of
# the whole array on every call, so growing it inside a loop repeats
# that copy on every repetition. dtype=float keeps the same element
# type that appending to an empty np.array([]) produces.
outcomes = np.array([num_heads() for _ in range(repetitions)], dtype=float)
outcomes
array([46., 52., 44., ..., 47., 52., 50.])
len(outcomes)
10000
heads = pd.DataFrame({'Heads':outcomes})
heads.hist(bins = np.arange(29.5, 70.6), ec='white');
trip = pd.read_csv('../data/trip.csv')
trip.head()
Trip ID | Duration | Start Date | Start Station | Start Terminal | End Date | End Station | End Terminal | Bike # | Subscriber Type | Zip Code | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 913460 | 765 | 8/31/2015 23:26 | Harry Bridges Plaza (Ferry Building) | 50 | 8/31/2015 23:39 | San Francisco Caltrain (Townsend at 4th) | 70 | 288 | Subscriber | 2139 |
1 | 913459 | 1036 | 8/31/2015 23:11 | San Antonio Shopping Center | 31 | 8/31/2015 23:28 | Mountain View City Hall | 27 | 35 | Subscriber | 95032 |
2 | 913455 | 307 | 8/31/2015 23:13 | Post at Kearny | 47 | 8/31/2015 23:18 | 2nd at South Park | 64 | 468 | Subscriber | 94107 |
3 | 913454 | 409 | 8/31/2015 23:10 | San Jose City Hall | 10 | 8/31/2015 23:17 | San Salvador at 1st | 8 | 68 | Subscriber | 95113 |
4 | 913453 | 789 | 8/31/2015 23:09 | Embarcadero at Folsom | 51 | 8/31/2015 23:22 | Embarcadero at Sansome | 60 | 487 | Customer | 9069 |
commute = trip[trip['Duration'] < 1800]
commute.hist('Duration', ec='yellow');
commute.hist('Duration', bins=60, ec='red');
# Percent of rides with a duration between 250 and 500 seconds
(500-250) * 0.15
37.5
# NB: the choice of 'Duration' here is arbitrary; count() just needs some column to count.
starts = commute.groupby(by='Start Station')[['Duration']].count()#.sort_values(by='Start Station', ascending=False)
starts.sort_values(by='Duration', ascending=False)
Duration | |
---|---|
Start Station | |
San Francisco Caltrain (Townsend at 4th) | 25858 |
San Francisco Caltrain 2 (330 Townsend) | 21523 |
Harry Bridges Plaza (Ferry Building) | 15543 |
Temporary Transbay Terminal (Howard at Beale) | 14298 |
2nd at Townsend | 13674 |
... | ... |
Mezes Park | 189 |
Redwood City Medical Center | 139 |
San Mateo County Center | 108 |
Redwood City Public Library | 101 |
Franklin at Maple | 62 |
70 rows × 1 columns
# what is the problem here?
pd.pivot_table(commute, index=['Start Station'], columns=['End Station']).fillna(0)
/var/folders/bm/l_yhcr911wv7_tf_ywk2h2mh0000gn/T/ipykernel_12782/2694654531.py:1: FutureWarning: pivot_table dropped a column because it failed to aggregate. This behavior is deprecated and will raise in a future version of pandas. Select only the columns that can be aggregated. pd.pivot_table(commute, index=['Start Station'], columns=['End Station']).fillna(0)
Bike # | ... | Trip ID | |||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
End Station | 2nd at Folsom | 2nd at South Park | 2nd at Townsend | 5th at Howard | Adobe on Almaden | Arena Green / SAP Center | Beale at Market | Broadway St at Battery St | California Ave Caltrain Station | Castro Street and El Camino Real | ... | South Van Ness at Market | Spear at Folsom | St James Park | Stanford in Redwood City | Steuart at Market | Temporary Transbay Terminal (Howard at Beale) | Townsend at 7th | University and Emerson | Washington at Kearny | Yerba Buena Center of the Arts (3rd @ Howard) |
Start Station | |||||||||||||||||||||
2nd at Folsom | 437.148148 | 441.155932 | 445.883295 | 436.752212 | 0.000000 | 0.000000 | 446.007874 | 437.223881 | 0.000000 | 0.0 | ... | 731516.456522 | 641003.357798 | 0.0 | 0.0 | 681677.226562 | 700727.048309 | 711661.028818 | 0.000000 | 621464.521127 | 658447.493976 |
2nd at South Park | 440.700000 | 415.981707 | 434.788079 | 431.531073 | 0.000000 | 0.000000 | 462.189873 | 421.089888 | 0.000000 | 0.0 | ... | 649108.048780 | 672376.186603 | 0.0 | 0.0 | 687462.008929 | 620272.384439 | 677145.012945 | 0.000000 | 692072.852113 | 641217.822222 |
2nd at Townsend | 444.411552 | 433.366197 | 460.389189 | 431.087838 | 0.000000 | 0.000000 | 430.218579 | 436.491039 | 0.000000 | 0.0 | ... | 624034.820000 | 675171.864865 | 0.0 | 0.0 | 689369.168491 | 647041.948560 | 722415.416268 | 0.000000 | 645436.111111 | 641750.689655 |
5th at Howard | 450.878505 | 441.883333 | 412.934783 | 463.493976 | 0.000000 | 0.000000 | 456.067797 | 427.647059 | 0.000000 | 0.0 | ... | 659459.941176 | 668855.610000 | 0.0 | 0.0 | 695081.692722 | 681125.388592 | 658704.217252 | 0.000000 | 751567.319149 | 647610.811111 |
Adobe on Almaden | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 210.727273 | 190.857143 | 0.000000 | 0.000000 | 0.000000 | 0.0 | ... | 0.000000 | 0.000000 | 725137.3 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
Temporary Transbay Terminal (Howard at Beale) | 446.573840 | 442.955711 | 442.817602 | 433.681333 | 0.000000 | 0.000000 | 435.742515 | 435.077540 | 0.000000 | 0.0 | ... | 667164.854701 | 650493.939394 | 0.0 | 0.0 | 752327.857843 | 653980.553191 | 697919.355152 | 0.000000 | 667566.544444 | 672149.610422 |
Townsend at 7th | 444.014620 | 434.839161 | 448.628297 | 437.110000 | 0.000000 | 0.000000 | 465.514286 | 441.240000 | 0.000000 | 0.0 | ... | 685422.292350 | 702776.401786 | 0.0 | 0.0 | 641267.253623 | 679435.512295 | 675926.075758 | 0.000000 | 696148.068966 | 719238.209150 |
University and Emerson | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 292.175439 | 0.0 | ... | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.000000 | 618905.774194 | 0.000000 | 0.000000 |
Washington at Kearny | 390.235294 | 440.460317 | 413.403509 | 425.651163 | 0.000000 | 0.000000 | 444.718750 | 447.164557 | 0.000000 | 0.0 | ... | 680496.680000 | 653986.791667 | 0.0 | 0.0 | 646498.354839 | 709506.234694 | 674198.301887 | 0.000000 | 693241.600000 | 644410.527778 |
Yerba Buena Center of the Arts (3rd @ Howard) | 428.548387 | 430.301435 | 444.855422 | 434.737828 | 0.000000 | 0.000000 | 408.711111 | 422.212766 | 0.000000 | 0.0 | ... | 686250.573913 | 653510.140845 | 0.0 | 0.0 | 662276.247525 | 637054.646018 | 702914.881226 | 0.000000 | 690123.318182 | 641848.219178 |
70 rows × 350 columns
duration = trip[['Start Station', 'End Station', 'Duration']]
duration
Start Station | End Station | Duration | |
---|---|---|---|
0 | Harry Bridges Plaza (Ferry Building) | San Francisco Caltrain (Townsend at 4th) | 765 |
1 | San Antonio Shopping Center | Mountain View City Hall | 1036 |
2 | Post at Kearny | 2nd at South Park | 307 |
3 | San Jose City Hall | San Salvador at 1st | 409 |
4 | Embarcadero at Folsom | Embarcadero at Sansome | 789 |
... | ... | ... | ... |
354147 | Powell Street BART | Townsend at 7th | 619 |
354148 | Harry Bridges Plaza (Ferry Building) | San Francisco Caltrain (Townsend at 4th) | 6712 |
354149 | South Van Ness at Market | 5th at Howard | 538 |
354150 | South Van Ness at Market | 5th at Howard | 568 |
354151 | South Van Ness at Market | 5th at Howard | 569 |
354152 rows × 3 columns
# where
ages = np.array([16, 22, 18, 15, 19, 15, 16, 21])
age = pd.DataFrame({'Age':ages})
age
Age | |
---|---|
0 | 16 |
1 | 22 |
2 | 18 |
3 | 15 |
4 | 19 |
5 | 15 |
6 | 16 |
7 | 21 |
age[age['Age']>=18]
Age | |
---|---|
1 | 22 |
2 | 18 |
4 | 19 |
7 | 21 |
voter = ages >= 18
voter
array([False, True, True, False, True, False, False, True])
voter1 = age >= 18
voter1
Age | |
---|---|
0 | False |
1 | True |
2 | True |
3 | False |
4 | True |
5 | False |
6 | False |
7 | True |
voter
array([False, True, True, False, True, False, False, True])
def is_voter(j):
    """Return whether j is at least 18.

    The comparison is applied to j as given, so a single number yields
    one boolean while an array or DataFrame yields element-wise booleans.
    """
    old_enough = j >= 18
    return old_enough
type(is_voter)
function
is_voter(22)
True
is_voter(3)
False
is_voter(age)
Age | |
---|---|
0 | False |
1 | True |
2 | True |
3 | False |
4 | True |
5 | False |
6 | False |
7 | True |
ages >= 18
array([False, True, True, False, True, False, False, True])
voter
array([False, True, True, False, True, False, False, True])
def my_voter_function(x):
    """Return the result of comparing x against the threshold of 18.

    True (or element-wise True) means x is 18 or more.
    """
    threshold = 18
    return x >= threshold
age.where(age['Age'] >= 18)
Age | |
---|---|
0 | NaN |
1 | 22.0 |
2 | 18.0 |
3 | NaN |
4 | 19.0 |
5 | NaN |
6 | NaN |
7 | 21.0 |