import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
jury = pd.DataFrame({
    'Ethnicity': np.array(['Asian', 'Black', 'Latino', 'White', 'Other']),
    'Eligible': np.array([0.15, 0.18, 0.12, 0.54, 0.01]),
    'Panels': np.array([0.26, 0.08, 0.08, 0.54, 0.04])
})
jury
|   | Ethnicity | Eligible | Panels |
|---|---|---|---|
| 0 | Asian | 0.15 | 0.26 |
| 1 | Black | 0.18 | 0.08 |
| 2 | Latino | 0.12 | 0.08 |
| 3 | White | 0.54 | 0.54 |
| 4 | Other | 0.01 | 0.04 |
jury.plot.barh('Ethnicity');
`random.multinomial(n, pvals, size=None)`

The multinomial distribution is a multivariate generalization of the binomial distribution. Take an experiment with one of `p` possible outcomes. An example of such an experiment is throwing a die, where the outcome can be 1 through 6. Each sample drawn from the distribution represents `n` such experiments. Its values, `X_i = [X_0, X_1, ..., X_p]`, represent the number of times the outcome was `i`.

N.B. New code should use the `multinomial` method of a `default_rng()` instance instead (please see the Quick Start). Call `default_rng` to get a new instance of a `Generator`, then call its methods to obtain samples from different distributions.
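As a quick sketch of the recommended `Generator` interface (this die-rolling example is hypothetical, not from the notebook above): draw one multinomial sample of 60 fair die rolls and get the count for each of the six faces.

```python
import numpy as np
from numpy.random import default_rng

# Hypothetical example: roll a fair six-sided die 60 times and count
# how often each face appears. The seed is chosen only so the sketch
# is reproducible.
rng = default_rng(42)
counts = rng.multinomial(60, [1/6] * 6)
print(counts)        # six non-negative counts
print(counts.sum())  # always 60: every roll lands on exactly one face
```

The counts always sum to `n` because each trial produces exactly one of the `p` outcomes.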
A binomial distribution can be thought of as simply the probability of a SUCCESS or FAILURE outcome in an experiment or survey that is repeated multiple times. The binomial is a type of distribution that has two possible outcomes (the prefix "bi" means two, or twice). For example, a coin toss has only two possible outcomes, heads or tails, and taking a test could have two possible outcomes, pass or fail.
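A minimal sketch of the coin-toss case (a hypothetical illustration, not part of the jury analysis): a single binomial draw counts the successes in `n` repeated two-outcome trials.

```python
import numpy as np
from numpy.random import default_rng

# Hypothetical sketch: count heads in 100 fair coin tosses.
rng = default_rng(0)
heads = rng.binomial(n=100, p=0.5)                 # one experiment of 100 tosses
samples = rng.binomial(n=100, p=0.5, size=10_000)  # repeat the experiment
print(heads)           # an integer between 0 and 100
print(samples.mean())  # close to n * p = 50
```

With many repetitions the sample mean settles near `n * p`, which is what makes the simulation approach in the rest of this section work.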
# Legacy version
from numpy import random
vals = random.standard_normal(10)
more_vals = random.standard_normal(10)
more_vals
array([ 1.04178783, 1.30550513, 1.95578289, -0.46404912, -0.5442649 , -0.17819663, -0.32144174, -0.95697455, 1.20457654, -0.52214162])
# New version
from numpy.random import default_rng
rng = default_rng()
vals = rng.standard_normal(10)
more_vals = rng.standard_normal(10)
more_vals
array([-0.30723259, -1.98679925, 0.62638645, 1.2351097 , -1.12786288, 0.1655601 , -1.94840903, 0.61373912, 1.76144768, 0.25620496])
model = np.array([0.15, 0.18, 0.12, 0.54, 0.01])
model
array([0.15, 0.18, 0.12, 0.54, 0.01])
def sample_proportions(sample_size, probabilities):
    # Draw one multinomial sample and convert the counts to proportions
    return np.random.multinomial(sample_size, probabilities) / sample_size
simulated = sample_proportions(1423, model)
simulated
array([0.15811665, 0.1658468 , 0.11314125, 0.55024596, 0.01264933])
Assign new columns to a DataFrame.

Returns a new object with all original columns in addition to new ones. Existing columns that are re-assigned will be overwritten.

Parameters: `**kwargs` : dict of {str: callable or Series}
    The column names are keywords. If the values are callable, they are computed on the DataFrame and assigned to the new columns. The callable must not change the input DataFrame (though pandas doesn't check it). If the values are not callable (e.g. a Series, scalar, or array), they are simply assigned.

Returns: DataFrame
    A new DataFrame with the new columns in addition to all the existing columns.
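The callable form mentioned above can be sketched on a small hypothetical DataFrame (the numbers below are just the first two jury rows, reused for illustration): the lambda receives the DataFrame and its result becomes the new column.

```python
import pandas as pd

# Hypothetical illustration of assign with a callable: the function is
# computed on the DataFrame and assigned to the new column.
df = pd.DataFrame({'Eligible': [0.15, 0.18], 'Panels': [0.26, 0.08]})
df2 = df.assign(Difference=lambda d: d['Panels'] - d['Eligible'])
print(df2['Difference'].tolist())  # approximately [0.11, -0.10]
```

Note that `assign` returns a new DataFrame; `df` itself is unchanged.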
jury_with_simulated = jury.assign(Simulated = simulated)
jury_with_simulated
|   | Ethnicity | Eligible | Panels | Simulated |
|---|---|---|---|---|
| 0 | Asian | 0.15 | 0.26 | 0.158117 |
| 1 | Black | 0.18 | 0.08 | 0.165847 |
| 2 | Latino | 0.12 | 0.08 | 0.113141 |
| 3 | White | 0.54 | 0.54 | 0.550246 |
| 4 | Other | 0.01 | 0.04 | 0.012649 |
jury_with_simulated.plot.barh('Ethnicity');
diffs = jury['Panels'] - jury['Eligible']
jury_with_difference = jury.assign(Difference = diffs)
jury_with_difference
|   | Ethnicity | Eligible | Panels | Difference |
|---|---|---|---|---|
| 0 | Asian | 0.15 | 0.26 | 0.11 |
| 1 | Black | 0.18 | 0.08 | -0.10 |
| 2 | Latino | 0.12 | 0.08 | -0.04 |
| 3 | White | 0.54 | 0.54 | 0.00 |
| 4 | Other | 0.01 | 0.04 | 0.03 |
In probability theory, the total variation distance (Wikipedia) is a distance measure for probability distributions. It is an example of a statistical distance metric, and is sometimes called the statistical distance, statistical difference, or variational distance.
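A tiny worked example of the idea (toy distributions, not the jury data): the TVD between two distributions is half the sum of the absolute differences of their probabilities.

```python
import numpy as np

# Toy example: two distributions over the same two outcomes.
dist1 = np.array([0.5, 0.5])
dist2 = np.array([0.9, 0.1])

# Half the sum of absolute differences: (0.4 + 0.4) / 2
tvd_value = np.abs(dist1 - dist2).sum() / 2
print(tvd_value)  # ≈ 0.4
```

Halving the sum means the TVD lies between 0 (identical distributions) and 1 (no overlap at all), which makes values like the 0.14 computed below easy to interpret.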
The Monte Carlo sampling methods give us the desired stationary distribution starting from a base Markov chain. Assuming one step of the Markov chain takes unit time, we can compare their computational efficiency in terms of the number of operations needed per sample, and the number of samples needed to get close enough to the desired stationary distribution.
Markov Chain - Python (GeeksforGeeks)

Markov chains, named after Andrey Markov, are stochastic models that depict a sequence of possible events in which the prediction or probability for the next state depends solely on the previous state, not the states before it. In simple words, the probability that the (n+1)th step will be x depends only on the nth step, not the complete sequence of steps that came before n. This property is known as the Markov property, or memorylessness.
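The Markov property can be sketched with a hypothetical two-state weather chain (the states and transition probabilities below are made up for illustration): each step looks only at the current state's row of the transition matrix.

```python
import numpy as np
from numpy.random import default_rng

# Hypothetical two-state chain. Row i gives the probabilities of the
# next state given the current state i, so each step depends only on
# the present state (the Markov property).
states = ['sunny', 'rainy']
P = np.array([[0.8, 0.2],    # from sunny: stay sunny / turn rainy
              [0.4, 0.6]])   # from rainy: turn sunny / stay rainy

rng = default_rng(1)
state = 0                    # start sunny
path = [states[state]]
for _ in range(10):
    state = rng.choice(2, p=P[state])  # next state from current row only
    path.append(states[state])
print(path)  # a length-11 sequence of 'sunny'/'rainy'
```

Note the loop never consults anything but the current `state`; the earlier steps in `path` play no role in the next draw.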
def tvd(dist1, dist2):
    # Total variation distance: half the sum of absolute differences
    return sum(abs(dist1 - dist2)) / 2
obsvd_tvd = tvd(jury['Panels'], jury['Eligible'])
obsvd_tvd
0.14
tvd(sample_proportions(1423, model), jury['Eligible'])
0.022030920590302157
def simulated_tvd():
    return tvd(sample_proportions(1423, model), model)

tvds = np.array([])

num_simulations = 10000
for i in np.arange(num_simulations):
    new_tvd = simulated_tvd()
    tvds = np.append(tvds, new_tvd)
title = 'Simulated TVDs (if model is true)'
bins = np.arange(0, .05, .005)
# Don't assign the plot to `tvd` here, or the tvd function above is shadowed
pd.DataFrame({title: tvds}).hist(bins=bins, ec='white');
print('Observed TVD: ' + str(obsvd_tvd))
Observed TVD: 0.14
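One common follow-up (a hedged sketch, not shown in the notebook above; it re-simulates rather than reusing the `tvds` array) is to estimate how often chance alone produces a TVD at least as large as the observed 0.14.

```python
import numpy as np

# Re-simulate TVDs under the eligibility model and compare each one to
# the observed TVD of 0.14. The seed is arbitrary, for reproducibility.
rng = np.random.default_rng(7)
model = np.array([0.15, 0.18, 0.12, 0.54, 0.01])

def one_tvd():
    props = rng.multinomial(1423, model) / 1423
    return np.abs(props - model).sum() / 2

sims = np.array([one_tvd() for _ in range(10_000)])
p_value = np.count_nonzero(sims >= 0.14) / 10_000
print(p_value)  # essentially 0: chance TVDs stay far below 0.14
```

Since the simulated TVDs in the histogram all fall below about 0.05, essentially none of the 10,000 simulations reach 0.14, which is why the observed panels look inconsistent with the eligibility model.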