import numpy as np
import pandas as pd
idx = pd.IndexSlice
import matplotlib.pyplot as plt
%matplotlib inline
import sympy
import ipywidgets as widgets
import warnings,pprint
warnings.simplefilter(action='ignore', category=FutureWarning)

def plotHist(x,bins=15,probability=False,width=None,ec='k',**kwargs):
    """Draw a histogram of ``x`` as a bar chart on the current matplotlib axes.

    Parameters
    ----------
    x : array-like
        Values to histogram.
    bins : int or sequence
        Passed unchanged to ``np.histogram``.
    probability : bool
        If True, bar heights are the counts divided by the total count and the
        y-axis is labelled "Probability"; otherwise raw counts are drawn and
        the y-axis is labelled "Counts".
    width : float, optional
        Bar width. Defaults to the width of the first bin.
    ec : str
        Edge colour of the bars.
    **kwargs
        Forwarded to ``plt.bar``.
    """
    heights, edges = np.histogram(x, bins=bins)
    bar_width = edges[1] - edges[0] if width is None else width
    if probability:
        heights = heights.astype(float) / np.sum(heights)
        y_label = "Probability"
    else:
        y_label = "Counts"
    # The x position of each bar is the left edge of its bin.
    plt.bar(edges[:-1], heights, width=bar_width, edgecolor=ec, **kwargs)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    plt.ylabel(y_label, fontsize=15)

def weightedDice(
    p1=1/6,
    p2=1/6,
    p3=1/6,
    p4=1/6,
    p5=1/6,
):
    """Plot the outcome probabilities of a six-sided die and its expected value.

    ``p1`` .. ``p5`` are the probabilities of rolling 1 .. 5. The probability
    of rolling a 6 is whatever remains (never below 0). If the six values do
    not sum to 1 after rounding to two decimals, a message is printed and
    nothing is plotted.

    Returns
    -------
    None
    """
    # p6 is the remainder, clipped at 0 so an over-full p1..p5 is caught below.
    p6 = np.max([0,1-(p1+p2+p3+p4+p5)])
    weights = np.array([p1,p2,p3,p4,p5,p6])
    s = np.sum(weights)
    if np.round(s,decimals=2) != 1:
        print("Entered probabilities sum to %0.2f" % s)
        print("Probabilities must sum to 1.")
        return
    x = np.arange(6)+1
    e = np.dot(x,weights)
    plt.figure()
    plt.bar(x,weights,width=1,edgecolor='k',align="center")
    plt.xticks(x,x.astype(int),fontsize=15)
    plt.xlabel("Die Roll Outcome",fontsize=15)
    plt.ylim((0,1))
    plt.yticks(fontsize=15)
    plt.ylabel("Probability",fontsize=15)
    # Build the "outcome(probability)" terms programmatically so that every
    # term is joined by "+" (the hand-written string dropped the "+" before $4).
    terms = "+".join("\\$%d(%0.2f)" % (face, w) for face, w in zip(x, weights))
    plt.title("E[D]= %s\n= \\$%0.2f" % (terms, e), fontsize=12)
    
# One slider per probability p1..p5, each running from 0 to 1 in steps of 0.05.
sliderSpecs = {name: (0,1,.05) for name in ("p1","p2","p3","p4","p5")}
widgets.interact(weightedDice, **sliderSpecs);

interactive(children=(FloatSlider(value=0.16666666666666666, description='p1', max=1.0, step=0.05), FloatSlide…

# Roll a fair die n times and plot the share of rolls landing on each face.
n = 100_000
diceRolls = np.random.choice(np.arange(1,7),size=n,replace=True)
plotHist(diceRolls,bins=np.arange(1,8),probability=True,width=1)
plt.xlabel("Die Outcome",fontsize=15);

sampleSize = 4   # dice rolls per sample (columns of data)
sampleCount = 3  # number of samples (rows of data)
# Draw every roll with replacement from the six possible outcomes.
data = np.random.choice(
    np.arange(1,7),
    size=(sampleCount,sampleSize),
    replace=True,
)
print(data)
print()
# One average per row, i.e. per sample.
print(data.mean(axis=1))

[[4 3 4 1]
 [6 6 5 5]
 [3 2 2 4]]

[3.   5.5  2.75]

# hide
def sampleDiceRolls(sampleSize=1,sampleCount=10):
    """Plot the distribution of sample averages of fair dice rolls.

    Draws ``sampleCount`` samples of ``sampleSize`` fair rolls each, averages
    every sample, and histograms the averages in a new figure together with a
    dashed line at E[D]=3.5.
    """
    rolls = np.random.choice([1,2,3,4,5,6],size=(sampleCount,sampleSize),replace=True)
    sampleAverages = rolls.mean(axis=1)
    plt.figure(figsize=(10,4))
    plotHist(sampleAverages,bins=np.linspace(1,6,20),probability=True,facecolor='r',label="_data")
    plt.xlabel("Sample Average (Fair Dice Roll)",fontsize=15)
    plt.title("%d samples each containing %d dice rolls" % (sampleCount,sampleSize),fontsize=15)
    # Remember the y-limits so drawing the reference line does not rescale the axes.
    yLimits = plt.ylim()
    plt.plot([3.5,3.5],yLimits,'k--',label="$E[D]=3.5$")
    plt.ylim(yLimits)
    plt.legend(frameon=False,fontsize=15,loc="upper right")
    
# Sliders: sampleSize from 1 to 50 (step 1), sampleCount from 1 to 10,000 (step 10).
widgets.interact(sampleDiceRolls,sampleSize=(1,50,1),sampleCount=(1,10_000,10));

interactive(children=(IntSlider(value=1, description='sampleSize', max=50, min=1), IntSlider(value=10, descrip…

# Weighted die: the probability of each face is proportional to its value (1..6).
p = np.arange(6).astype(float)+1
# Normalise so the probabilities sum to 1.
p /= np.sum(p)
# Expected value = sum over faces of face value * probability.
print(np.dot(p,np.arange(6)+1))

4.333333333333333

# hide 
def diceRollDistributionP(n=10,showAvg=True):
    """Histogram ``n`` rolls of a weighted die on the current axes.

    The probability of each face is proportional to its value. When
    ``showAvg`` is True, a dashed line marks the expected value and a legend
    is added.
    """
    faces = np.arange(6)+1
    p = faces.astype(float)
    p /= np.sum(p)
    diceRolls = np.random.choice(faces,size=n,replace=True,p=p)
    plotHist(diceRolls,bins=[1,2,3,4,5,6,7],probability=True,width=1,label="_data")
    plt.xlabel("Die Outcome",fontsize=15)
    if not showAvg:
        return
    e = np.dot(faces,p)
    # Keep the y-limits fixed while the expected-value line is drawn.
    yLimits = plt.ylim()
    plt.plot([e,e],yLimits,'k--',linewidth=3,label="$E[D]=%0.2f$" % e)
    plt.ylim(yLimits)
    plt.legend(fontsize=15,frameon=False)
    
    
# Slider for n (1 to 1000); showAvg is pinned to True via widgets.fixed.
widgets.interact(diceRollDistributionP,n=(1,1000),showAvg=widgets.fixed(True));

interactive(children=(IntSlider(value=10, description='n', max=1000, min=1), Output()), _dom_classes=('widget-…

# hide
def sampleDiceRollsP(sampleSize=1,sampleCount=10):
    """Plot the distribution of sample averages of weighted dice rolls.

    The probability of each face is proportional to its value. Draws
    ``sampleCount`` samples of ``sampleSize`` rolls, averages each sample, and
    histograms the averages in a new figure with a dashed line at E[D].
    """
    faces = np.arange(6)+1
    p = faces.astype(float)
    p /= np.sum(p)
    rolls = np.random.choice([1,2,3,4,5,6],size=(sampleCount,sampleSize),replace=True,p=p)
    sampleAverages = np.mean(rolls,axis=1)
    plt.figure(figsize=(10,4))
    plotHist(sampleAverages,bins=np.linspace(1,6,20),probability=True,facecolor='r',label="_data")
    plt.xlabel("Sample Average (Weighted Dice Roll)",fontsize=15)
    plt.title("%d samples each containing %d dice rolls" % (sampleCount,sampleSize),fontsize=15)
    yLimits = plt.ylim()
    e = np.dot(p,faces)
    plt.plot([e,e],yLimits,'k--',label="$E[D]=%0.2f$" % e)
    plt.ylim(yLimits)
    plt.legend(frameon=False,fontsize=15,loc="upper right")
    
# Sliders: sampleSize from 1 to 50 (step 1), sampleCount from 1 to 10,000 (step 10).
widgets.interact(sampleDiceRollsP,sampleSize=(1,50,1),sampleCount=(1,10_000,10));

interactive(children=(IntSlider(value=1, description='sampleSize', max=50, min=1), IntSlider(value=10, descrip…

# height statistics from
# https://www.who.int/tools/growth-reference-data-for-5to19-years/indicators/height-for-age
# Values are entered in cm and converted to inches (1 cm = 0.393701 in).
girls = pd.Series({"Mean height":164, "Standard deviation":6}) * 0.393701
boys = pd.Series({"Mean height":176, "Standard deviation":8}) * 0.393701

# simulate "ground truth" data based on statistics:
# popSize normally distributed heights for each group
popSize = 1000
boyHeights = np.random.normal(boys["Mean height"], boys["Standard deviation"], popSize)
girlHeights = np.random.normal(girls["Mean height"], girls["Standard deviation"], popSize)
# boys first, then girls
allHeights = np.hstack((boyHeights,girlHeights))

# array of heights for 15-19 year old girls and boys:
# show both population sizes, then the first ten values of each
print((len(boyHeights), len(girlHeights)))
for heights in (boyHeights, girlHeights):
    print(heights[:10])

(1000, 1000)
[69.86033388 67.95722424 74.60923419 67.40736605 71.38430099 66.77913354
 72.32494801 70.55916304 71.39291538 72.99552667]
[62.29659234 66.70572029 64.2484856  67.66282228 66.11466656 67.0571548
 57.07542114 68.05777718 63.74939162 62.37946379]

# population counts for 15-19 year olds from: 
# https://www.statista.com/statistics/241488/population-of-the-us-by-sex-and-age/
# Counts in millions
maleCount = 11.0
femaleCount = 10.58
totalCount = maleCount + femaleCount
# share of the combined population belonging to each group
maleP = maleCount/totalCount
femaleP = femaleCount/totalCount
# overall average = group averages weighted by population share
avgHeight = maleP*np.mean(boyHeights) + femaleP*np.mean(girlHeights)
print(f"Avg Height: {avgHeight:0.2f} inches")

Avg Height: 67.08 inches

# hide
# Overlay both height histograms on shared bins and mark the weighted average.
bins = np.linspace(55,85,30)
plt.figure(figsize=(12,4))
plotHist(boyHeights, bins=bins, label="Boy Heights", alpha=.5)
plotHist(girlHeights, bins=bins, label="Girl Heights", alpha=.5)
# Save and restore the y-limits so the vertical line does not rescale the axes.
yy = plt.ylim()
plt.plot([avgHeight]*2, yy, 'r--', linewidth=3, label="Avg. Height")
plt.ylim(yy)
plt.xlim((55,78))
plt.legend(fontsize=15)
plt.xlabel("Height (inches)", fontsize=15);

# Display the boys' mean, the girls' mean, and the population-weighted average.
np.mean(boyHeights),np.mean(girlHeights),avgHeight

(69.39746454496027, 64.66157982636437, 67.07560818153374)

# hide
def sampleHeights(sampleSize=1,sampleCount=10):
    """Plot the distribution of sample-average heights.

    Draws ``sampleCount`` samples of ``sampleSize`` individuals from the
    combined boy/girl height arrays. Each boy is drawn with probability
    ``maleP/len(boyHeights)`` and each girl with ``femaleP/len(girlHeights)``.
    The sample averages are histogrammed in a new figure with a dashed line
    at ``avgHeight``.
    """
    population = np.hstack((boyHeights,girlHeights))
    boyCount, girlCount = len(boyHeights), len(girlHeights)
    boyWeights = maleP/boyCount*np.ones(boyCount)
    girlWeights = femaleP/girlCount*np.ones(girlCount)
    p = np.hstack((boyWeights,girlWeights))
    draws = np.random.choice(population,size=(sampleCount,sampleSize),replace=True,p=p)
    sampleAverages = np.mean(draws,axis=1)
    plt.figure(figsize=(10,4))
    edges = np.linspace(np.min(population),np.max(population),50)
    plotHist(sampleAverages,bins=edges,probability=True,facecolor='r',label="_data")
    plt.xlabel("Sample Average (inches)",fontsize=15)
    plt.title("%d samples each containing %d individuals" % (sampleCount,sampleSize),fontsize=15)
    yLimits = plt.ylim()
    plt.plot([avgHeight,avgHeight],yLimits,'k--',label="$E[H]=%0.2f$" % avgHeight)
    plt.ylim(yLimits)
    plt.legend(frameon=False,fontsize=15,loc="upper right")
    
# Sliders: sampleSize from 1 to 50 (step 1), sampleCount from 1 to 10,000 (step 10).
widgets.interact(sampleHeights,sampleSize=(1,50,1),sampleCount=(1,10_000,10));

interactive(children=(IntSlider(value=1, description='sampleSize', max=50, min=1), IntSlider(value=10, descrip…

# read in CSV data as a dataframe using Pandas, discarding unused columns
national = pd.read_csv("../nationalOews.csv")
national = national.drop(columns=["total employment","H_MEAN","EMP_SHR"])
# remove occupation aggregations (codes ending in '0') and rows without a wage
# note: occupation codes follow the Standard Occupation Classification system
isDetailed = national["OCC_CODE"].apply(lambda s: s[-1] != '0')
hasWage = ~np.isnan(national["A_MEAN"])
national = national[isDetailed & hasWage]
# filter to the year of interest
national = national[national["year"] == 2019]
# give dataframe columns more intuitive names
columnNames = {
    "OCC_CODE":"occupation code",
    "OCC_TITLE":"occupation title",
    "TOT_EMP":"national employment",
    "A_MEAN":"avg annual wage",
}
national = national.rename(columns=columnNames)
# look at data dimensions
print(national.shape)
national.head()

(773, 5)

# calculate employment share 
# (ie, the fraction of the total employment associated with each occupation)
employment = national["national employment"]
national["p(occupation)"] = employment / employment.sum()
# test that p(occupation) sums to 1
print("sum p(occupation) = %0.2f" % national["p(occupation)"].sum())
national.head()

sum p(occupation) = 1.00

# Expected annual wage: each occupation's average wage weighted by its employment share.
E = national["avg annual wage"].dot(national["p(occupation)"])
print("The average annual wage is $%0.2f" % E)

The average annual wage is $54078.05

# identify equally spaced wage bins & place rows into bins
wages = national["avg annual wage"]
bins = np.linspace(wages.min(),wages.max(),20)
# include_lowest=True: the first edge equals wages.min(), and pd.cut's
# intervals are open on the left by default, so without it the lowest-paid
# occupation(s) would get NaN and be dropped from the histogram.
national["wage bin"] = pd.cut(wages,bins=bins,include_lowest=True)
# Count number of people in each bin
# observed=False keeps empty bins, so there is exactly one row per bar below.
temp = national.groupby("wage bin",observed=False).agg({"national employment":"sum"})
# Plot distribution (wages in thousands of dollars, employment in millions)
bins /= 1000
plt.bar(
    bins[:-1],
    temp["national employment"]/10**6,
    width=bins[1]-bins[0],
    edgecolor='k',
    label="_data",
)
yy = plt.ylim()
plt.plot([E/1000,E/1000],yy,'r--',linewidth=3,label="$E[W]$")
plt.ylim(yy)
plt.legend(fontsize=15,frameon=False)
plt.xticks(fontsize=15)
# "\\$" yields the same text as the former "\$" without the invalid-escape warning.
plt.xlabel("Annual Wage (\\$1000)",fontsize=15)
plt.yticks(fontsize=15)
plt.ylabel("Number of People (millions)",fontsize=15);

# One row per occupation, so the row count is the number of unique occupations.
occupationCount = national.shape[0]
workerCount = national["national employment"].sum()
print("Number of Unique Occupations: %d" % occupationCount)
print("Number of Workers: %d" % workerCount)

Number of Unique Occupations: 773
Number of Workers: 140584900

def plotWageSampleDist(sample):
    """Histogram a wage sample and mark the population and sample averages.

    Parameters
    ----------
    sample : array-like
        Wages in thousands of dollars (the sample average is multiplied by
        1000 for its legend label, and ``E`` is divided by 1000 for plotting).

    Reads the module-level ``E`` (population average, in dollars) and
    ``national`` (for the x-axis limits).
    """
    plotHist(sample,label="_data")
    yy = plt.ylim()
    # Leave 50% headroom above the tallest bar for the legend.
    yy = (0,1.5*yy[1])
    # Raw strings keep "\$" and "\bar" intact for mathtext without relying on
    # invalid escape sequences.
    plt.plot([E/1000,E/1000],yy,'r--',linewidth=3,label=r"Population Avg: $E[W]=\$%0.2f$" % E)
    e = np.mean(sample)
    plt.plot([e,e],yy,'k--',linewidth=3,label=r"Sample Avg: $\bar{W}=\$%0.2f$" % (1000*e))
    plt.ylim(yy)
    plt.xlim((national["avg annual wage"].min()/1000,national["avg annual wage"].max()/1000))
    plt.legend(fontsize=15,frameon=True,loc="upper right")
    plt.title("Sample Size: %d" % len(sample),fontsize=15)
    plt.xlabel("Thousands of Dollars",fontsize=15)

# We are randomly surveying workers, not occupations
# So we need to choose occupations from the data with probabilities proportional to p(occupation).
sample = np.random.choice(national["avg annual wage"], size=10, replace=True, p=national["p(occupation)"])
# try multiple samples. 
# Is the average wage of the sample always close to the true average wage?
# (the plotting helper expects thousands of dollars)
plotWageSampleDist(sample/1000)

def plotWageSampleAvgDist(A,trials=1):
    """Histogram the first ``trials`` sample averages in ``A``.

    Parameters
    ----------
    A : array-like
        Sample averages in dollars; they are divided by 1000 before plotting.
    trials : int
        Number of leading entries of ``A`` to include.

    Also marks the population average ``E`` (module-level, in dollars) and the
    mean of the plotted sample averages.
    """
    sample = A[:trials]/1000
    plotHist(sample,label="_data",color='lightgray',alpha=.6)
    yy = plt.ylim()
    # Leave 50% headroom above the tallest bar for the legend.
    yy = (0,1.5*yy[1])
    # Raw strings keep "\$" and "\bar" intact for mathtext without relying on
    # invalid escape sequences.
    plt.plot([E/1000,E/1000],yy,'r--',linewidth=3,label=r"$E[W]=\$%0.2f$" % E)
    e = np.mean(sample)
    plt.plot([e,e],yy,'k--',linewidth=3,label=r"Avg $\bar{W}$: $\$%0.2f$" % (1000*e))
    plt.ylim(yy)
    plt.legend(fontsize=15,frameon=True,loc="upper right")
    plt.title("Number of Sample Averages: %d" % trials,fontsize=15)
    plt.xlabel("Thousands of Dollars",fontsize=15)

# Repeat the survey `trials` times: each trial draws 100 workers (weighted by
# employment share) and records the average wage of that sample.
trials = 1000
A = np.zeros(trials)
for trial in range(trials):
    sample = np.random.choice(
        national["avg annual wage"],
        size=100,
        replace=True,
        p=national["p(occupation)"],
    )
    A[trial] = sample.mean()

# hide 
# A is passed as a fixed argument; the slider picks how many sample averages to show (1 to len(A), step 5).
widgets.interact(plotWageSampleAvgDist,A=widgets.fixed(A),trials=(1,len(A),5));

interactive(children=(IntSlider(value=1, description='trials', max=1000, min=1, step=5), Output()), _dom_class…

# hide
# 2x4 overview figure. Top row: the four underlying distributions.
# Bottom row: the distribution of sample averages drawn from each of them.
plt.figure(figsize=(14,6))

# --- top row: underlying distributions ---
plt.subplot(2,4,1)
n = 10_000
diceRolls = np.random.choice([1,2,3,4,5,6],size=n,replace=True)
plotHist(diceRolls,bins=[1,2,3,4,5,6,7],probability=True,width=1)
plt.xlabel("Die Outcome",fontsize=15);
plt.title("Fair Dice Roll",fontsize=15);

plt.subplot(2,4,2)
diceRollDistributionP(n=1000,showAvg=False)
plt.title("Weighted Dice Roll",fontsize=15)

plt.subplot(2,4,3)
bins = np.linspace(55,85,30)
plotHist(boyHeights,bins=bins,label="Boys",alpha=.5)
plotHist(girlHeights,bins=bins,label="Girls",alpha=.5)
plt.xlim((55,78))
plt.legend(fontsize=12)
plt.xlabel("Height (inches)",fontsize=15);
plt.title("Boy/Girl Heights",fontsize=15)

plt.subplot(2,4,4)
wages = national["avg annual wage"]
bins = np.linspace(wages.min(),wages.max(),20)
# include_lowest=True: the first edge equals wages.min(), and pd.cut's
# intervals are open on the left by default, so without it the lowest-paid
# occupation(s) would get NaN and be dropped from the histogram.
national["wage bin"] = pd.cut(wages,bins=bins,include_lowest=True)
# Count number of people in each bin
# observed=False keeps empty bins, so there is exactly one row per bar below.
temp = national.groupby("wage bin",observed=False).agg({"national employment":"sum"})
# Plot distribution (wages in thousands of dollars, employment in millions)
bins /= 1000
plt.bar(
    bins[:-1],
    temp["national employment"]/10**6,
    width=bins[1]-bins[0],
    edgecolor='k',
    label="_data",
)
plt.xticks(fontsize=15)
# "\\$" yields the same text as the former "\$" without the invalid-escape warning.
plt.xlabel("Annual Wage (\\$1000)",fontsize=15)
plt.yticks(fontsize=15)
plt.ylabel("Number of People (millions)",fontsize=15);
plt.title("Annual Wages",fontsize=15)

# --- bottom row: distributions of sample averages ---
# 8000 samples of 10 fair rolls each
plt.subplot(2,4,5)
data = np.random.choice([1,2,3,4,5,6],size=(8000,10),replace=True)
data = np.mean(data,axis=1)
plotHist(data,bins=np.linspace(1,6,20),probability=True,facecolor='r',label="_data")
plt.xlabel("Sample Average\n(Fair Dice Roll)",fontsize=15)
yy = plt.ylim()
plt.plot([3.5,3.5],yy,'k--',label="$E[D]=3.5$",linewidth=3)
plt.ylim(yy)

# 8000 samples of 10 weighted rolls each (face probability proportional to face value)
plt.subplot(2,4,6)
p = np.arange(6).astype(float)+1
p /= np.sum(p)
data = np.random.choice([1,2,3,4,5,6],size=(8000,10),replace=True,p=p)
data = np.mean(data,axis=1)
plotHist(data,bins=np.linspace(1,6,20),probability=True,facecolor='r',label="_data")
plt.xlabel("Sample Average\n(Weighted Dice Roll)",fontsize=15)
yy = plt.ylim()
e = np.dot(p,np.arange(6)+1)
plt.plot([e,e],yy,'k--',label="$E[D]=%0.2f$" % e,linewidth=3)
plt.ylim(yy)

# 1000 samples of 10 heights each, weighted by the male/female population shares
plt.subplot(2,4,7)
boyCount = len(boyHeights)
girlCount = len(girlHeights)
p = np.hstack((maleP/boyCount*np.ones(boyCount),femaleP/girlCount*np.ones(girlCount)))
data = np.random.choice(allHeights,size=(1000,10),replace=True,p=p)
data = np.mean(data,axis=1)
bins = np.linspace(np.min(allHeights),np.max(allHeights),30)
plotHist(data,bins=bins,probability=True,facecolor='r',label="_data")
plt.xlabel("Sample Average (inches)",fontsize=15)
yy = plt.ylim()
plt.plot([avgHeight,avgHeight],yy,'k--',label="$E[H]=%0.2f$" % avgHeight,linewidth=3)
plt.ylim(yy)

# 1000 samples of 500 workers each, weighted by employment share
plt.subplot(2,4,8)
trials = 1_000
A2 = np.zeros(trials)
for trial in range(trials):
    sample = np.random.choice(national["avg annual wage"],size=500,replace=True,
                              p=national["p(occupation)"])
    A2[trial] = np.mean(sample)
sample = A2/1000
plotHist(sample,label="_data",color='lightgray',alpha=.6)
yy = plt.ylim()
yy = (0,1.3*yy[1])
plt.plot([E/1000,E/1000],yy,'k--',linewidth=3,label=r"Population Avg: $E[W]=\$%0.2f$" % E,)
plt.ylim(yy)
plt.xlabel("Thousands of Dollars",fontsize=15);

plt.tight_layout()

#hide
def plotSampleDistsAll():
    """Draw a 1x4 figure of sample-average distributions.

    Panels, left to right: fair dice (8000 samples of 10 rolls), weighted dice
    (8000 samples of 10 rolls), heights (1000 samples of 10 individuals) and
    annual wages (1000 samples of 500 workers). Each panel marks the
    corresponding expected value with a dashed line.

    Reads the module-level ``boyHeights``, ``girlHeights``, ``allHeights``,
    ``maleP``, ``femaleP``, ``avgHeight``, ``national`` and ``E``.
    """
    plt.figure(figsize=(12,3))
    plt.subplot(1,4,1)
    data = np.random.choice([1,2,3,4,5,6],size=(8000,10),replace=True)
    data = np.mean(data,axis=1)
    plotHist(data,bins=np.linspace(1,6,20),probability=True,facecolor='r',label="_data")
    plt.xlabel("Sample Average",fontsize=15)
    plt.title("Fair Dice Roll",fontsize=15)
    yy = plt.ylim()
    plt.plot([3.5,3.5],yy,'k--',label="$E[D]=3.5$",linewidth=3)
    plt.ylim(yy)

    plt.subplot(1,4,2)
    # Weighted die: face probability proportional to face value.
    p = np.arange(6).astype(float)+1
    p /= np.sum(p)
    data = np.random.choice([1,2,3,4,5,6],size=(8000,10),replace=True,p=p)
    data = np.mean(data,axis=1)
    plotHist(data,bins=np.linspace(1,6,20),probability=True,facecolor='r',label="_data")
    plt.xlabel("Sample Average",fontsize=15)
    plt.title("Weighted Dice Roll",fontsize=15)
    yy = plt.ylim()
    e = np.dot(p,np.arange(6)+1)
    plt.plot([e,e],yy,'k--',label="$E[D]=%0.2f$" % e,linewidth=3)
    plt.ylim(yy)

    plt.subplot(1,4,3)
    boyCount = len(boyHeights)
    girlCount = len(girlHeights)
    # Per-individual draw probabilities reflecting the male/female population shares.
    p = np.hstack((maleP/boyCount*np.ones(boyCount),femaleP/girlCount*np.ones(girlCount)))
    data = np.random.choice(allHeights,size=(1000,10),replace=True,p=p)
    data = np.mean(data,axis=1)
    bins = np.linspace(np.min(allHeights),np.max(allHeights),30)
    plotHist(data,bins=bins,probability=True,facecolor='r',label="_data")
    plt.xlabel("Sample Average\n(inches)",fontsize=15)
    plt.title("Height",fontsize=15)
    yy = plt.ylim()
    plt.plot([avgHeight,avgHeight],yy,'k--',label="$E[H]=%0.2f$" % avgHeight,linewidth=3)
    plt.ylim(yy)

    plt.subplot(1,4,4)
    trials = 1_000
    A2 = np.zeros(trials)
    for trial in range(trials):
        sample = np.random.choice(national["avg annual wage"],size=500,replace=True,
                                  p=national["p(occupation)"])
        A2[trial] = np.mean(sample)
    # Plot in thousands of dollars.
    sample = A2/1000
    plotHist(sample,probability=True,label="_data",color='lightgray',alpha=.6)
    yy = plt.ylim()
    yy = (0,1.3*yy[1])
    # Raw string / doubled backslash keep "\$" intact for mathtext without
    # relying on invalid escape sequences.
    plt.plot([E/1000,E/1000],yy,'k--',linewidth=3,label=r"Population Avg: $E[W]=\$%0.2f$" % E,)
    plt.ylim(yy)
    plt.title("Annual Wage",fontsize=15)
    plt.xlabel("Sample Average\n($\\times\\$1000$)",fontsize=15)

    plt.tight_layout()
    
# Render the four-panel figure of sample-average distributions.
plotSampleDistsAll()

# simulate dice rolls for a fair die
sampleSize = 10    # number of dice rolls per sample
sampleCount = 100  # number of samples
data = np.random.choice([1,2,3,4,5,6],size=(sampleCount,sampleSize),replace=True)
# average die roll of each sample (one value per row)
sampleMeans = data.mean(axis=1)
# variance and standard deviation of the distribution of sample means
V = np.var(sampleMeans)
SD = np.std(sampleMeans)
print("Variance: %0.2f" % V)
print("Standard Deviation: %0.2f" % SD)
# the square root of the variance, for comparison with SD above
np.round(np.sqrt(V),decimals=2)

Variance: 0.29
Standard Deviation: 0.54

0.54

def normal(x,D):
    """Evaluate a normal density fitted to ``D`` at each value in ``x``.

    The curve uses the mean of ``D`` and its standard deviation as computed by
    ``np.std`` with default arguments.
    """
    mu = np.mean(D)
    sigma = np.std(D)
    z = (x - mu) / sigma
    return np.exp(-.5 * z**2) / (sigma * np.sqrt(2*np.pi))

# Evaluate the fitted normal curve on 100 evenly spaced points spanning the sample means.
D = sampleMeans
x = np.linspace(D.min(), D.max(), 100)
y = normal(x, D)

# hide
# Left: histogram of the sample means. Right: the normal curve fitted to them,
# drawn over the same x-range for comparison.
plt.figure(figsize=(8,4))
plt.subplot(1,2,1)
plotHist(sampleMeans,bins=20,probability=True)
plt.xlabel("Sample Mean\n(Fair Dice Rolls)",fontsize=15)
xx = plt.xlim()

plt.subplot(1,2,2)
plt.plot(x,y,'r-')
plt.xlim(xx)
# Anchor the y-axis at 0 while keeping the automatic upper limit.
plt.ylim((0,plt.ylim()[1]))
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel("Sample Mean\n(Fair Dice Rolls)",fontsize=15)
# The normal curve is continuous, so it is a density rather than a mass function.
plt.ylabel("Probability Density Function",fontsize=15);
plt.tight_layout()

# hide
def normal2(D):
    """Return ``(x, y)`` for plotting the normal curve fitted to ``D``.

    ``x`` is 100 evenly spaced points from the minimum to the maximum of
    ``D``; ``y`` is ``normal(x, D)``.
    """
    lo, hi = np.min(D), np.max(D)
    grid = np.linspace(lo, hi, 100)
    return grid, normal(grid, D)

# 2x4 figure. Top row: sample-average histograms for fair dice, weighted dice,
# heights and wages. Bottom row: the normal curve fitted to each, drawn over
# the same x-range as the histogram above it.
plt.figure(figsize=(12,6))

# --- fair dice ---
plt.subplot(2,4,1)
data = np.random.choice([1,2,3,4,5,6],size=(8000,10),replace=True)
data = np.mean(data,axis=1)
plotHist(data,bins=np.linspace(1,6,20),probability=True,facecolor='r',label="_data")
plt.xlabel("Sample Average",fontsize=15)
plt.title("Fair Dice Roll",fontsize=15)
yy = plt.ylim()
plt.plot([3.5,3.5],yy,'k--',label="$E[D]=3.5$",linewidth=3)
plt.ylim(yy)
xx = plt.xlim()

plt.subplot(2,4,5)
x,y = normal2(data)
plt.plot(x,y,'r-')
plt.xlim(xx)
plt.ylim((0,plt.ylim()[1]))

# --- weighted dice (face probability proportional to face value) ---
plt.subplot(2,4,2)
p = np.arange(6).astype(float)+1
p /= np.sum(p)
data = np.random.choice([1,2,3,4,5,6],size=(8000,10),replace=True,p=p)
data = np.mean(data,axis=1)
plotHist(data,bins=np.linspace(1,6,20),probability=True,facecolor='r',label="_data")
plt.xlabel("Sample Average",fontsize=15)
plt.title("Weighted Dice Roll",fontsize=15)
yy = plt.ylim()
e = np.dot(p,np.arange(6)+1)
plt.plot([e,e],yy,'k--',label="$E[D]=%0.2f$" % e,linewidth=3)
plt.ylim(yy)
xx = plt.xlim()

plt.subplot(2,4,6)
x,y = normal2(data)
plt.plot(x,y,'r-')
plt.xlim(xx)
plt.ylim((0,plt.ylim()[1]))

# --- heights (draws weighted by male/female population shares) ---
plt.subplot(2,4,3)
boyCount = len(boyHeights)
girlCount = len(girlHeights)
p = np.hstack((maleP/boyCount*np.ones(boyCount),femaleP/girlCount*np.ones(girlCount)))
data = np.random.choice(allHeights,size=(1000,10),replace=True,p=p)
data = np.mean(data,axis=1)
bins = np.linspace(np.min(allHeights),np.max(allHeights),30)
plotHist(data,bins=bins,probability=True,facecolor='r',label="_data")
plt.xlabel("Sample Average\n(inches)",fontsize=15)
plt.title("Height",fontsize=15)
yy = plt.ylim()
plt.plot([avgHeight,avgHeight],yy,'k--',label="$E[H]=%0.2f$" % avgHeight,linewidth=3)
plt.ylim(yy)
xx = plt.xlim()

plt.subplot(2,4,7)
x,y = normal2(data)
plt.plot(x,y,'r-')
plt.xlim(xx)
plt.ylim((0,plt.ylim()[1]))

# --- annual wages (1000 samples of 500 workers, weighted by employment share) ---
plt.subplot(2,4,4)
trials = 1_000
A2 = np.zeros(trials)
for trial in range(trials):
    sample = np.random.choice(national["avg annual wage"],size=500,replace=True,
                              p=national["p(occupation)"])
    A2[trial] = np.mean(sample)
sample = A2/1000
plotHist(sample,probability=True,label="_data",color='lightgray',alpha=.6)
yy = plt.ylim()
yy = (0,1.3*yy[1])
# Raw string / doubled backslash keep "\$" intact for mathtext without relying
# on invalid escape sequences.
plt.plot([E/1000,E/1000],yy,'k--',linewidth=3,label=r"Population Avg: $E[W]=\$%0.2f$" % E,)
plt.ylim(yy)
plt.title("Annual Wage",fontsize=15)
xx = plt.xlim()
plt.xlabel("Sample Average\n($\\times\\$1000$)",fontsize=15);

plt.subplot(2,4,8)
x,y = normal2(sample)
plt.plot(x,y,'r-')
plt.xlim(xx)
plt.ylim((0,plt.ylim()[1]))

plt.tight_layout()

def normal(x,D):
    """Evaluate a normal density fitted to ``D`` at each value in ``x``.

    The curve uses the mean of ``D`` and its standard deviation as computed by
    ``np.std`` with default arguments.
    """
    mu = np.mean(D)
    sigma = np.std(D)
    z = (x - mu) / sigma
    return np.exp(-.5 * z**2) / (sigma * np.sqrt(2*np.pi))

# Evaluate the fitted normal curve on 100 evenly spaced points spanning the sample means.
D = sampleMeans
x = np.linspace(D.min(), D.max(), 100)
y = normal(x, D)

# using the dice roll data:
# draw 500 synthetic values from a normal distribution that has the same
# mean and standard deviation as the sample means
fakeData = np.random.normal(np.mean(sampleMeans), np.std(sampleMeans), 500)

# hide
# Three panels: the sample means, the normal curve fitted to them, and
# synthetic data drawn from that fitted normal.
plt.figure(figsize=(12,4))
plt.subplot(1,3,1)
plotHist(sampleMeans,bins=20,probability=True)
plt.xlabel("Sample Mean\n(Fair Dice Rolls)",fontsize=15)
xx = plt.xlim()
plt.title("Random Dice Rolls Data",fontsize=15)

plt.subplot(1,3,2)
x,y = normal2(sampleMeans)
plt.plot(x,y,'r-')
# Share the histogram's x-range and anchor the y-axis at 0.
plt.xlim(xx)
plt.ylim((0,plt.ylim()[1]))
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.xlabel("Sample Mean\n(Fair Dice Rolls)",fontsize=15)
# The normal curve is continuous, so it is a density rather than a mass function.
plt.ylabel("Probability Density Function",fontsize=15);
plt.title("Normal Distribution fitted to data",fontsize=15)

plt.subplot(1,3,3)
plotHist(fakeData,probability=True,alpha=.5)
plt.title("Synthetic Data",fontsize=15)

plt.tight_layout()

# Compare the synthetic data's mean to the ground-truth expected value E[D]=3.5
# for fair dice rolls (the value is displayed as the cell output).
np.mean(fakeData)

3.4666472954463097

def stdErr(sample):
    """Return the standard error of ``sample``: ``np.std(sample)`` divided by sqrt(len(sample))."""
    spread = np.std(sample)
    return spread / np.sqrt(len(sample))
# apply the function to the synthetic data
se = stdErr(fakeData)
print("Standard error: %0.4f" % se)
# interval of +/- 1.96 standard errors around the sample mean
Es = np.mean(fakeData)
lowerBound, upperBound = Es - 1.96*se, Es + 1.96*se
# Only the second piece is %-formatted; the first contains a literal "95%".
print(
    "Based on our sample, there is a 95% chance that the true population "
    + ("average falls between %0.4f and %0.4f" % (lowerBound,upperBound))
)

Standard error: 0.0238
Based on our sample, there is a 95% chance that the true population average falls between 3.4201 and 3.5132

	year	occupation code	occupation title	national employment	avg annual wage
2705	2019	11-1011	Chief Executives	205890	193850.0
2707	2019	11-1021	General and Operations Managers	2400280	123030.0
2709	2019	11-1031	Legislators	52280	49440.0
2712	2019	11-2011	Advertising and Promotions Managers	25100	141890.0
2714	2019	11-2021	Marketing Managers	263680	149200.0

	year	occupation code	occupation title	national employment	avg annual wage	p(occupation)
2705	2019	11-1011	Chief Executives	205890	193850.0	0.001465
2707	2019	11-1021	General and Operations Managers	2400280	123030.0	0.017074
2709	2019	11-1031	Legislators	52280	49440.0	0.000372
2712	2019	11-2011	Advertising and Promotions Managers	25100	141890.0	0.000179
2714	2019	11-2021	Marketing Managers	263680	149200.0	0.001876

Expected Values

Samples of Dice Rolls

Populations vs. Samples

Example 1: Estimating American adult heights

Populations vs. Samples

Example 2: Estimating Annual Wages for US Workers

The Central Limit Theorem and the Normal "Bell Curve" Distribution