Chapter 6: Probability

import pandas as pd
import sidetable
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, binom


import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
pandas2ri.activate()
from rpy2.robjects.conversion import localconverter
# Import the NHANES R package; the NHANES data set is looked up from R below.
base = importr('NHANES')

# Fetch the R object named 'NHANES' and convert it to a pandas object using
# the default converter combined with the pandas converter.
r_to_pandas = ro.default_converter + pandas2ri.converter
with localconverter(r_to_pandas):
    NHANES = ro.conversion.rpy2py(ro.r['NHANES'])

# Keep only the first row for each ID.
NHANES = NHANES.drop_duplicates(subset='ID')
# Flag rows whose Age is below 18.
NHANES['isChild'] = NHANES['Age'] < 18
# Rows with a recorded Height and an Age above 17.
NHANES_adult = NHANES.dropna(subset=['Height']).loc[lambda df: df['Age'] > 17]

# Seeded random state shared by the simulations that follow.
rng = np.random.RandomState(123)

Figure 6.1

# Two side-by-side panels: simulated coin flips on the left, election
# returns on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
coin_ax, election_ax = ax
nsamples = 30000

# Left panel: running proportion of 1s over `nsamples` Bernoulli(0.5) draws.
pDf = pd.DataFrame({
    'trial_number': range(1, nsamples + 1),
    'outcomes': rng.binomial(1, .5, nsamples),
})
pDf['mean_probability'] = pDf['outcomes'].cumsum() / pDf['trial_number']
# Only trials from the 10th onward are plotted.
pDf = pDf[pDf['trial_number'] >= 10]

sns.lineplot(x='trial_number', y='mean_probability', data=pDf, ax=coin_ax)
# Horizontal reference line at 0.5 across the plotted range.
coin_ax.plot([10, nsamples], [0.5, 0.5])
coin_ax.set_xlabel("Number of trials")
coin_ax.set_ylabel("Estimated probability of heads")

# Right panel: one line per candidate, reshaped from wide to long format
# with 'pctResp' kept as the identifier column.
electionReturns = pd.read_csv('https://raw.githubusercontent.com/statsthinking21/statsthinking21-figures-data/main/03/alabama_election_returns.csv')
electionReturns = electionReturns.melt(
    id_vars=['pctResp'],
    var_name='Candidate',
    value_name='pctVotes',
)
sns.lineplot(x='pctResp', y='pctVotes', hue='Candidate',
             data=electionReturns, ax=election_ax)
election_ax.set_xlabel("Percentage of precincts reporting")
election_ax.set_ylabel("Percentage of votes")
Text(0, 0.5, 'Percentage of votes')
_images/06-Probability_3_1.png

Figure 6.2

# 6x6 matrix of zeros whose last row and last column are set to 1, so the
# 'bwr' colormap draws those cells in its other extreme color.
grid_size = 6
imgmtx = np.zeros((grid_size, grid_size))
imgmtx[-1, :] = 1
imgmtx[:, -1] = 1

plt.imshow(imgmtx, cmap='bwr')
axes = plt.gca()

# Ticks sit on the half-integer positions between cells, so the major grid
# lines drawn at the ticks separate the cells.
cell_edges = np.arange(0.5, grid_size)
plt.xticks(cell_edges)
plt.yticks(cell_edges)
axes.grid(color='white', linestyle='-', linewidth=1, which='major')

# Label every cell with its 1-based "i,j" pair, offset slightly from the
# cell position (i, j).
for i, j in np.ndindex(grid_size, grid_size):
    axes.annotate(f'{i+1},{j+1}', [i-0.2, j+0.1], color='white')

# Hide the tick labels on both axes; the ticks are only there for the grid.
plt.setp(axes.get_xticklabels(), visible=False)
_ = plt.setp(axes.get_yticklabels(), visible=False)
_images/06-Probability_5_0.png

Table 6.1

# Binomial distribution with 4 trials and success probability 0.91:
# probability of exactly k successes and of at most k successes, k = 0..4.
n_attempts = 4
p_success = 0.91
success_counts = list(range(n_attempts + 1))

curry_df = pd.DataFrame({
    'numSuccesses': success_counts,
    'Probability': binom.pmf(success_counts, n_attempts, p_success),
    'CumulativeProbability': binom.cdf(success_counts, n_attempts, p_success),
})

curry_df
numSuccesses Probability CumulativeProbability
0 0 0.000066 0.000066
1 1 0.002654 0.002719
2 2 0.040246 0.042965
3 3 0.271286 0.314250
4 4 0.685750 1.000000

Table 6.2

# Rows of NHANES_adult where both PhysActive and Diabetes are present.
NHANES_diabetes_activity = NHANES_adult[['PhysActive', 'Diabetes']].dropna()

# Marginal counts and probabilities for Diabetes.
# The counts Series is renamed explicitly before becoming a frame:
# pd.DataFrame(series, columns=['counts']) only labels the column when the
# Series is unnamed, and value_counts() returns a Series named 'count' on
# pandas >= 2.0, which would leave 'counts' filled with NaN.
diabetes_summary = (
    NHANES_diabetes_activity.value_counts('Diabetes')
    .rename('counts')
    .to_frame())
diabetes_summary['prob'] = diabetes_summary['counts'] / NHANES_diabetes_activity.shape[0]
print(diabetes_summary)

# Marginal counts and probabilities for PhysActive, built the same way.
activity_summary = (
    NHANES_diabetes_activity.value_counts('PhysActive')
    .rename('counts')
    .to_frame())
activity_summary['prob'] = activity_summary['counts'] / NHANES_diabetes_activity.shape[0]
print(activity_summary)
          counts      prob
Diabetes                  
No          4251  0.887659
Yes          538  0.112341
            counts      prob
PhysActive                  
Yes           2456  0.512842
No            2333  0.487158

Table 6.3

# Joint counts for every (Diabetes, PhysActive) combination.
# The Series is renamed before conversion so the column is called 'counts'
# regardless of the name value_counts() assigns (it is 'count' on
# pandas >= 2.0, which pd.DataFrame(..., columns=['counts']) would not match).
NHANES_diabetes_stats_by_activity = (
    NHANES_diabetes_activity.value_counts(['Diabetes', 'PhysActive'])
    .rename('counts')
    .to_frame())
# Joint probability: divide the 'counts' column (not the whole frame) by the
# number of rows.
NHANES_diabetes_stats_by_activity['prob'] = (
    NHANES_diabetes_stats_by_activity['counts'] / NHANES_diabetes_activity.shape[0])

NHANES_diabetes_stats_by_activity
counts prob
Diabetes PhysActive
No Yes 2261 0.472124
No 1990 0.415536
Yes No 343 0.071622
Yes 195 0.040718

Table 6.4

# Rows of NHANES_adult with both PhysActive and DaysMentHlthBad present.
# .copy() makes this an independent frame, so adding a column below does not
# write onto a filtered selection (which can raise SettingWithCopyWarning).
NHANES_mh = NHANES_adult.dropna(subset=['PhysActive', 'DaysMentHlthBad']).copy()
# More than 7 in DaysMentHlthBad is labelled 'Bad Mental Health', otherwise
# 'Good Mental Health'; np.where applies the rule to the whole column at once.
NHANES_mh['badMentalHealth'] = np.where(
    NHANES_mh.DaysMentHlthBad > 7, 'Bad Mental Health', 'Good Mental Health')

# Joint proportions over the whole table (normalize='all'), with row and
# column totals added by margins=True.
NHANES_mentalhealth_by_physactive_counts = pd.crosstab(
            NHANES_mh.PhysActive, 
            NHANES_mh.badMentalHealth,
            margins = True, normalize='all')
NHANES_mentalhealth_by_physactive_counts
badMentalHealth Bad Mental Health Good Mental Health All
PhysActive
No 0.085177 0.402088 0.487265
Yes 0.060543 0.452192 0.512735
All 0.145720 0.854280 1.000000

Table 6.5

# Proportions of each badMentalHealth label within each PhysActive row
# (normalize='index'); margins=True requests the 'All' margin as well.
NHANES_mentalhealth_by_physactive_condp = pd.crosstab(
    index=NHANES_mh['PhysActive'],
    columns=NHANES_mh['badMentalHealth'],
    normalize='index',
    margins=True,
)

NHANES_mentalhealth_by_physactive_condp
badMentalHealth Bad Mental Health Good Mental Health
PhysActive
No 0.174807 0.825193
Yes 0.118078 0.881922
All 0.145720 0.854280