Chapter 9: Hypothesis testing#

import pandas as pd
import sidetable
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, t, binom, ttest_ind
import pingouin as pg
import matplotlib

import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
pandas2ri.activate()
from rpy2.robjects.conversion import localconverter
# import NHANES package
base = importr('NHANES')

with localconverter(ro.default_converter + pandas2ri.converter):
  NHANES = ro.conversion.rpy2py(ro.r['NHANES'])

 
NHANES = NHANES.drop_duplicates(subset='ID')
NHANES_adult = NHANES.dropna(subset=['PhysActive', 'BMI', 'BPSysAve']).query('Age > 17 and BPSysAve > 0')

rng = np.random.default_rng(123456)

Table 9.1#

sampSize = 250
NHANES_sample = NHANES_adult.dropna(subset=['BPSysAve']).sample(sampSize, random_state=rng)
healthGen_recoder = {'Poor': 1, 'Fair': 2, 'Good': 3, 'Vgood': 4, 'Excellent': 5}

sampleSummary = NHANES_sample.groupby('PhysActive')['BPSysAve'].describe()[['count', 'mean', 'std']]

print(sampleSummary)

sampleSummaryDiff = sampleSummary.diff().loc['Yes',:]

s1, s2 = sampleSummary['std'].values
n1, n2 = sampleSummary['count'].values

welch_df = (s1/n1 + s2/n2)**2 / ((s1/n1)**2/(n1-1) + (s2/n2)**2/(n2-1))
            count        mean        std
PhysActive                              
No          124.0  122.290323  22.539495
Yes         126.0  118.285714  15.404341

Figure 9.1#

sns.boxplot(data=NHANES_sample, x='PhysActive', y='BPSysAve')
plt.xlabel('Physically active?')
plt.ylabel('Average systolic blood pressure')
Text(0, 0.5, 'Average systolic blood pressure')
_images/09-HypothesisTesting_5_1.png

Figure 9.2#

distDfNormal = pd.DataFrame({'x': np.arange(-4, 4, .01)})
distDfNormal['dnorm'] = norm.pdf(distDfNormal.x)
distDfNormal['dt4'] = t.pdf(distDfNormal.x, 4)
distDfNormal['dt1000'] = t.pdf(distDfNormal.x, 1000)

fig, ax = plt.subplots(1, 2, figsize=(12,6))

sns.lineplot(data=distDfNormal, x='x',  y='dnorm', ax=ax[0], color='red')
sns.lineplot(data=distDfNormal, x='x',  y='dt4', ax=ax[0], color='blue', linestyle='dashed')
ax[0].legend(['Normal', 't (df=4)'])
ax[0].set_title('df = 4')
ax[0].set_ylabel('density')

sns.lineplot(data=distDfNormal, x='x',  y='dnorm', ax=ax[1], color='red')
sns.lineplot(data=distDfNormal, x='x',  y='dt1000', ax=ax[1], color='blue', linestyle='dashed')
ax[1].legend(['Normal', 't (df=1000)'])
ax[1].set_title('df = 1000')
ax[1].set_ylabel('density')
Text(0, 0.5, 'density')
_images/09-HypothesisTesting_7_1.png

Figure 9.3#

def tossCoins(n=100):
    return np.sum(rng.uniform(size=n) > 0.5)

# use a large number of replications since this is fast
coinFlips = np.array([tossCoins() for i in range(100000)])

sns.histplot(coinFlips, binwidth=1)
plt.plot([70, 70], [0, 7000], color='red')
[<matplotlib.lines.Line2D at 0x7fbe747708e0>]
_images/09-HypothesisTesting_9_1.png

Table 9.2#

def roundToNearest5(x, base = 5):
  return(base * np.round(x / base))

squatDf = pd.DataFrame({'group': 5 * ['FB'] + 5 * ['XC'],
                       'squat': roundToNearest5(
                           np.hstack((rng.normal(size=5) * 30 + 300,
                                      rng.normal(size=5) * 30 + 140))).astype('int')})

squatDf['shuffledSquat'] = rng.permuted(squatDf.squat.values)

squatDf
group squat shuffledSquat
0 FB 315 160
1 FB 295 125
2 FB 275 140
3 FB 320 105
4 FB 290 175
5 XC 105 290
6 XC 140 320
7 XC 125 295
8 XC 160 275
9 XC 175 315

Figure 9.4#

fig, ax = plt.subplots(1, 2, figsize=(12,6))

sns.boxplot(data=squatDf, x='group', y='squat', ax=ax[0])
ax[0].set_ylabel('max squat (lbs)')

sns.boxplot(data=squatDf, x='group', y='shuffledSquat', ax=ax[1])
ax[1].set_ylabel('max squat (lbs)')
Text(0, 0.5, 'max squat (lbs)')
_images/09-HypothesisTesting_13_1.png

Two-group t-test:#

tt = pg.ttest(x=squatDf.query('group == "FB"').squat, 
              y=squatDf.query('group == "XC"').squat,
              alternative='greater', correction=True)
tt
T dof alternative p-val CI95% cohen-d BF10 power
T-test 10.604266 6.977154 greater 0.000007 [129.76, inf] 6.706726 3708.105 1.0

Figure 9.5#

nRuns = 10000

def shuffleAndMeasure(df):
    dfScram = df.copy()
    dfScram['squat'] = rng.permuted(df.squat.values)
    tt = pg.ttest(x=dfScram.query('group == "FB"').squat, 
                  y=dfScram.query('group == "XC"').squat,
                  alternative='greater', correction=True)
    return(tt['T'][0])

shuffleDist = np.array([shuffleAndMeasure(squatDf) for i in range(nRuns)])
pvalRandomization = np.mean(shuffleDist >= tt['T'][0])

sns.histplot(shuffleDist, bins=50, stat='density')
distDfNormal['dt8'] = t.pdf(distDfNormal.x, 8)
sns.lineplot(data=distDfNormal, x='x',  y='dt8', color='blue', linestyle='dashed')
plt.plot([tt['T'][0], tt['T'][0]], [0, .4], color='black')
[<matplotlib.lines.Line2D at 0x7fbe74479d80>]
_images/09-HypothesisTesting_18_1.png

Figure 9.6#

def shuffleBPstat(df):
    dfScram = df.copy()
    dfScram['BPSysAve'] = rng.permuted(df.BPSysAve.values)
    tt = pg.ttest(x=dfScram.query('PhysActive == "No"').BPSysAve, 
                  y=dfScram.query('PhysActive == "Yes"').BPSysAve,
                  correction=True)
    return(tt['T'][0])

nRuns = 10000
meanDiffSim = np.array([shuffleBPstat(NHANES_sample) for i in range(nRuns)])
bp_tt = pg.ttest(x=NHANES_sample.query('PhysActive == "No"').BPSysAve, 
                  y=NHANES_sample.query('PhysActive == "Yes"').BPSysAve,
                  correction=True)
bp_tt
T dof alternative p-val CI95% cohen-d BF10 power
T-test 1.637566 216.959483 two-sided 0.102962 [-0.82, 8.82] 0.207749 0.492 0.373141
pvalRandomization = np.mean(meanDiffSim >= tt['T'][0])

sns.histplot(meanDiffSim, bins=50, stat='density')
plt.plot([bp_tt['T'][0], bp_tt['T'][0]], [0, .4], color='black')
plt.xlabel("T stat: BP difference between groups")
Text(0.5, 0, 'T stat: BP difference between groups')
_images/09-HypothesisTesting_22_1.png

Figure 9.7#

def exerciseTrial(nPerGroup, bpReduction=0.5):
    bp_mean = NHANES_adult.BPSysAve.mean()
    bp_sd = NHANES_adult.BPSysAve.std()
    controlGroup = rng.normal(bp_mean, bp_sd, nPerGroup)
    expGroup = rng.normal(bp_mean - bpReduction, bp_sd, nPerGroup)
    bp_tt = pg.ttest(x=controlGroup,
                     y=expGroup,
                     correction=True)
    return([nPerGroup, bpReduction, bp_tt['p-val'][0], controlGroup.mean() - expGroup.mean()])



nRuns = 1000
sampSizes = 2 ** np.arange(5,18) # powers of 2
simResults = []

for i, n in enumerate(sampSizes):
    runResults = [exerciseTrial(n) for i in range(nRuns)]
    runResultsDf = pd.DataFrame(runResults,
                                columns=['nPerGroup', 'bpReduction', 'pval', 'diff'])
    simResults.append([n, np.mean(runResultsDf.pval < .05), runResultsDf['diff'].mean()])

simResultsDf = pd.DataFrame(simResults, columns=['n', 'psig', 'meandiff'])
p = sns.lineplot(data=simResultsDf, x='n', y='psig')
plt.xscale('log')
plt.ylabel('Proportion of significant results')
plt.xlabel('sample size per group') 
_ = plt.xticks(simResultsDf.n, rotation=45)
p.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
_images/09-HypothesisTesting_25_0.png

Figure 9.8#

nRuns = 1000 # number of simulated studies to run
nTests = 1000000 # number of simulated genes to test in each run

uncAlpha = 0.05 # alpha level
cutoff = norm.ppf(uncAlpha)
cor_cutoff = norm.ppf(uncAlpha/nTests)

uncOutcome = []
corOutcome = []
for i in range(nRuns):
    sample = rng.normal(size=nTests)
    uncOutcome.append(np.sum(sample < cutoff))
    corOutcome.append(np.sum(sample < cor_cutoff))

fig, ax = plt.subplots(1, 2, figsize=(12,6))
sns.histplot(uncOutcome, bins=50, ax=ax[0])
sns.histplot(corOutcome, bins=50, ax=ax[1])
ax[0].set_xlabel(f'Number of significant results (out of {nTests})')
ax[1].set_xlabel(f'Number of significant results (out of {nTests})')
Text(0.5, 0, 'Number of significant results (out of 1000000)')
_images/09-HypothesisTesting_27_1.png