Chapter 9: Hypothesis testing
Contents
Chapter 9: Hypothesis testing#
import pandas as pd
import sidetable
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, t, binom, ttest_ind
import pingouin as pg
import matplotlib
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
pandas2ri.activate()
from rpy2.robjects.conversion import localconverter
# import NHANES package
base = importr('NHANES')
with localconverter(ro.default_converter + pandas2ri.converter):
NHANES = ro.conversion.rpy2py(ro.r['NHANES'])
NHANES = NHANES.drop_duplicates(subset='ID')
NHANES_adult = NHANES.dropna(subset=['PhysActive', 'BMI', 'BPSysAve']).query('Age > 17 and BPSysAve > 0')
rng = np.random.default_rng(123456)
Table 9.1#
sampSize = 250
NHANES_sample = NHANES_adult.dropna(subset=['BPSysAve']).sample(sampSize, random_state=rng)
healthGen_recoder = {'Poor': 1, 'Fair': 2, 'Good': 3, 'Vgood': 4, 'Excellent': 5}
sampleSummary = NHANES_sample.groupby('PhysActive')['BPSysAve'].describe()[['count', 'mean', 'std']]
print(sampleSummary)
sampleSummaryDiff = sampleSummary.diff().loc['Yes',:]
s1, s2 = sampleSummary['std'].values
n1, n2 = sampleSummary['count'].values
welch_df = (s1/n1 + s2/n2)**2 / ((s1/n1)**2/(n1-1) + (s2/n2)**2/(n2-1))
count mean std
PhysActive
No 124.0 122.290323 22.539495
Yes 126.0 118.285714 15.404341
Figure 9.1#
sns.boxplot(data=NHANES_sample, x='PhysActive', y='BPSysAve')
plt.xlabel('Physically active?')
plt.ylabel('Average systolic blood pressure')
Text(0, 0.5, 'Average systolic blood pressure')
Figure 9.2#
distDfNormal = pd.DataFrame({'x': np.arange(-4, 4, .01)})
distDfNormal['dnorm'] = norm.pdf(distDfNormal.x)
distDfNormal['dt4'] = t.pdf(distDfNormal.x, 4)
distDfNormal['dt1000'] = t.pdf(distDfNormal.x, 1000)
fig, ax = plt.subplots(1, 2, figsize=(12,6))
sns.lineplot(data=distDfNormal, x='x', y='dnorm', ax=ax[0], color='red')
sns.lineplot(data=distDfNormal, x='x', y='dt4', ax=ax[0], color='blue', linestyle='dashed')
ax[0].legend(['Normal', 't (df=4)'])
ax[0].set_title('df = 4')
ax[0].set_ylabel('density')
sns.lineplot(data=distDfNormal, x='x', y='dnorm', ax=ax[1], color='red')
sns.lineplot(data=distDfNormal, x='x', y='dt1000', ax=ax[1], color='blue', linestyle='dashed')
ax[1].legend(['Normal', 't (df=1000)'])
ax[1].set_title('df = 1000')
ax[1].set_ylabel('density')
Text(0, 0.5, 'density')
Figure 9.3#
def tossCoins(n=100):
return np.sum(rng.uniform(size=n) > 0.5)
# use a large number of replications since this is fast
coinFlips = np.array([tossCoins() for i in range(100000)])
sns.histplot(coinFlips, binwidth=1)
plt.plot([70, 70], [0, 7000], color='red')
[<matplotlib.lines.Line2D at 0x7fbe747708e0>]
Table 9.2#
def roundToNearest5(x, base = 5):
return(base * np.round(x / base))
squatDf = pd.DataFrame({'group': 5 * ['FB'] + 5 * ['XC'],
'squat': roundToNearest5(
np.hstack((rng.normal(size=5) * 30 + 300,
rng.normal(size=5) * 30 + 140))).astype('int')})
squatDf['shuffledSquat'] = rng.permuted(squatDf.squat.values)
squatDf
group | squat | shuffledSquat | |
---|---|---|---|
0 | FB | 315 | 160 |
1 | FB | 295 | 125 |
2 | FB | 275 | 140 |
3 | FB | 320 | 105 |
4 | FB | 290 | 175 |
5 | XC | 105 | 290 |
6 | XC | 140 | 320 |
7 | XC | 125 | 295 |
8 | XC | 160 | 275 |
9 | XC | 175 | 315 |
Figure 9.4#
fig, ax = plt.subplots(1, 2, figsize=(12,6))
sns.boxplot(data=squatDf, x='group', y='squat', ax=ax[0])
ax[0].set_ylabel('max squat (lbs)')
sns.boxplot(data=squatDf, x='group', y='shuffledSquat', ax=ax[1])
ax[1].set_ylabel('max squat (lbs)')
Text(0, 0.5, 'max squat (lbs)')
Two-group t-test:#
tt = pg.ttest(x=squatDf.query('group == "FB"').squat,
y=squatDf.query('group == "XC"').squat,
alternative='greater', correction=True)
tt
T | dof | alternative | p-val | CI95% | cohen-d | BF10 | power | |
---|---|---|---|---|---|---|---|---|
T-test | 10.604266 | 6.977154 | greater | 0.000007 | [129.76, inf] | 6.706726 | 3708.105 | 1.0 |
Figure 9.5#
nRuns = 10000
def shuffleAndMeasure(df):
dfScram = df.copy()
dfScram['squat'] = rng.permuted(df.squat.values)
tt = pg.ttest(x=dfScram.query('group == "FB"').squat,
y=dfScram.query('group == "XC"').squat,
alternative='greater', correction=True)
return(tt['T'][0])
shuffleDist = np.array([shuffleAndMeasure(squatDf) for i in range(nRuns)])
pvalRandomization = np.mean(shuffleDist >= tt['T'][0])
sns.histplot(shuffleDist, bins=50, stat='density')
distDfNormal['dt8'] = t.pdf(distDfNormal.x, 8)
sns.lineplot(data=distDfNormal, x='x', y='dt8', color='blue', linestyle='dashed')
plt.plot([tt['T'][0], tt['T'][0]], [0, .4], color='black')
[<matplotlib.lines.Line2D at 0x7fbe74479d80>]
Figure 9.6#
def shuffleBPstat(df):
dfScram = df.copy()
dfScram['BPSysAve'] = rng.permuted(df.BPSysAve.values)
tt = pg.ttest(x=dfScram.query('PhysActive == "No"').BPSysAve,
y=dfScram.query('PhysActive == "Yes"').BPSysAve,
correction=True)
return(tt['T'][0])
nRuns = 10000
meanDiffSim = np.array([shuffleBPstat(NHANES_sample) for i in range(nRuns)])
bp_tt = pg.ttest(x=NHANES_sample.query('PhysActive == "No"').BPSysAve,
y=NHANES_sample.query('PhysActive == "Yes"').BPSysAve,
correction=True)
bp_tt
T | dof | alternative | p-val | CI95% | cohen-d | BF10 | power | |
---|---|---|---|---|---|---|---|---|
T-test | 1.637566 | 216.959483 | two-sided | 0.102962 | [-0.82, 8.82] | 0.207749 | 0.492 | 0.373141 |
pvalRandomization = np.mean(meanDiffSim >= tt['T'][0])
sns.histplot(meanDiffSim, bins=50, stat='density')
plt.plot([bp_tt['T'][0], bp_tt['T'][0]], [0, .4], color='black')
plt.xlabel("T stat: BP difference between groups")
Text(0.5, 0, 'T stat: BP difference between groups')
Figure 9.7#
def exerciseTrial(nPerGroup, bpReduction=0.5):
bp_mean = NHANES_adult.BPSysAve.mean()
bp_sd = NHANES_adult.BPSysAve.std()
controlGroup = rng.normal(bp_mean, bp_sd, nPerGroup)
expGroup = rng.normal(bp_mean - bpReduction, bp_sd, nPerGroup)
bp_tt = pg.ttest(x=controlGroup,
y=expGroup,
correction=True)
return([nPerGroup, bpReduction, bp_tt['p-val'][0], controlGroup.mean() - expGroup.mean()])
nRuns = 1000
sampSizes = 2 ** np.arange(5,18) # powers of 2
simResults = []
for i, n in enumerate(sampSizes):
runResults = [exerciseTrial(n) for i in range(nRuns)]
runResultsDf = pd.DataFrame(runResults,
columns=['nPerGroup', 'bpReduction', 'pval', 'diff'])
simResults.append([n, np.mean(runResultsDf.pval < .05), runResultsDf['diff'].mean()])
simResultsDf = pd.DataFrame(simResults, columns=['n', 'psig', 'meandiff'])
p = sns.lineplot(data=simResultsDf, x='n', y='psig')
plt.xscale('log')
plt.ylabel('Proportion of significant results')
plt.xlabel('sample size per group')
_ = plt.xticks(simResultsDf.n, rotation=45)
p.get_xaxis().set_major_formatter(matplotlib.ticker.ScalarFormatter())
Figure 9.8#
nRuns = 1000 # number of simulated studies to run
nTests = 1000000 # number of simulated genes to test in each run
uncAlpha = 0.05 # alpha level
cutoff = norm.ppf(uncAlpha)
cor_cutoff = norm.ppf(uncAlpha/nTests)
uncOutcome = []
corOutcome = []
for i in range(nRuns):
sample = rng.normal(size=nTests)
uncOutcome.append(np.sum(sample < cutoff))
corOutcome.append(np.sum(sample < cor_cutoff))
fig, ax = plt.subplots(1, 2, figsize=(12,6))
sns.histplot(uncOutcome, bins=50, ax=ax[0])
sns.histplot(corOutcome, bins=50, ax=ax[1])
ax[0].set_xlabel(f'Number of significant results (out of {nTests})')
ax[1].set_xlabel(f'Number of significant results (out of {nTests})')
Text(0.5, 0, 'Number of significant results (out of 1000000)')