Chapter 11: Bayesian statistics
Contents
Chapter 11: Bayesian statistics#
library(tidyverse)
library(ggplot2)
library(cowplot)
library(boot)
library(MASS)
library(BayesFactor)
library(knitr)
theme_set(theme_minimal(base_size = 14))
set.seed(123456) # set random seed to exactly replicate results

# load the NHANES data library
library(NHANES)

# drop duplicated IDs within the NHANES dataset
NHANES <-
  NHANES %>%
  dplyr::distinct(ID, .keep_all = TRUE)

# adults (18+) with a recorded weight; use dplyr::filter rather than
# base subset() for consistency with the dplyr pipeline style used
# throughout this file (namespaced, since MASS is attached and masks
# some dplyr verbs)
NHANES_adult <-
  NHANES %>%
  drop_na(Weight) %>%
  dplyr::filter(Age >= 18)
── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.1 ✔ purrr 1.0.1
✔ tibble 3.1.8 ✔ dplyr 1.1.0
✔ tidyr 1.3.0 ✔ stringr 1.5.0
✔ readr 2.1.4 ✔ forcats 1.0.0
── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
Attaching package: ‘MASS’
The following object is masked from ‘package:dplyr’:
select
Loading required package: coda
Loading required package: Matrix
Attaching package: ‘Matrix’
The following objects are masked from ‘package:tidyr’:
expand, pack, unpack
************
Welcome to BayesFactor 0.9.12-4.4. If you have questions, please contact Richard Morey (richarddmorey@gmail.com).
Type BFManual() to open the manual.
************
Doing Bayesian estimation#
# Bayes' rule for a diagnostic-testing example: probability of disease
# given 3/3 positive tests, a 1-in-a-million prior, and a test with
# 99% sensitivity and 99% specificity.
bayes_df <- data.frame(
  prior = NA,
  likelihood = NA,
  marginal_likelihood = NA,
  posterior = NA
)

# prior probability of disease: one in a million
bayes_df$prior <- 1 / 1000000

nTests <- 3
nPositives <- 3
sensitivity <- 0.99
specificity <- 0.99

# likelihood of 3/3 positive tests given disease present
# (fix: use the sensitivity variable instead of a hard-coded 0.99,
# so changing sensitivity above propagates correctly)
bayes_df$likelihood <- dbinom(nPositives, nTests, sensitivity)

# marginal likelihood: average the likelihood of the data over both
# hypotheses (disease present vs. absent), weighted by the prior.
# false positives occur with probability 1 - specificity.
bayes_df$marginal_likelihood <-
  dbinom(
    x = nPositives,
    size = nTests,
    prob = sensitivity
  ) * bayes_df$prior +
  dbinom(
    x = nPositives,
    size = nTests,
    prob = 1 - specificity
  ) *
    (1 - bayes_df$prior)

# posterior probability of disease given the positive tests
bayes_df$posterior <-
  (bayes_df$likelihood * bayes_df$prior) / bayes_df$marginal_likelihood
Figure 11.2#
# create a table with results
nResponders <- 64
nTested <- 100

drugDf <- tibble(
  outcome = c("improved", "not improved"),
  number = c(nResponders, nTested - nResponders)
)

# likelihood of each possible number of responders (out of 100)
# under three candidate response probabilities
likeDf <-
  tibble(resp = seq(1, 99, 1)) %>%
  mutate(
    presp = resp / 100,
    likelihood5 = dbinom(x = resp, size = 100, prob = 0.5),
    likelihood7 = dbinom(x = resp, size = 100, prob = 0.7),
    likelihood3 = dbinom(x = resp, size = 100, prob = 0.3)
  )

# solid: p = 0.5; dotted: p = 0.7; dashed: p = 0.3;
# the blue vertical line marks the observed number of responders
ggplot(likeDf, aes(x = resp, y = likelihood5)) +
  geom_line() +
  labs(x = 'number of responders', y = 'likelihood') +
  geom_vline(xintercept = drugDf$number[1], color = 'blue') +
  geom_line(aes(y = likelihood7), linetype = 'dotted') +
  geom_line(aes(y = likelihood3), linetype = 'dashed')
Figure 11.3#
# compute marginal likelihood
likeDf <-
  likeDf %>%
  mutate(uniform_prior = array(1 / n()))

# multiply the likelihood of the observed data under each candidate
# response probability by its (uniform) prior, then sum
marginal_likelihood <-
  sum(
    dbinom(
      x = nResponders, # the number who responded to the drug
      size = 100, # the number tested
      prob = likeDf$presp # candidate response probabilities (.01 to .99)
    ) * likeDf$uniform_prior
  )
# Create data for use in figure: likelihood, prior, and posterior over
# a quantized grid of response probabilities (.01 to .99 by .01)
bayesDf <-
  tibble(steps = seq(from = 0.01, to = 0.99, by = 0.01)) %>%
  mutate(
    likelihoods = dbinom(x = nResponders, size = 100, prob = steps),
    priors = dunif(steps) / length(steps),
    posteriors = (likelihoods * priors) / marginal_likelihood
  )
# compute MAP estimate: the grid value with the highest posterior
MAP_estimate <-
  bayesDf %>%
  slice_max(posteriors, n = 1, with_ties = FALSE) %>%
  pull(steps)
# plot the posterior (solid line) against the uniform prior (dotted)
# over the quantized grid of response probabilities (.01 to .99 in
# steps of 0.01); the diamond (shape 9) marks the MAP estimate
ggplot(bayesDf,aes(steps,posteriors)) +
  geom_line() +
  geom_line(aes(steps,priors),color='black',linetype='dotted') +
  # NOTE(review): the values plotted are posterior probabilities of
  # each response rate given the data, so the phrase "of the observed
  # data" in the y label looks misleading -- confirm intended wording
  xlab('p(respond)') + ylab('posterior probability of the observed data') +
  annotate(
    "point",
    x = MAP_estimate,
    y = max(bayesDf$posteriors), shape=9,
    size = 3
  )
Figure 11.4#
# compute likelihoods for data under all values of p(heads)
# using a flat or empirical prior.
# here we use the quantized values from .01 to .99 in steps of 0.01
df <-
  tibble(steps = seq(from = 0.01, to = 0.99, by = 0.01)) %>%
  mutate(
    likelihoods = dbinom(x = nResponders, size = 100, prob = steps),
    priors_flat = dunif(steps) / sum(dunif(steps)),
    priors_empirical = dbinom(10, 20, steps) / sum(dbinom(10, 20, steps))
  )

# marginal likelihood under each prior: sum over the grid of
# likelihood times prior (likelihoods column holds the same values
# as dbinom(nResponders, 100, steps))
marginal_likelihood_flat <- sum(df$likelihoods * df$priors_flat)
marginal_likelihood_empirical <- sum(df$likelihoods * df$priors_empirical)

# normalize to obtain the posterior under each prior
df <-
  df %>%
  mutate(
    posteriors_flat =
      (likelihoods * priors_flat) / marginal_likelihood_flat,
    posteriors_empirical =
      (likelihoods * priors_empirical) / marginal_likelihood_empirical
  )

# blue: posterior under the flat prior; red: posterior under the
# empirical prior; dotted: the empirical prior itself
p1 <- ggplot(df, aes(steps, posteriors_flat)) +
  geom_line(color = "blue") +
  labs(x = "p(heads)", y = "Posterior probability") +
  geom_line(aes(y = posteriors_empirical), color = "red") +
  geom_line(aes(y = priors_empirical), linetype = "dotted")
# compute likelihoods for data under all values of p(heads) using strong prior.
df <-
  df %>%
  mutate(
    priors_strong = dbinom(250, 500, steps) / sum(dbinom(250, 500, steps))
  )

# marginal likelihood under the strong prior (likelihoods column holds
# dbinom(nResponders, 100, steps))
marginal_likelihood_strong <- sum(df$likelihoods * df$priors_strong)

df <-
  df %>%
  mutate(
    posteriors_strongprior =
      (likelihoods * priors_strong) / marginal_likelihood_strong
  )

# blue: posterior under the empirical prior; red: posterior under the
# strong prior; dotted: the strong prior itself
p2 <- ggplot(df, aes(steps, posteriors_empirical)) +
  geom_line(color = 'blue') +
  labs(x = 'p(heads)', y = 'Posterior probability') +
  geom_line(aes(y = posteriors_strongprior), color = 'red') +
  geom_line(aes(y = priors_strong), linetype = 'dotted')
# compute likelihoods for data under all values of p(respond) using absolute prior.
df <-
  df %>%
  mutate(
    # prior places mass only on response rates >= 0.8, renormalized to
    # sum to one (equivalent to the zero-vector + if_else construction)
    priors_absolute = as.numeric(steps >= 0.8),
    priors_absolute = priors_absolute / sum(priors_absolute)
  )

# marginal likelihood under the absolute prior
marginal_likelihood_absolute <- sum(df$likelihoods * df$priors_absolute)

df <-
  df %>%
  mutate(
    posteriors_absolute =
      (likelihoods * priors_absolute) / marginal_likelihood_absolute
  )

# posterior in blue; the prior, rescaled for visibility, as dotted line
p3 <- ggplot(df, aes(steps, posteriors_absolute)) +
  geom_line(color = 'blue') +
  labs(x = 'p(heads)', y = 'Posterior probability') +
  ylim(0, max(df$posteriors_absolute) * 1.1) +
  geom_line(
    aes(y = priors_absolute * max(posteriors_absolute) * 20),
    linetype = 'dotted',
    linewidth = 1
  )

plot_grid(p1, p2, p3, labels = 'AUTO')
Figure 11.5#
# create simulated data for drug trial example
set.seed(1234567)
nsubs <- 40
effect_size <- 0.6

# randomize individuals to drug (1) or placebo (0); the drug group's
# change score is shifted down by effect_size on average
# (RNG call order -- runif then rnorm -- matches the original so the
# simulated data are identical)
drugDf <-
  tibble(group = as.integer(runif(nsubs) > 0.5)) %>%
  mutate(hbchange = rnorm(nsubs) - group * effect_size)

# boxplot of change by group with a dotted reference line at zero
drugDf %>%
  mutate(group = as.factor(if_else(group == 1L, "Drug", "Placebo"))) %>%
  ggplot(aes(group, hbchange)) +
  geom_boxplot() +
  annotate(
    "segment",
    x = 0.5, xend = 2.5, y = 0, yend = 0,
    linetype = "dotted"
  ) +
  labs(
    x = "",
    y = "Change in hemoglobin A1C"
  )
T-test for drug example#
# compute t-test for drug example
# group is coded 0 (placebo) / 1 (drug); with alternative = "greater",
# t.test tests mean(group 0) - mean(group 1) > 0, i.e. that the drug
# group's change in hemoglobin A1C is lower than placebo's (the drug
# effect is subtracted from hbchange in the simulation above)
drugTT <- t.test(hbchange ~ group, alternative = "greater", data = drugDf)
print(drugTT)
Welch Two Sample t-test
data: hbchange by group
t = 2.0813, df = 32.091, p-value = 0.02273
alternative hypothesis: true difference in means between group 0 and group 1 is greater than 0
95 percent confidence interval:
0.1057051 Inf
sample estimates:
mean in group 0 mean in group 1
-0.08248954 -0.65013353
Bayes factor for drug example#
# compute Bayes factor for drug data
# nullInterval = c(0, Inf) splits the alternative into two directional
# hypotheses, producing one BF for 0 < d < Inf and one for its
# complement. Convert the tibble to a data.frame up front to avoid
# ttestBF's "data coerced from tibble to data frame" warning;
# the result is unchanged.
bf_drug <- ttestBF(
  formula = hbchange ~ group,
  data = as.data.frame(drugDf),
  nullInterval = c(0, Inf)
)
bf_drug
Warning message:
“data coerced from tibble to data frame”
Bayes factor analysis
--------------
[1] Alt., r=0.707 0<d<Inf : 3.369297 ±0%
[2] Alt., r=0.707 !(0<d<Inf) : 0.115034 ±0.01%
Against denominator:
Null, mu1-mu2 = 0
---
Bayes factor type: BFindepSample, JZS
Bayes factor for one-sided tests#
# ratio of the two directional Bayes factors: evidence for an effect in
# the hypothesized direction (0 < d < Inf) relative to the opposite
# direction, since both share the same point-null denominator
bf_drug[1]/bf_drug[2]
Bayes factor analysis
--------------
[1] Alt., r=0.707 0<d<Inf : 29.28958 ±0.01%
Against denominator:
Alternative, r = 0.707106781186548, mu =/= 0 !(0<d<Inf)
---
Bayes factor type: BFindepSample, JZS
Table 11.2#
# Compute credible intervals for example via rejection sampling
nsamples <- 100000

# draw candidate values of p(respond) uniformly on [0, 1], plus a
# second uniform variate used for the accept/reject decision
x <- runif(nsamples)
y <- runif(nsamples)

# f(x): binomial likelihood of the observed data at each candidate
fx <- dbinom(x = nResponders, size = 100, prob = x)

# keep candidates where y falls below f(x), so acceptance probability
# is proportional to the likelihood
accept <- which(y < fx)
accepted_samples <- x[accept]

# central 95% interval of the accepted samples
credible_interval <- quantile(accepted_samples, probs = c(0.025, 0.975))

kable(credible_interval, caption='Credible interval obtained for the pain drug example using rejection sampling.')
Table: Credible interval obtained for the pain drug example using rejection sampling.
| | x|
|:-----|---------:|
|2.5% | 0.5385021|
|97.5% | 0.7275576|
Figure 11.6#
# plot density of the accepted samples, with the 95% credible interval
# bounds drawn as vertical blue segments.
# Fixes: use <- for assignment; replace the deprecated `lwd` argument
# with `linewidth` (consistent with its use earlier in this file); and
# pass both interval bounds to a single vectorized annotate() call
# instead of looping.
p <- ggplot(data.frame(samples = accepted_samples), aes(samples)) +
  geom_density() +
  annotate(
    "segment",
    x = credible_interval, xend = credible_interval,
    y = 0, yend = 2,
    col = "blue", linewidth = 1
  )
print(p)