Chapter 8: Resampling and simulation
Contents
Chapter 8: Resampling and simulation#
library(tidyverse)
library(ggplot2)
library(cowplot)
library(knitr)
theme_set(theme_minimal(base_size = 14))
set.seed(123456) # set random seed to exactly replicate results
# load the NHANES data library
library(NHANES)
# drop duplicated IDs within the NHANES dataset
NHANES <- NHANES %>%
dplyr::distinct(ID,.keep_all=TRUE)
NHANES_adult <- NHANES %>%
drop_na(Height) %>%
subset(Age>=18)
── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.1 ✔ purrr 1.0.1
✔ tibble 3.1.8 ✔ dplyr 1.1.0
✔ tidyr 1.3.0 ✔ stringr 1.5.0
✔ readr 2.1.4 ✔ forcats 1.0.0
── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
Figure 8.1#
p1 <-
tibble(
x = runif(10000)
) %>%
ggplot((aes(x))) +
geom_histogram(bins = 100) +
labs(title = "Uniform")
p2 <-
tibble(
x = rnorm(10000)
) %>%
ggplot(aes(x)) +
geom_histogram(bins = 100) +
labs(title = "Normal")
plot_grid(p1, p2, ncol = 3)
Figure 8.2#
finishTimeDf <- tibble(finishTime=rnorm(3*150,mean=5,sd=1),
quiz=kronecker(c(1:3),rep(1,150)))
ggplot(finishTimeDf,aes(finishTime)) +
geom_histogram(bins=25) +
facet_grid(. ~ quiz) +
xlim(0,10)
Warning message:
“Removed 6 rows containing missing values (`geom_bar()`).”
Figure 8.3#
# sample maximum value 5000 times and compute 99th percentile
nRuns <- 5000
sampSize <- 150
sampleMax <- function(sampSize = 150) {
samp <- rnorm(sampSize, mean = 5, sd = 1)
return(max(samp))
}
maxTime <- replicate(nRuns, sampleMax())
cutoff <- quantile(maxTime, 0.99)
tibble(maxTime) %>%
ggplot(aes(maxTime)) +
geom_histogram(bins = 100) +
geom_vline(xintercept = cutoff, color = "red")
Figure 8.4#
# perform the bootstrap to compute SEM and compare to parametric method
nRuns <- 2500
sampleSize <- 32
heightSample <-
NHANES_adult %>%
sample_n(sampleSize)
bootMeanHeight <- function(df) {
bootSample <- sample_n(df, dim(df)[1], replace = TRUE)
return(mean(bootSample$Height))
}
bootMeans <- replicate(nRuns, bootMeanHeight(heightSample))
SEM_standard <- sd(heightSample$Height) / sqrt(sampleSize)
SEM_bootstrap <- sd(bootMeans)
options(pillar.sigfig = 3)
tibble(bootMeans=bootMeans) %>%
ggplot(aes(bootMeans)) +
geom_histogram(aes(y=after_stat(density)),bins=50) +
stat_function(fun = dnorm, n = 100,
args = list(mean = mean(heightSample$Height),
sd = SEM_standard),
linewidth=1.5,color='red'
)