Chapter 18: Doing reproducible research#

library(tidyverse)
library(ggplot2)
library(cowplot)

set.seed(123456) # set random seed to exactly replicate results

# setup colorblind palette
# from http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/#a-colorblind-friendly-palette
# The palette with grey:
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
 ggplot2 3.4.1      purrr   1.0.1
 tibble  3.1.8      dplyr   1.1.0
 tidyr   1.3.0      stringr 1.5.0
 readr   2.1.4      forcats 1.0.0
── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
 dplyr::filter() masks stats::filter()
 dplyr::lag()    masks stats::lag()

Figure 1#

alpha=0.05  # false positive rate
beta = seq(1.,0.05,-0.05)  # false negative rate
powerVals = 1-beta
priorVals=c(.01,0.1,0.5,0.9)

nstudies=100

df=data.frame(power=rep(powerVals,length(priorVals))) %>%
  mutate(priorVal=kronecker(priorVals,rep(1,length(powerVals))),
         `P(H is true)` = as.factor(priorVal),
         alpha=alpha)


# Positive Predictive Value (PPV) - the likelihood that a positive finding is true
PPV = function(df) {
  df$PPV = (df$power*df$priorVal)/(df$power*df$priorVal + df$alpha*(1-df$priorVal))
  return(df)
}

df=PPV(df)
ggplot(df,aes(power,PPV,linetype=`P(H is true)`)) +
  geom_line(linewidth=1) +
  ylim(0,1) +
  xlim(0,1) +
  ylab('Posterior predictive value (PPV)')
_images/18-DoingReproducibleResearch_3_0.svg

Figure 2#

trueEffectSize=0.2
dfCurse=data.frame(sampSize=seq(20,300,20)) %>%
  mutate(effectSize=trueEffectSize,
         alpha=0.05)

simCurse = function(df,nruns=1000){
  sigResults=0
  sigEffects=c()
  for (i in 1:nruns){
    tmpData=rnorm(df$sampSize,mean=df$effectSize,sd=1)
    ttestResult=t.test(tmpData)
    if (ttestResult$p.value<df$alpha){
      sigResults = sigResults + 1
      sigEffects=c(sigEffects,ttestResult$estimate)
    }
  }
  df$power=sigResults/nruns
  df$effectSizeEstimate=mean(sigEffects)
  return(df)
}

dfCurse = dfCurse %>% group_by(sampSize) %>% do(simCurse(.))

p1 <- ggplot(dfCurse,aes(power,effectSizeEstimate)) +
  geom_line(linewidth=1) +
  ylim(0,max(dfCurse$effectSizeEstimate)*1.2) +
  geom_hline(yintercept = trueEffectSize,size=1,linetype='dotted',color='red') +
  ylab('Effect size estimate')

# single

sampSize=60
effectSize=0.2
nruns=1000
alpha=0.05
df=data.frame(idx=seq(1,nruns)) %>%
  mutate(pval=NA,
         estimate=NA)

for (i in 1:nruns){
  tmpData=rnorm(sampSize,mean=effectSize,sd=1)
  ttestResult=t.test(tmpData)
  df$pval[i]=ttestResult$p.value
  df$estimate[i]=ttestResult$estimate
}
df = df %>%
  mutate(significant=pval<alpha) %>%
  group_by(significant)

power=mean(df$pval<alpha)

meanSigEffect=mean(df$estimate[df$pval<alpha])

meanTrueEffect=mean(df$estimate)

p2 <- ggplot(df,aes(estimate,fill=significant)) +
  geom_histogram(bins=50)

plot_grid(p1, p2)
Warning message:
“Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
 Please use `linewidth` instead.”
_images/18-DoingReproducibleResearch_5_1.svg