Chapter 4: Data Visualization#

library(tidyverse)
library(ggplot2)
library(cowplot)
library(knitr)
library(NHANES)
library(ggbeeswarm)

set.seed(1234567)

# drop duplicated IDs within the NHANES dataset
NHANES=NHANES %>% dplyr::distinct(ID,.keep_all=TRUE)

NHANES$isChild <- NHANES$Age<18

NHANES_adult=NHANES %>%
  drop_na(Height) %>%
  subset(subset=Age>=18)

theme_set(theme_minimal(base_size = 14))

# setup colorblind palette
# from http://www.cookbook-r.com/Graphs/Colors_(ggplot2)/#a-colorblind-friendly-palette
# The palette with grey:
cbPalette <- c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
 ggplot2 3.4.1      purrr   1.0.1
 tibble  3.1.8      dplyr   1.1.0
 tidyr   1.3.0      stringr 1.5.0
 readr   2.1.4      forcats 1.0.0
── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
 dplyr::filter() masks stats::filter()
 dplyr::lag()    masks stats::lag()

Figure 4.2#

oringDf <- read.table("https://raw.githubusercontent.com/statsthinking21/statsthinking21-figures-data/main/orings.csv", sep = ",", header = TRUE)

oringDf %>%
  ggplot(aes(x = Temperature, y = DamageIndex)) +
  geom_point() +
  geom_smooth(method = "loess", se = FALSE, span = 1) + ylim(0, 12) +
  geom_vline(xintercept = 27.5, linewidth =8, alpha = 0.3, color = "red") +
  labs(
    y = "Damage Index",
    x = "Temperature at time of launch"
  ) +
  scale_x_continuous(breaks = seq.int(25, 85, 5)) +
  annotate(
    "text",
    angle=90,
    x = 27.5,
    y = 6,
    label = "Forecasted temperature on Jan 28",
    size=5
  )
`geom_smooth()` using formula = 'y ~ x'
_images/04-VisualizingData_3_1.svg

Figure 4.3#

nhanes_sample=NHANES_adult %>% 
  sample_n(200) %>% 
  group_by(Gender) 

dfmean = nhanes_sample %>% 
  summarise(Height=mean(Height)) 

p1 = ggplot(dfmean,aes(x=Gender,y=Height)) + 
  geom_bar(stat="identity",color="black", fill="gray") + 
  coord_cartesian(ylim=c(125,210)) +
  ggtitle('Bar plot') + 
  theme(aspect.ratio=1)  

p2 = ggplot(dfmean,aes(x=Gender,y=Height)) + 
  geom_bar(stat="identity",,color="black", fill="gray") + 
  coord_cartesian(ylim=c(125,210)) +
  ggtitle('Bar plot + beeswarm') + 
  theme(aspect.ratio=1)  +
  geom_beeswarm(data=nhanes_sample,aes(x=Gender,y=Height), alpha=0.5, size=0.5, cex=2) 
  #geom_jitter(data=nhanes_sample,aes(x=Gender,y=Height),width=0.05,alpha=0.2)

p3 = ggplot(nhanes_sample,aes(x=Gender,y=Height)) + 
  geom_violin() + 
  ggtitle('Violin plot') + theme(aspect.ratio=1)

p4 = ggplot(nhanes_sample,aes(x=Gender,y=Height)) + 
  geom_boxplot() +  
  ggtitle('Box plot') + theme(aspect.ratio=1)


plot_grid(p1,p2,p3,p4,nrow=2,labels='AUTO')
_images/04-VisualizingData_5_0.svg

Figure 4.4#

npts=12
df=data.frame(x=seq(1,npts)) %>%
  mutate(yClean=x + rnorm(npts,sd=0.1))
pointSize=2
t=theme(axis.text=element_text(size=10),axis.title=element_text(size=16))
p1=ggplot(df,aes(x,yClean)) +
  geom_smooth(method='lm',se=FALSE) + ylim(-5,20) +
  ylab('Dental health') + xlab('Time spent flossing') + t


p2=ggplot(df,aes(x,yClean)) +
  geom_point(size=pointSize) +
  geom_smooth(method='lm',se=FALSE) + ylim(-5,20) +
  ylab('Dental health') + xlab('Time spent flossing') + t

df = df %>%
  mutate(yDirty=x+ rnorm(npts,sd=10))

p3=ggplot(df,aes(x,yDirty)) +
  geom_point(size=pointSize) +
  geom_smooth(method='lm',se=FALSE)+ ylim(-5,20) +
  ylab('Dental health') + xlab('Time spent flossing') + t

df = df %>%
  mutate(yOutlier=rnorm(npts))
df$yOutlier[npts]=200


p4=ggplot(df,aes(x,yOutlier)) +
  geom_point(size=pointSize) +
  geom_smooth(method='lm',se=FALSE) +
  ylab('Dental health') + xlab('Time spent flossing') + t

plot_grid(p1,p2,p3,p4,nrow=2,labels='AUTO')
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
`geom_smooth()` using formula = 'y ~ x'
Warning message:
“Removed 6 rows containing non-finite values (`stat_smooth()`).”
Warning message:
“Removed 6 rows containing missing values (`geom_point()`).”
`geom_smooth()` using formula = 'y ~ x'
_images/04-VisualizingData_7_6.svg

Figure 4.5#

p1 = ggplot(df,aes(x,yDirty)) +
  geom_point(size=2) +
  geom_smooth(method='lm',se=FALSE)+ ylim(-5,20) +
  ylab('Dental health') + xlab('Time spent flossing')

p2 = ggplot(df,aes(x,yDirty)) +
  geom_point(size=0.5) +
  geom_smooth(method='lm',se=FALSE)+ ylim(-5,20) +
  ylab('Dental health') + xlab('Time spent flossing') +
    theme_dark() +
  theme(panel.grid.major = element_line(colour = "black",linewidth=1)) +
  theme(panel.grid.minor = element_line(colour = "black",linewidth=1))

plot_grid(p1,p2,labels='AUTO',ncol=2)
`geom_smooth()` using formula = 'y ~ x'
Warning message:
“Removed 6 rows containing non-finite values (`stat_smooth()`).”
Warning message:
“Removed 6 rows containing missing values (`geom_point()`).”
`geom_smooth()` using formula = 'y ~ x'
Warning message:
“Removed 6 rows containing non-finite values (`stat_smooth()`).”
Warning message:
“Removed 6 rows containing missing values (`geom_point()`).”
_images/04-VisualizingData_9_6.svg

Figure 4.7#

crimeData=read.table('https://raw.githubusercontent.com/statsthinking21/statsthinking21-figures-data/main/CrimeStatebyState.csv',sep=',',header=TRUE) %>%
  subset(Year > 1989) %>%
  mutate(ViolentCrimePerCapita=Violent.crime.total/Population)

p1 = ggplot(crimeData,aes(Year,ViolentCrimePerCapita)) +
  geom_line() + ylim(-0.05,0.05)

p2 = ggplot(crimeData,aes(Year,ViolentCrimePerCapita)) +
  geom_line()


plot_grid(p1,p2,labels='AUTO')
_images/04-VisualizingData_11_0.svg

Figure 4.8#

bodyTempDf=data.frame(days=c(1:40),temp=rnorm(40)*0.3 + 98.6)
bodyTempDf$temp[21:30]=bodyTempDf$temp[21:30]+3

p1 = ggplot(bodyTempDf,aes(days,temp)) +
  geom_line() +
  ylim(0,105) +
  labs(
    y = "Body temperature",
    x = "Measurement day"
    )
p2 = ggplot(bodyTempDf,aes(days,temp)) +
  geom_line() +
  ylim(95,105) +
  labs(
    y = "Body temperature",
    x = "Measurement day"
    )

plot_grid(p1,p2)
_images/04-VisualizingData_13_0.svg

Figure 4.9#

p1 = ggplot(data.frame(y=c(100,95),x=c('condition 1','condition 2')),aes(x=x,y=y)) +
  geom_col(color="black", fill="gray") +
  ggtitle('lie factor = 1')

p2 = ggplot(data.frame(y=c(100,95),x=c('condition 1','condition 2')),aes(x=x,y=y)) +
  geom_col(color="black", fill="gray") +
  coord_cartesian(ylim=c(92.5,105)) +
  ggtitle('lie factor ~ 2.8')

plot_grid(p1,p2,labels='AUTO')
_images/04-VisualizingData_15_0.svg

Figure 4.10#

exampleDf = data.frame(value=c(3,5,4,6,2,5,8,8),
                       v1=as.factor(c(1,1,2,2,3,3,4,4)),
                       v2=as.factor(c(1,2,1,2,1,2,1,2)))
ggplot(exampleDf,aes(v1,value)) +
  theme_dark() +
  geom_bar(aes(fill = v2), position = "dodge",stat='identity')
_images/04-VisualizingData_17_0.svg

Figure 4.12#

religionData=read.table('https://raw.githubusercontent.com/statsthinking21/statsthinking21-figures-data/main/religion_data.txt',sep='\t')

names(religionData)=c('Religion','Percentage')
religionData = arrange(religionData,desc(Percentage))
religionData$Religion=factor(religionData$Religion,levels=religionData$Religion)
ggplot(religionData,aes(Religion,Percentage,label=Percentage)) +
  geom_bar(stat='identity',color="black", fill="gray") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
_images/04-VisualizingData_19_0.svg

This plot allows the viewer to make comparisons based on the the length of the bars along a common scale (the y-axis). Humans tend to be more accurate when decoding differences based on these perceptual elements than based on area or color.

Figure 4.13#

# Consumer Price Index data obtained from
# https://inflationdata.com/Inflation/Consumer_Price_Index/HistoricalCPI.aspx

# load CPI data
cpiData <- read_tsv('https://raw.githubusercontent.com/statsthinking21/statsthinking21-figures-data/main/cpi_data.txt',
                    col_names=FALSE) %>%
  dplyr::select(X1, X14) %>%
  rename(year = X1,
         meanCPI = X14)

# get reference cpi for 1950 dollars
cpiRef <- cpiData %>%
  filter(year==1950) %>%
  pull(meanCPI)

gasPriceData <- tibble(year=c(1930,1940,1950,1960,1970,1980,1990,2009,2013),
                        Unadjusted=c(.10,.11,.18,.25,.36,1.19,1.34,2.05,3.80))

allData <- left_join(gasPriceData,cpiData,by='year') %>%
  mutate(Adjusted = Unadjusted/(meanCPI/cpiRef)) %>%
  gather(key="Type", value="Price", -meanCPI, -year) %>%
  mutate(Type=as.factor(Type))

ggplot(allData,aes(year,Price,linetype=Type)) +
  geom_line(size=1.25) +
  ylab('Gasoline prices') + xlab('Year') +
  theme(legend.position = c(0.3,0.8))
Rows: 104 Columns: 14
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
dbl (14): X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14
 Use `spec()` to retrieve the full column specification for this data.
 Specify the column types or set `show_col_types = FALSE` to quiet this message.
Warning message:
“Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
 Please use `linewidth` instead.”
_images/04-VisualizingData_22_4.svg