Chapter 13: Modeling continuous relationships#

library(tidyverse)
library(ggplot2)
library(fivethirtyeight)
library(BayesFactor)
library(bayestestR)
library(cowplot)
library(knitr)
library(DiagrammeR)
library(htmltools)
library(webshot)
theme_set(theme_minimal(base_size = 14))

set.seed(123456) # set random seed to exactly replicate results

# load the NHANES data library
library(NHANES)

# drop duplicated IDs within the NHANES dataset
NHANES <-
  NHANES %>%
  dplyr::distinct(ID,.keep_all=TRUE)

NHANES_adult <-
  NHANES %>%
  drop_na(Weight) %>%
  subset(Age>=18)
── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
 ggplot2 3.4.1      purrr   1.0.1
 tibble  3.1.8      dplyr   1.1.0
 tidyr   1.3.0      stringr 1.5.0
 readr   2.1.4      forcats 1.0.0
── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
 dplyr::filter() masks stats::filter()
 dplyr::lag()    masks stats::lag()
Some larger datasets need to be installed separately, like senators and
house_district_forecast. To install these, we recommend you install the
fivethirtyeightdata package by running:
install.packages('fivethirtyeightdata', repos =
'https://fivethirtyeightdata.github.io/drat/', type = 'source')
Loading required package: coda
Loading required package: Matrix
Attaching package: ‘Matrix’
The following objects are masked from ‘package:tidyr’:

    expand, pack, unpack
************
Welcome to BayesFactor 0.9.12-4.4. If you have questions, please contact Richard Morey (richarddmorey@gmail.com).

Type BFManual() to open the manual.
************

Figure 13.1#

hateCrimes <-
  hate_crimes %>%
  mutate(state_abb = state.abb[match(state,state.name)]) %>%
  drop_na(avg_hatecrimes_per_100k_fbi)

hateCrimes$state_abb[hateCrimes$state=="District of Columbia"]='DC'

ggplot(hateCrimes,aes(gini_index,avg_hatecrimes_per_100k_fbi,label=state_abb)) +
  geom_point() +
  geom_text(aes(label=state_abb),hjust=0, vjust=0) +
  theme(plot.title = element_text(size = 20, face = "bold")) +
  xlab('Gini index') +
  ylab('Avg hate crimes per 100K population (FBI)') +
  theme(plot.margin = unit(c(1,1,1,1), "cm")) +
  xlim(0.4, 0.55)
_images/13-ContinuousRelationships_3_0.svg

Table 13.1#

# create data for toy example of covariance
df <-
  tibble(x = c(3, 5, 8, 10, 12)) %>%
  mutate(y = x + round(rnorm(n = 5, mean = 0, sd = 2))) %>%
  mutate(
    y_dev = y - mean(y),
    x_dev = x - mean(x)
  ) %>%
  mutate(crossproduct = y_dev * x_dev)

covXY <- sum(df$crossproduct) / (nrow(df) - 1)
corXY <- sum(df$crossproduct) / ((nrow(df) - 1) * sd(df$x) * sd(df$y))

kable(df, caption='Data for toy example of covariance')
Table: Data for toy example of covariance

|  x|  y| y_dev| x_dev| crossproduct|
|--:|--:|-----:|-----:|------------:|
|  3|  5|  -3.6|  -4.6|        16.56|
|  5|  4|  -4.6|  -2.6|        11.96|
|  8|  7|  -1.6|   0.4|        -0.64|
| 10| 10|   1.4|   2.4|         3.36|
| 12| 17|   8.4|   4.4|        36.96|

Hypothesis testing for correlations#

# perform correlation test on hate crime data
cor.test(
  hateCrimes$avg_hatecrimes_per_100k_fbi,
  hateCrimes$gini_index
)
	Pearson's product-moment correlation

data:  hateCrimes$avg_hatecrimes_per_100k_fbi and hateCrimes$gini_index
t = 3.2182, df = 48, p-value = 0.002314
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.1619097 0.6261922
sample estimates:
      cor 
0.4212719 

Figure 13.2#

corGiniHC <-
  cor(
    hateCrimes$gini_index,
    hateCrimes$avg_hatecrimes_per_100k_fbi
  )

# compute null distribution by shuffling order of variable values

# create a function to compute the correlation on the shuffled values
shuffleCorr <- function(x, y) {
  xShuffled <- sample(x)
  return(cor(xShuffled, y))
}

# run this function 2500 times
shuffleDist <-
  replicate(
    2500,
    shuffleCorr(hateCrimes$avg_hatecrimes_per_100k_fbi, hateCrimes$gini_index)
  )

ggplot(data.frame(shuffleDist),aes(shuffleDist)) +
  geom_histogram(bins=100) +
  geom_vline(xintercept = corGiniHC,color='blue') +
  ggtitle(sprintf('p(shuffled r >= observed) = %0.3f',mean(shuffleDist>=corGiniHC))) +
  theme(plot.title = element_text(size = 16, face = "bold")) +
  theme(plot.margin = unit(c(0,1,0,0), "cm")) +
  labs(
  x = "Correlation coeffcients of shuffled variables"
  )
_images/13-ContinuousRelationships_9_0.svg

Figure 13.3#

n <- 10
set.seed(1234)

dfOutlier <-
  data.frame(x = rnorm(n)) %>%
  mutate(y = x * -1)

dfOutlier$x[1] <- 10
dfOutlier$y[1] <- 10
cc <- cor(dfOutlier$x, dfOutlier$y)
ccSpearman <- cor(dfOutlier$x, dfOutlier$y, method = "spearman")

p <- ggplot(dfOutlier, aes(x, y)) +
  geom_point() +
  ggtitle(sprintf("r = %0.2f (without outlier: r = %.2f)", cc, cor(dfOutlier$x[2:n], dfOutlier$y[2:n]))) +
  theme(plot.title = element_text(size = 16, face = "bold")) +
  theme(plot.margin = unit(c(0, 1, 0, 0), "cm")) +
  labs(
    x = "variable x",
    y = "variable y"
  )
print(p)
_images/13-ContinuousRelationships_11_0.svg

Spearman correlation test#

corTestSpearman <- cor.test( hateCrimes$avg_hatecrimes_per_100k_fbi,
  hateCrimes$gini_index,
  method = "spearman")
corTestSpearman
Warning message in cor.test.default(hateCrimes$avg_hatecrimes_per_100k_fbi, hateCrimes$gini_index, :
“Cannot compute exact p-value with ties”
	Spearman's rank correlation rho

data:  hateCrimes$avg_hatecrimes_per_100k_fbi and hateCrimes$gini_index
S = 20146, p-value = 0.8221
alternative hypothesis: true rho is not equal to 0
sample estimates:
       rho 
0.03261836 

Figure 13.6#

# function to generate a plot of Lorenz curve and compute Gini coefficient
lorenzCurve = function(df){
  df <- df %>% arrange(income)
  sumIncome <- sum(df$income)
  lc <- array(NA,nrow(df)+1)
  p <- array(NA,nrow(df)+1)
  lc[1] <- 0
  p[1] <- 0
  for (i in 1:nrow(df)){
    lc[i+1] <- sum(df$income[1:i])/sumIncome
    p[i+1] <- i/nrow(df)
  }
  S <- sum(lc)
  giniCoef <- 1 + (1-2*S)/nrow(df)
  return(list(p=p,lc=lc,gc=giniCoef))
}

incomeDf <- data.frame(income=rep(40000,10))
lc <- lorenzCurve(incomeDf)

incomeDf <- data.frame(income=rnorm(10,mean=40000,sd=5000))
lc2 <- lorenzCurve(incomeDf)

incomeDf <- data.frame(income=rep(40000,10))
incomeDf$income[1] <- 40000000
lc3 <- lorenzCurve(incomeDf)

p1 <- ggplot(data.frame(p=lc$p,lc=lc$lc),aes(p,lc)) +
    geom_line(color='blue') +
    geom_point() +
    xlim(0,1) + ylim(0,1) +
    xlab('Cumulative proportion of population') +
    ylab('Cumulative proportion of income') +
    geom_abline(slope=1,intercept = 0,color='black',linetype='dotted') +
    ggtitle(sprintf('A: Gini coefficient = %f',lc$gc))

p2 <- ggplot(data.frame(p=lc2$p,lc=lc2$lc),aes(p,lc)) +
    geom_line(color='blue') +
    geom_point() +
    xlim(0,1) + ylim(0,1) +
    xlab('Cumulative proportion of population') +
    ylab('Cumulative proportion of income') +
    geom_abline(slope=1,intercept = 0,color='black',linetype='dotted') +
    ggtitle(sprintf('B: Gini coefficient = %f',lc2$gc))

p3 <- ggplot(data.frame(p=lc3$p,lc=lc3$lc),aes(p,lc)) +
    geom_line(color='blue') +
    geom_point() +
    xlim(0,1) + ylim(0,1) +
    xlab('Cumulative proportion of population') +
    ylab('Cumulative proportion of income') +
    geom_abline(slope=1,intercept = 0,color='black',linetype='dotted') +
    ggtitle(sprintf('C: Gini coefficient = %f',lc3$gc))

plot_grid(p1,p2,p3,ncol=2)
_images/13-ContinuousRelationships_15_0.svg

Bayesian correlation analysis#

bayesCor <- correlationBF(
  hateCrimes$avg_hatecrimes_per_100k_fbi,
  hateCrimes$gini_index
)
print(bayesCor)
bayesCorPosterior <- describe_posterior(bayesCor)
print(bayesCorPosterior)
Bayes factor analysis
--------------
[1] Alt., r=0.333 : 20.85446 ±0%

Against denominator:
  Null, rho = 0 
---
Bayes factor type: BFcorrelation, Jeffreys-beta*
Summary of Posterior Distribution

Parameter | Median |       95% CI |     pd |          ROPE | % in ROPE |    BF |         Prior
----------------------------------------------------------------------------------------------
rho       |   0.38 | [0.13, 0.58] | 99.88% | [-0.05, 0.05] |        0% | 20.85 | Beta (3 +- 3)