Chapter 12: Modeling categorical relationships
Contents
Chapter 12: Modeling categorical relationships#
library(tidyverse)
library(ggplot2)
library(kableExtra)
library(BayesFactor)
library(sfsmisc)
library(cowplot)
theme_set(theme_minimal(base_size = 14))
library(knitr)
set.seed(123456) # set random seed to exactly replicate results
# load the NHANES data library
library(NHANES)
# drop duplicated IDs within the NHANES dataset
NHANES <-
  NHANES %>%
  dplyr::distinct(ID,.keep_all=TRUE)
NHANES_adult <-
  NHANES %>%
  drop_na(Weight) %>%
  subset(Age>=18)
── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.1     ✔ purrr   1.0.1
✔ tibble  3.1.8     ✔ dplyr   1.1.0
✔ tidyr   1.3.0     ✔ stringr 1.5.0
✔ readr   2.1.4     ✔ forcats 1.0.0
── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
Attaching package: ‘kableExtra’
The following object is masked from ‘package:dplyr’:
    group_rows
Loading required package: coda
Loading required package: Matrix
Attaching package: ‘Matrix’
The following objects are masked from ‘package:tidyr’:
    expand, pack, unpack
************
Welcome to BayesFactor 0.9.12-4.4. If you have questions, please contact Richard Morey (richarddmorey@gmail.com).
Type BFManual() to open the manual.
************
Attaching package: ‘sfsmisc’
The following object is masked from ‘package:dplyr’:
    last
Table 12.1#
candyDf <-
  tibble(
    `Candy Type` = c("chocolate", "licorice", "gumball"),
    count = c(30, 33, 37)
  )
# kable(candyDf, caption='Counts of different candies in our bag.')
# compute chi-squared statistic
candyDf <- candyDf %>%
  mutate(nullExpectation =c(1 / 3, 1 / 3, 1 / 3) * sum(candyDf$count),
         `squared difference`=(count - nullExpectation)**2)
candyDf
# kable(candyDf, digits=3, caption='Observed counts, expectations under the null hypothesis, and squared differences in the candy data')
| Candy Type | count | nullExpectation | squared difference | 
|---|---|---|---|
| <chr> | <dbl> | <dbl> | <dbl> | 
| chocolate | 30 | 33.33333 | 11.1111111 | 
| licorice | 33 | 33.33333 | 0.1111111 | 
| gumball | 37 | 33.33333 | 13.4444444 | 
Figure 12.1#
chisqVal <-
  sum(
    ((candyDf$count - candyDf$nullExpectation)**2) / candyDf$nullExpectation
  )
xvals <- seq(0.01, 20, 0.01)
dfvals <- c(1, 2, 4, 8)
chisqDf <-
  data.frame(xvals, dfvals) %>%
  complete(xvals, dfvals)
chisqDf <-
  chisqDf %>%
  mutate(chisq = dchisq(x = xvals, df = dfvals),
         dfvals= as.factor(dfvals)) %>%
  group_by(dfvals) %>%
  mutate(chisqNorm = chisq / max(chisq),
         Df=dfvals
         )
p1 <- ggplot(chisqDf, aes(xvals, chisqNorm, group = Df, linetype = Df)) +
  geom_line() +
  theme(legend.position = c(0.8, 0.7)) +
  labs(
    fill = "Degrees of freedom",
    color = "Degrees of freedom",
    x = "Chi-squared values"
  ) + ylab('Density')
# simulate 50,000 sums of 8 standard normal random variables and compare
# to theoretical chi-squared distribution
# create a matrix with 50k columns of 8 rows of squared normal random variables
d <- replicate(50000, rnorm(n = 8, mean = 0, sd = 1)**2)
# sum each column of 8 variables
dMean <- apply(d, 2, sum)
# create a data frame of the theoretical chi-square distribution
# with 8 degrees of freedom
csDf <-
  data.frame(x = seq(0.01, 30, 0.01)) %>%
  mutate(chisq = dchisq(x, 8))
pval <- pchisq(chisqVal, df = 2, lower.tail = FALSE) #df = degrees of freedom
p2 <- ggplot(data.frame(dMean),aes(dMean)) +
  geom_histogram(aes(y=after_stat(density)),bins=100, fill='gray') +
  geom_line(data=csDf,aes(x,chisq),linetype='dotted',linewidth=1.5)+
  xlim(0,30) + ylim(0,.12) +
  labs(
    y = "Density",
    x = "Sum of squared random normal variables"
  )
plot_grid(p1, p2)
Warning message:
“Removed 11 rows containing non-finite values (`stat_bin()`).”
Warning message:
“Removed 2 rows containing missing values (`geom_bar()`).”
Table 12.2#
# load police stop data after preprocessing using code/process_CT_data.py
stopData <-
  read_csv("https://raw.githubusercontent.com/statsthinking21/statsthinking21-figures-data/main/CT_data_cleaned.csv",show_col_types = FALSE) %>%
  rename(searched = search_conducted)
# compute and print two-way contingency table
summaryDf2way <-
  stopData %>%
  count(searched, driver_race) %>%
  arrange(driver_race, searched)
summaryContingencyTable <-
  summaryDf2way %>%
  spread(driver_race, n)
# Compute and print contingency table using proportions
# rather than raw frequencies
summaryContingencyTable <-
  summaryContingencyTable %>%
  mutate(
    `Black (relative)` = Black / nrow(stopData), #count of Black individuals searched / total searched
  `White (relative)` = White / nrow(stopData)
  )
summaryContingencyTable
# kable(summaryContingencyTable, digits=3, caption='Contingency table for police search data')
| searched | Black | White | Black (relative) | White (relative) | 
|---|---|---|---|---|
| <lgl> | <int> | <int> | <dbl> | <dbl> | 
| FALSE | 36244 | 239241 | 0.129529827 | 0.85500622 | 
| TRUE | 1219 | 3108 | 0.004356497 | 0.01110746 | 
Chi-squared test result#
# first, compute the marginal probabilities
# probability of being each race
summaryDfRace <-
  stopData %>%
  count(driver_race) %>% #count the number of drivers of each race
  mutate(
    prop = n / sum(n) #compute the proportion of each race out of all drivers
  )
# probability of being searched
summaryDfStop <-
  stopData %>%
  count(searched) %>% #count the number of searched vs. not searched
  mutate(
    prop = n / sum(n) # compute proportion of each outcome out all traffic stops
  )
# We can use a linear algebra trick known as the "outer product"
# (via the `outer()` function) to compute this easily.
# second, multiply outer product by n (all stops) to compute expected frequencies
expected <- outer(summaryDfRace$prop, summaryDfStop$prop) * nrow(stopData)
# create a data frame of expected frequencies for each race
expectedDf <-
  data.frame(expected, driverRace = c("Black", "White")) %>%
  rename(
    NotSearched = X1,
    Searched = X2
  )
# tidy the data frame
expectedDfTidy <-
  gather(expectedDf, searched, n, -driverRace) %>%
  arrange(driverRace, searched)
# third, add expected frequencies to the original summary table
# and fourth, compute the standardized squared difference between
# the observed and expected frequences
summaryDf2way <-
  summaryDf2way %>%
  mutate(expected = expectedDfTidy$n)
summaryDf2way <-
  summaryDf2way %>%
  mutate(stdSqDiff = (n - expected)**2 / expected)
chisq <- sum(summaryDf2way$stdSqDiff)
pval <- pchisq(chisq, df = 1, lower.tail = FALSE)
#kable(summaryDf2way, digits=2,caption='Summary of the 2-way contingency table for police search data')
# first need to rearrange the data into a 2x2 table
summaryDf2wayTable <-
  summaryDf2way %>%
  dplyr::select(-expected, -stdSqDiff) %>%
  spread(searched, n) %>%
  dplyr::select(-driver_race)
chisqTestResult <- chisq.test(summaryDf2wayTable, 1, correct = FALSE)
chisqTestResult
	Pearson's Chi-squared test
data:  summaryDf2wayTable
X-squared = 828.3, df = 1, p-value < 2.2e-16
Table 12.4#
# compute standardized residuals
summaryDfResids <-
  summaryDf2way %>%
  mutate(`Standardized residuals` = (n - expected)/sqrt(expected)) %>%
  dplyr::select(-n, -expected, -stdSqDiff)
summaryDfResids
| searched | driver_race | Standardized residuals | 
|---|---|---|
| <lgl> | <chr> | <dbl> | 
| FALSE | Black | -3.330746 | 
| TRUE | Black | 26.576456 | 
| FALSE | White | 1.309550 | 
| TRUE | White | -10.449072 | 
Bayes factor#
# compute Bayes factor
# using independent multinomial sampling plan in which row totals (driver race)
# are fixed
bf <-
  contingencyTableBF(as.matrix(summaryDf2wayTable),
  sampleType = "indepMulti",
  fixedMargin = "cols"
)
bf
Bayes factor analysis
--------------
[1] Non-indep. (a=1) : 1.753219e+142 ±0%
Against denominator:
  Null, independence, a = 1 
---
Bayes factor type: BFcontingencyTable, independent multinomial
Table 12.5#
# summarize depression as a function of sleep trouble
depressedSleepTrouble <-
  NHANES_adult %>%
  drop_na(SleepTrouble, Depressed) %>%
  count(SleepTrouble, Depressed) %>%
  arrange(SleepTrouble, Depressed)
depressedSleepTroubleTable <-
  depressedSleepTrouble %>%
  spread(SleepTrouble, n) %>%
  rename(
    NoSleepTrouble = No,
    YesSleepTrouble = Yes
  )
depressedSleepTroubleTable
#kable(depressedSleepTroubleTable, caption="Relationship between depression and sleep problems in the NHANES dataset")
| Depressed | NoSleepTrouble | YesSleepTrouble | 
|---|---|---|
| <fct> | <int> | <int> | 
| None | 2614 | 676 | 
| Several | 418 | 249 | 
| Most | 138 | 145 | 
Chi-squared result#
# need to remove the column with the label names
depressedSleepTroubleTable <-
  depressedSleepTroubleTable %>%
  dplyr::select(-Depressed)
depressedSleepChisq <- chisq.test(depressedSleepTroubleTable)
depressedSleepChisq
	Pearson's Chi-squared test
data:  depressedSleepTroubleTable
X-squared = 191.46, df = 2, p-value < 2.2e-16
Bayes factor#
# compute bayes factor, using a joint multinomial sampling plan
bf <-
  contingencyTableBF(
    as.matrix(depressedSleepTroubleTable),
    sampleType = "jointMulti"
  )
bf
Bayes factor analysis
--------------
[1] Non-indep. (a=1) : 1.796345e+35 ±0%
Against denominator:
  Null, independence, a = 1 
---
Bayes factor type: BFcontingencyTable, joint multinomial