Chapter 12: Modeling categorical relationships#

library(tidyverse)
library(ggplot2)
library(kableExtra)
library(BayesFactor)
library(sfsmisc)
library(cowplot)
theme_set(theme_minimal(base_size = 14))

library(knitr)

set.seed(123456) # set random seed to exactly replicate results

# load the NHANES data library
library(NHANES)

# drop duplicated IDs within the NHANES dataset
NHANES <-
  NHANES %>%
  dplyr::distinct(ID,.keep_all=TRUE)

NHANES_adult <-
  NHANES %>%
  drop_na(Weight) %>%
  subset(Age>=18)
── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
 ggplot2 3.4.1      purrr   1.0.1
 tibble  3.1.8      dplyr   1.1.0
 tidyr   1.3.0      stringr 1.5.0
 readr   2.1.4      forcats 1.0.0
── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
 dplyr::filter() masks stats::filter()
 dplyr::lag()    masks stats::lag()
Attaching package: ‘kableExtra’
The following object is masked from ‘package:dplyr’:

    group_rows
Loading required package: coda
Loading required package: Matrix
Attaching package: ‘Matrix’
The following objects are masked from ‘package:tidyr’:

    expand, pack, unpack
************
Welcome to BayesFactor 0.9.12-4.4. If you have questions, please contact Richard Morey (richarddmorey@gmail.com).

Type BFManual() to open the manual.
************
Attaching package: ‘sfsmisc’
The following object is masked from ‘package:dplyr’:

    last

Table 12.1#

candyDf <-
  tibble(
    `Candy Type` = c("chocolate", "licorice", "gumball"),
    count = c(30, 33, 37)
  )
# kable(candyDf, caption='Counts of different candies in our bag.')

# compute chi-squared statistic

candyDf <- candyDf %>%
  mutate(nullExpectation =c(1 / 3, 1 / 3, 1 / 3) * sum(candyDf$count),
         `squared difference`=(count - nullExpectation)**2)
candyDf
# kable(candyDf, digits=3, caption='Observed counts, expectations under the null hypothesis, and squared differences in the candy data')
A tibble: 3 × 4
Candy TypecountnullExpectationsquared difference
<chr><dbl><dbl><dbl>
chocolate3033.3333311.1111111
licorice 3333.33333 0.1111111
gumball 3733.3333313.4444444

Figure 12.1#

chisqVal <-
  sum(
    ((candyDf$count - candyDf$nullExpectation)**2) / candyDf$nullExpectation
  )

xvals <- seq(0.01, 20, 0.01)
dfvals <- c(1, 2, 4, 8)
chisqDf <-
  data.frame(xvals, dfvals) %>%
  complete(xvals, dfvals)
chisqDf <-
  chisqDf %>%
  mutate(chisq = dchisq(x = xvals, df = dfvals),
         dfvals= as.factor(dfvals)) %>%
  group_by(dfvals) %>%
  mutate(chisqNorm = chisq / max(chisq),
         Df=dfvals
         )


p1 <- ggplot(chisqDf, aes(xvals, chisqNorm, group = Df, linetype = Df)) +
  geom_line() +
  theme(legend.position = c(0.8, 0.7)) +
  labs(
    fill = "Degrees of freedom",
    color = "Degrees of freedom",
    x = "Chi-squared values"
  ) + ylab('Density')


# simulate 50,000 sums of 8 standard normal random variables and compare
# to theoretical chi-squared distribution

# create a matrix with 50k columns of 8 rows of squared normal random variables
d <- replicate(50000, rnorm(n = 8, mean = 0, sd = 1)**2)
# sum each column of 8 variables
dMean <- apply(d, 2, sum)

# create a data frame of the theoretical chi-square distribution
# with 8 degrees of freedom
csDf <-
  data.frame(x = seq(0.01, 30, 0.01)) %>%
  mutate(chisq = dchisq(x, 8))

pval <- pchisq(chisqVal, df = 2, lower.tail = FALSE) #df = degrees of freedom

p2 <- ggplot(data.frame(dMean),aes(dMean)) +
  geom_histogram(aes(y=after_stat(density)),bins=100, fill='gray') +
  geom_line(data=csDf,aes(x,chisq),linetype='dotted',linewidth=1.5)+
  xlim(0,30) + ylim(0,.12) +
  labs(
    y = "Density",
    x = "Sum of squared random normal variables"
  )

plot_grid(p1, p2)
Warning message:
“Removed 11 rows containing non-finite values (`stat_bin()`).”
Warning message:
“Removed 2 rows containing missing values (`geom_bar()`).”
_images/12-CategoricalRelationships_5_2.svg

Table 12.2#

# load police stop data after preprocessing using code/process_CT_data.py
stopData <-
  read_csv("https://raw.githubusercontent.com/statsthinking21/statsthinking21-figures-data/main/CT_data_cleaned.csv",show_col_types = FALSE) %>%
  rename(searched = search_conducted)

# compute and print two-way contingency table
summaryDf2way <-
  stopData %>%
  count(searched, driver_race) %>%
  arrange(driver_race, searched)

summaryContingencyTable <-
  summaryDf2way %>%
  spread(driver_race, n)

# Compute and print contingency table using proportions
# rather than raw frequencies
summaryContingencyTable <-
  summaryContingencyTable %>%
  mutate(
    `Black (relative)` = Black / nrow(stopData), #count of Black individuals searched / total searched
  `White (relative)` = White / nrow(stopData)
  )

summaryContingencyTable


# kable(summaryContingencyTable, digits=3, caption='Contingency table for police search data')
A tibble: 2 × 5
searchedBlackWhiteBlack (relative)White (relative)
<lgl><int><int><dbl><dbl>
FALSE362442392410.1295298270.85500622
TRUE 1219 31080.0043564970.01110746

Chi-squared test result#

# first, compute the marginal probabilities

# probability of being each race
summaryDfRace <-
  stopData %>%
  count(driver_race) %>% #count the number of drivers of each race
  mutate(
    prop = n / sum(n) #compute the proportion of each race out of all drivers
  )

# probability of being searched
summaryDfStop <-
  stopData %>%
  count(searched) %>% #count the number of searched vs. not searched
  mutate(
    prop = n / sum(n) # compute proportion of each outcome out all traffic stops
  )

# We can use a linear algebra trick known as the "outer product"
# (via the `outer()` function) to compute this easily.
# second, multiply outer product by n (all stops) to compute expected frequencies
expected <- outer(summaryDfRace$prop, summaryDfStop$prop) * nrow(stopData)

# create a data frame of expected frequencies for each race
expectedDf <-
  data.frame(expected, driverRace = c("Black", "White")) %>%
  rename(
    NotSearched = X1,
    Searched = X2
  )

# tidy the data frame
expectedDfTidy <-
  gather(expectedDf, searched, n, -driverRace) %>%
  arrange(driverRace, searched)

# third, add expected frequencies to the original summary table
# and fourth, compute the standardized squared difference between
# the observed and expected frequences

summaryDf2way <-
  summaryDf2way %>%
  mutate(expected = expectedDfTidy$n)

summaryDf2way <-
  summaryDf2way %>%
  mutate(stdSqDiff = (n - expected)**2 / expected)

chisq <- sum(summaryDf2way$stdSqDiff)
pval <- pchisq(chisq, df = 1, lower.tail = FALSE)

#kable(summaryDf2way, digits=2,caption='Summary of the 2-way contingency table for police search data')

# first need to rearrange the data into a 2x2 table
summaryDf2wayTable <-
  summaryDf2way %>%
  dplyr::select(-expected, -stdSqDiff) %>%
  spread(searched, n) %>%
  dplyr::select(-driver_race)

chisqTestResult <- chisq.test(summaryDf2wayTable, 1, correct = FALSE)
chisqTestResult
	Pearson's Chi-squared test

data:  summaryDf2wayTable
X-squared = 828.3, df = 1, p-value < 2.2e-16

Table 12.4#

# compute standardized residuals
summaryDfResids <-
  summaryDf2way %>%
  mutate(`Standardized residuals` = (n - expected)/sqrt(expected)) %>%
  dplyr::select(-n, -expected, -stdSqDiff)

summaryDfResids
A tibble: 4 × 3
searcheddriver_raceStandardized residuals
<lgl><chr><dbl>
FALSEBlack -3.330746
TRUEBlack 26.576456
FALSEWhite 1.309550
TRUEWhite-10.449072

Bayes factor#

# compute Bayes factor
# using independent multinomial sampling plan in which row totals (driver race)
# are fixed

bf <-
  contingencyTableBF(as.matrix(summaryDf2wayTable),
  sampleType = "indepMulti",
  fixedMargin = "cols"
)
bf
Bayes factor analysis
--------------
[1] Non-indep. (a=1) : 1.753219e+142 ±0%

Against denominator:
  Null, independence, a = 1 
---
Bayes factor type: BFcontingencyTable, independent multinomial

Table 12.5#

# summarize depression as a function of sleep trouble
depressedSleepTrouble <-
  NHANES_adult %>%
  drop_na(SleepTrouble, Depressed) %>%
  count(SleepTrouble, Depressed) %>%
  arrange(SleepTrouble, Depressed)

depressedSleepTroubleTable <-
  depressedSleepTrouble %>%
  spread(SleepTrouble, n) %>%
  rename(
    NoSleepTrouble = No,
    YesSleepTrouble = Yes
  )

depressedSleepTroubleTable

#kable(depressedSleepTroubleTable, caption="Relationship between depression and sleep problems in the NHANES dataset")
A tibble: 3 × 3
DepressedNoSleepTroubleYesSleepTrouble
<fct><int><int>
None 2614676
Several 418249
Most 138145

Chi-squared result#

# need to remove the column with the label names
depressedSleepTroubleTable <-
  depressedSleepTroubleTable %>%
  dplyr::select(-Depressed)

depressedSleepChisq <- chisq.test(depressedSleepTroubleTable)
depressedSleepChisq
	Pearson's Chi-squared test

data:  depressedSleepTroubleTable
X-squared = 191.46, df = 2, p-value < 2.2e-16

Bayes factor#

# compute bayes factor, using a joint multinomial sampling plan
bf <-
  contingencyTableBF(
    as.matrix(depressedSleepTroubleTable),
    sampleType = "jointMulti"
  )
bf
Bayes factor analysis
--------------
[1] Non-indep. (a=1) : 1.796345e+35 ±0%

Against denominator:
  Null, independence, a = 1 
---
Bayes factor type: BFcontingencyTable, joint multinomial