Chapter 12: Modeling categorical relationships#
import pandas as pd
import sidetable
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, t, binom, chi2
import pingouin as pg
import matplotlib
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
pandas2ri.activate()
from rpy2.robjects.conversion import localconverter
%load_ext rpy2.ipython
# import NHANES package
base = importr('NHANES')
with localconverter(ro.default_converter + pandas2ri.converter):
    NHANES = ro.conversion.rpy2py(ro.r['NHANES'])
NHANES = NHANES.drop_duplicates(subset='ID')
NHANES_adult = NHANES.dropna(subset=['Weight']).query('Age > 17 and BPSysAve > 0')
rng = np.random.default_rng(123456)
Table 12.1#
candyDf = pd.DataFrame({'Candy Type': ["chocolate", "licorice", "gumball"],
                        'Count': [30, 33, 37]})
candyDf['nullExpectation'] = [candyDf.Count.sum() / 3] * 3
candyDf['squaredDifference'] = (candyDf.Count - candyDf.nullExpectation) ** 2
candyDf
|   | Candy Type | Count | nullExpectation | squaredDifference |
|---|------------|-------|-----------------|-------------------|
| 0 | chocolate  | 30    | 33.333333       | 11.111111         |
| 1 | licorice   | 33    | 33.333333       | 0.111111          |
| 2 | gumball    | 37    | 33.333333       | 13.444444         |
Figure 12.1#
chisqVal = np.sum(candyDf.squaredDifference / candyDf.nullExpectation)
chisqVal
0.7399999999999999
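This is simply the standard chi-squared statistic applied to the counts in Table 12.1:

$$
\chi^2 = \sum_i \frac{(O_i - E_i)^2}{E_i} = \frac{11.111 + 0.111 + 13.444}{33.333} \approx 0.74
$$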
x = np.arange(0.01, 20, .01)
dfvals = [1, 2, 4, 8]
x_df = [(i, j) for i in x for j in dfvals]
chisqDf = pd.DataFrame(x_df, columns = ['x', 'df'])
chisqDf['chisq'] = chi2.pdf(chisqDf.x, chisqDf.df)
# rescale each density to a maximum of 1 so the curve shapes are comparable
for df in dfvals:
    chisqDf.loc[chisqDf.df == df, 'chisq'] = chisqDf[chisqDf.df == df]['chisq'] / chisqDf[chisqDf.df == df]['chisq'].max()
fig, ax = plt.subplots(1, 2, figsize=(12,6))
sns.lineplot(data=chisqDf, x='x', y='chisq', style='df', ax=ax[0])
ax[0].set_xlabel("Chi-squared values")
ax[0].set_ylabel('Density')
# simulate 50,000 sums of 8 squared standard normal random variables and
# compare to the theoretical chi-squared distribution:
# generate a 50,000 x 8 matrix of squared normal variates, then sum each row
dSum = (rng.normal(size=(50000, 8)) ** 2).sum(axis=1)
sns.histplot(dSum, ax=ax[1], stat='density')
ax[1].set_ylabel("Density")
ax[1].set_xlabel("Sum of squared random normal variables")
csDf = pd.DataFrame({'x': np.arange(0.01, 30, 0.01)})
csDf['chisq'] = chi2.pdf(csDf.x, 8)
_ = sns.lineplot(data=csDf, x='x', y='chisq', ax=ax[1], color='black')
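As a quick sanity check (a minimal sketch that reuses the dSum array and the imports from the cells above), the empirical quantiles of the simulated sums should closely match the theoretical chi-squared quantiles with 8 degrees of freedom:

# compare empirical quantiles of the 50,000 simulated sums to the
# theoretical chi-squared(8) quantiles; they should agree closely
for q in [0.25, 0.50, 0.75, 0.95]:
    print(f"q = {q}: simulated = {np.quantile(dSum, q):.2f}, "
          f"theoretical = {chi2.ppf(q, 8):.2f}")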
Table 12.2#
stopData = pd.read_csv('https://raw.githubusercontent.com/statsthinking21/statsthinking21-figures-data/main/CT_data_cleaned.csv')
table = pd.crosstab(stopData['driver_race'], stopData['search_conducted'], margins=True)
print(table)
n = stopData.shape[0]
print('')
print('Normalized:')
table_normalized = pd.crosstab(stopData['driver_race'], stopData['search_conducted'], margins=True, normalize='all')
print(table_normalized)
search_conducted False True All
driver_race
Black 36244 1219 37463
White 239241 3108 242349
All 275485 4327 279812
Normalized:
search_conducted False True All
driver_race
Black 0.129530 0.004356 0.133886
White 0.855006 0.011107 0.866114
All 0.984536 0.015464 1.000000
Chi-squared test result#
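Under the null hypothesis of independence, each cell's expected frequency is the total sample size times the product of the corresponding row and column marginal proportions, which is what the np.outer call below computes:

$$
E_{ij} = N \, p_{i\cdot} \, p_{\cdot j}
$$

For example, the expected number of searched Black drivers is $279812 \times 0.133886 \times 0.015464 \approx 579$.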
expected = np.outer(table_normalized.values[:2, 2], table_normalized.values[2, :2]) * n
actual = pd.crosstab(stopData['driver_race'], stopData['search_conducted'], margins=False)
diff = expected - actual
stdSqDiff = diff ** 2 / expected
chisq = stdSqDiff.sum().sum()
# the p-value is the upper-tail probability, so use the survival function
# (chi2.sf) rather than the density (chi2.pdf)
pval = chi2.sf(chisq, 1)
pval
1.9001204194035138e-182
_, _, stats = pg.chi2_independence(stopData, 'driver_race', 'search_conducted')
stats[stats.test=='pearson']
|   | test    | lambda | chi2       | dof | pval          | cramer   | power |
|---|---------|--------|------------|-----|---------------|----------|-------|
| 0 | pearson | 1.0    | 827.005515 | 1.0 | 7.255988e-182 | 0.054365 | 1.0   |
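The cramer column follows directly from the chi-squared statistic: for an $r \times c$ table, Cramér's $V = \sqrt{\chi^2 / (N \cdot \min(r-1,\, c-1))}$, so here $V = \sqrt{827.0055 / 279812} \approx 0.054$, a small effect size despite the vanishingly small p-value. (The chi-squared value differs slightly from the manual computation above because pingouin applies a continuity correction by default when there is a single degree of freedom.)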
Table 12.3#
# standardized residuals; diff = expected - observed, so negative values
# indicate more observations than expected under independence
summaryDfResids = diff / np.sqrt(expected)
summaryDfResids
| driver_race | search_conducted = False | search_conducted = True |
|-------------|--------------------------|-------------------------|
| Black       | 3.330746                 | -26.576456              |
| White       | -1.309550                | 10.449072               |
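Because diff was computed as expected minus observed, these standardized residuals are

$$
r_{ij} = \frac{E_{ij} - O_{ij}}{\sqrt{E_{ij}}}
$$

so the large negative value for searched Black drivers indicates many more searches than independence would predict. The more common convention, $(O_{ij} - E_{ij}) / \sqrt{E_{ij}}$, simply flips the signs.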
Bayes factor#
%%R -i actual
# compute Bayes factor using an independent multinomial sampling plan,
# in which the totals along one margin are treated as fixed
# (fixedMargin = "cols" fixes the column totals)
library(BayesFactor)
bf <- contingencyTableBF(
  as.matrix(actual),
  sampleType = "indepMulti",
  fixedMargin = "cols"
)
bf
R[write to console]: Loading required package: coda
R[write to console]: Loading required package: Matrix
R[write to console]: ************
Welcome to BayesFactor 0.9.12-4.4. If you have questions, please contact Richard Morey (richarddmorey@gmail.com).
Type BFManual() to open the manual.
************
Bayes factor analysis
--------------
[1] Non-indep. (a=1) : 1.753219e+142 ±0%

Against denominator:
  Null, independence, a = 1
---
Bayes factor type: BFcontingencyTable, independent multinomial
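The Bayes factor is reported relative to the denominator model (independence), so the data are roughly $1.75 \times 10^{142}$ times more likely under the non-independence model; on the natural-log scale this is $\ln(\mathrm{BF}) \approx 327.5$.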
Table 12.4#
NHANES_sleep = NHANES.query('Age > 17').dropna(subset=['SleepTrouble', 'Depressed'])
depressedSleepTrouble = pd.crosstab(NHANES_sleep.Depressed, NHANES_sleep.SleepTrouble)
depressedSleepTrouble
| Depressed | SleepTrouble = No | SleepTrouble = Yes |
|-----------|-------------------|--------------------|
| None      | 2631              | 678                |
| Several   | 423               | 254                |
| Most      | 140               | 146                |
Chi-squared result#
_, _, stats = pg.chi2_independence(NHANES_sleep, 'SleepTrouble', 'Depressed')
stats[stats.test=='pearson']
|   | test    | lambda | chi2       | dof | pval         | cramer   | power |
|---|---------|--------|------------|-----|--------------|----------|-------|
| 0 | pearson | 1.0    | 194.653393 | 2.0 | 5.389552e-43 | 0.213459 | 1.0   |
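The two degrees of freedom follow from the table's dimensions: for an $r \times c$ contingency table, $\mathrm{dof} = (r - 1)(c - 1) = (3 - 1)(2 - 1) = 2$.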
Bayes factor#
%%R -i depressedSleepTrouble
# compute Bayes factor using a joint multinomial sampling plan,
# in which only the total sample size is treated as fixed
bf <- contingencyTableBF(
  as.matrix(depressedSleepTrouble),
  sampleType = "jointMulti"
)
bf
Bayes factor analysis
--------------
[1] Non-indep. (a=1) : 8.541445e+35 ±0%

Against denominator:
  Null, independence, a = 1
---
Bayes factor type: BFcontingencyTable, joint multinomial
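Here the joint multinomial sampling plan treats only the total sample size as fixed, in contrast to the independent multinomial plan used above, which also fixes the totals along one margin; once again the Bayes factor provides overwhelming evidence against independence.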