Chapter 14: The General Linear Model#

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, t, binom, scoreatpercentile
import pingouin as pg
import matplotlib
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold

import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
pandas2ri.activate()
from rpy2.robjects.conversion import localconverter
# import NHANES package
base = importr('NHANES')
%load_ext rpy2.ipython

with localconverter(ro.default_converter + pandas2ri.converter):
  NHANES = ro.conversion.rpy2py(ro.r['NHANES'])

NHANES = NHANES.drop_duplicates(subset='ID')
NHANES_adult = NHANES.dropna(subset=['Weight']).query('Age > 17 and BPSysAve > 0')

rng = np.random.default_rng(1234567)

Figure 14.1#

# Data differ from book example because y values are random samples
betas = np.array([6, 5])

df = pd.DataFrame({'studyTime': np.array([2, 3, 5, 6, 6, 8, 10, 12]) / 3,
                   'priorClass': [0, 1, 1, 0, 1, 0, 1, 0]})
grade_intercept = 70
# grade = 6 * studyTime + 5 * priorClass + noise centered on the intercept
df['grade'] = df.values.dot(betas) + rng.normal(size=8, loc=grade_intercept, scale=5)

fig = plt.figure(figsize=(6,6))
sns.scatterplot(data=df, x='studyTime', y='grade')
plt.xlabel('Study time (hours)')
plt.ylabel('Grade (percent)')
_images/14-GeneralLinearModel_3_1.png

Correlation test#

# compute correlation between grades and study time
corTestResult = pg.corr(df['studyTime'], df['grade'])
corTestResult
         n         r         CI95%     p-val   BF10     power
pearson  8  0.742254  [0.08, 0.95]  0.034959  2.886  0.612544
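
As a quick sanity check (an addition, not in the original notebook), the p-value above can be recovered from the t distribution: the test statistic for a Pearson correlation is t = r*sqrt(n-2)/sqrt(1-r^2), with n-2 degrees of freedom.

# convert the correlation to a t statistic and compute a two-tailed p-value
r, n = corTestResult['r'].iloc[0], corTestResult['n'].iloc[0]
t_stat = r * np.sqrt(n - 2) / np.sqrt(1 - r ** 2)
p_value = 2 * (1 - t.cdf(np.abs(t_stat), df=n - 2))
print(t_stat, p_value)  # should match the p-val reported above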

Figure 14.2#

mod = smf.ols(formula='grade ~ studyTime', data=df)
res = mod.fit()

fig = plt.figure(figsize=(6,6))
plt.xlim([-0.5, 4.5])
sns.regplot(data=df, x='studyTime', y='grade', color='black', 
            line_kws={'color': 'blue'}, ci=None, truncate=False)
plt.xlabel('Study time (hours)')
plt.ylabel('Grade (percent)')
xmax = df.studyTime.max() * 1.1

# dashed segments illustrate the slope: the rise in predicted grade
# for a one-unit increase in study time
plt.plot([2, 3], [res.params.dot(np.array([1, 2])), res.params.dot(np.array([1, 2]))], 'k', linestyle='dashed')
plt.plot([3, 3], [res.params.dot(np.array([1, 2])), res.params.dot(np.array([1, 3]))], 'k', linestyle='dashed')
# dotted line marks the intercept
plt.plot([0, xmax], [res.params['Intercept'], res.params['Intercept']], 'k:')
_images/14-GeneralLinearModel_7_1.png

Table 14.1#

nstudents = 100

readingScores = pd.DataFrame({'test1': rng.normal(100, 10, nstudents),
                              'test2': rng.normal(100, 10, nstudents)})

# find the cutoff for the bottom 25% of scores on the first test
cutoff = scoreatpercentile(readingScores.test1, 25)

# students selected for low test1 scores regress back toward the
# population mean of 100 on the independent second test
readingScores[readingScores.test1 < cutoff].mean()
test1    87.084873
test2    99.249985
dtype: float64
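
A small sketch (an addition, not in the original notebook) shows that this rebound is systematic rather than a one-off fluke: repeating the selection many times, the below-cutoff group's test2 mean stays near the population mean of 100 while its test1 mean remains well below it.

# repeat the selection many times: test2 shows no deficit because the
# two tests are independent draws around the same population mean
def low_scorer_means(n=100):
    scores = pd.DataFrame({'test1': rng.normal(100, 10, n),
                           'test2': rng.normal(100, 10, n)})
    return scores[scores.test1 < scores.test1.quantile(0.25)].mean()

pd.DataFrame([low_scorer_means() for _ in range(100)]).mean()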

Linear model output#

print(res.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  grade   R-squared:                       0.551
Model:                            OLS   Adj. R-squared:                  0.476
Method:                 Least Squares   F-statistic:                     7.361
Date:                Tue, 21 Feb 2023   Prob (F-statistic):             0.0350
Time:                        21:50:28   Log-Likelihood:                -25.157
No. Observations:                   8   AIC:                             54.31
Df Residuals:                       6   BIC:                             54.47
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     75.3225      5.241     14.371      0.000      62.498      88.147
studyTime      5.9019      2.175      2.713      0.035       0.579      11.225
==============================================================================
Omnibus:                        1.336   Durbin-Watson:                   2.668
Prob(Omnibus):                  0.513   Jarque-Bera (JB):                0.897
Skew:                           0.672   Prob(JB):                        0.639
Kurtosis:                       2.060   Cond. No.                         6.30
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Linear regression output for study time and prior class#

mod2 = smf.ols(formula='grade ~ studyTime + priorClass', data=df)
res2 = mod2.fit()
print(res2.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  grade   R-squared:                       0.780
Model:                            OLS   Adj. R-squared:                  0.693
Method:                 Least Squares   F-statistic:                     8.889
Date:                Tue, 21 Feb 2023   Prob (F-statistic):             0.0226
Time:                        21:50:28   Log-Likelihood:                -22.294
No. Observations:                   8   AIC:                             50.59
Df Residuals:                       5   BIC:                             50.83
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     69.9340      4.655     15.024      0.000      57.969      81.899
studyTime      6.5119      1.687      3.860      0.012       2.175      10.849
priorClass     8.1336      3.557      2.287      0.071      -1.010      17.277
==============================================================================
Omnibus:                        1.929   Durbin-Watson:                   2.521
Prob(Omnibus):                  0.381   Jarque-Bera (JB):                0.755
Skew:                          -0.126   Prob(JB):                        0.686
Kurtosis:                       1.517   Cond. No.                         7.91
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
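
To see what the priorClass coefficient means in practice (an illustration added here, not in the original), we can predict grades for two students with the same study time who differ only in prior class; the difference between the predictions equals the priorClass coefficient.

# predicted grades for studyTime = 2, with and without the prior class
pred_grid = pd.DataFrame({'studyTime': [2, 2], 'priorClass': [0, 1]})
pred_grid['predicted_grade'] = res2.predict(pred_grid)
print(pred_grid)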

Figure 14.3#

fig = plt.figure(figsize=(6,6))
xmax = df.studyTime.max() * 1.1
plt.xlim([-0.5, xmax])

sns.scatterplot(data=df, x='studyTime', y='grade', color='black', style='priorClass')
plt.xlabel('Study time (hours)')
plt.ylabel('Grade (percent)')

# solid line: priorClass = 0; dashed line: priorClass = 1
plt.plot([0, xmax], [res2.params.dot(np.array([1, 0, 0])), res2.params.dot(np.array([1, xmax, 0]))], 'k')
plt.plot([0, xmax], [res2.params.dot(np.array([1, 0, 1])), res2.params.dot(np.array([1, xmax, 1]))], 'k', linestyle='dashed')
# dotted segment shows the priorClass effect at studyTime = 2
plt.plot([2, 2], [res2.params.dot(np.array([1, 2, 0])), res2.params.dot(np.array([1, 2, 1]))], 'k', linestyle='dotted')
_images/14-GeneralLinearModel_15_1.png

Figure 14.4#

fig, ax = plt.subplots(2, 2, figsize=(12,12))

df = pd.DataFrame({'group': [-1, 1] * 10,
                   'caffeine': rng.uniform(size=20) * 100})
# the generating model includes a group-by-caffeine interaction:
# caffeine increases speaking for one group and decreases it for the other
df['speaking'] = 0.5 * df.caffeine * -df.group + 20 * df.group + rng.normal(size=20) * 10
df['anxiety'] = ['anxious' if i == 1 else 'notAnxious' for i in df.group]


# perform linear regression with caffeine as independent variable
res_caf = smf.ols(formula='speaking ~ caffeine', data=df).fit()

# compute linear regression adding anxiety to model
res_caf_anx = smf.ols(formula='speaking ~ caffeine + anxiety', data=df).fit()

# compute linear regression including caffeine X anxiety interaction
res_caf_anx_int = smf.ols(formula='speaking ~ caffeine * anxiety', data=df).fit()

sns.scatterplot(data=df, x='caffeine', y='speaking', ax=ax[0][0])

sns.scatterplot(data=df, x='caffeine', y='speaking', hue='anxiety', ax=ax[0][1])

sns.regplot(data=df.query('group == -1'), x='caffeine', y='speaking',  ax=ax[1][0], ci=None, truncate=False)
sns.regplot(data=df.query('group == 1'), x='caffeine', y='speaking',  ax=ax[1][0], ci=None, truncate=False)

ax[1][1].set_visible(False)
_images/14-GeneralLinearModel_17_0.png

Linear model for caffeine#

print(res_caf.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               speaking   R-squared:                       0.051
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                    0.9718
Date:                Tue, 21 Feb 2023   Prob (F-statistic):              0.337
Time:                        21:50:29   Log-Likelihood:                -86.007
No. Observations:                  20   AIC:                             176.0
Df Residuals:                      18   BIC:                             178.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -3.7625      8.994     -0.418      0.681     -22.659      15.134
caffeine       0.1436      0.146      0.986      0.337      -0.162       0.450
==============================================================================
Omnibus:                        0.328   Durbin-Watson:                   2.223
Prob(Omnibus):                  0.849   Jarque-Bera (JB):                0.488
Skew:                          -0.133   Prob(JB):                        0.784
Kurtosis:                       2.283   Cond. No.                         132.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Linear model result for caffeine and anxiety#

print(res_caf_anx.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               speaking   R-squared:                       0.131
Model:                            OLS   Adj. R-squared:                  0.029
Method:                 Least Squares   F-statistic:                     1.283
Date:                Tue, 21 Feb 2023   Prob (F-statistic):              0.303
Time:                        21:50:29   Log-Likelihood:                -85.127
No. Observations:                  20   AIC:                             176.3
Df Residuals:                      17   BIC:                             179.2
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
=========================================================================================
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
Intercept                -5.9466      9.027     -0.659      0.519     -24.992      13.099
anxiety[T.notAnxious]    10.9220      8.732      1.251      0.228      -7.501      29.345
caffeine                  0.0836      0.151      0.552      0.588      -0.236       0.403
==============================================================================
Omnibus:                        2.592   Durbin-Watson:                   2.082
Prob(Omnibus):                  0.274   Jarque-Bera (JB):                1.151
Skew:                          -0.004   Prob(JB):                        0.562
Kurtosis:                       1.825   Cond. No.                         145.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Linear model result for interaction#

print(res_caf_anx_int.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:               speaking   R-squared:                       0.761
Model:                            OLS   Adj. R-squared:                  0.716
Method:                 Least Squares   F-statistic:                     16.99
Date:                Tue, 21 Feb 2023   Prob (F-statistic):           3.14e-05
Time:                        21:50:29   Log-Likelihood:                -72.214
No. Observations:                  20   AIC:                             152.4
Df Residuals:                      16   BIC:                             156.4
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==================================================================================================
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
Intercept                         18.9944      6.209      3.059      0.007       5.833      32.156
anxiety[T.notAnxious]            -46.7564     10.056     -4.650      0.000     -68.073     -25.439
caffeine                          -0.4655      0.118     -3.959      0.001      -0.715      -0.216
caffeine:anxiety[T.notAnxious]     1.0625      0.164      6.496      0.000       0.716       1.409
==============================================================================
Omnibus:                        1.060   Durbin-Watson:                   2.686
Prob(Omnibus):                  0.589   Jarque-Bera (JB):                0.367
Skew:                           0.328   Prob(JB):                        0.833
Kurtosis:                       3.103   Cond. No.                         367.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Analysis of variance result#

print(sm.stats.anova_lm(res_caf_anx, res_caf_anx_int))
   df_resid          ssr  df_diff      ss_diff          F    Pr(>F)
0      17.0  5828.534062      0.0          NaN        NaN       NaN
1      16.0  1602.468696      1.0  4226.065367  42.195549  0.000007
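
As a hand check (an addition to the notebook), the F statistic in this table can be reproduced from the two models' residual sums of squares.

# F = (reduction in SSR / df difference) / (full-model SSR / residual df)
ss_diff = res_caf_anx.ssr - res_caf_anx_int.ssr
df_diff = res_caf_anx.df_resid - res_caf_anx_int.df_resid
F = (ss_diff / df_diff) / (res_caf_anx_int.ssr / res_caf_anx_int.df_resid)
print(F)  # should match the F value in the table above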

Figure 14.5#

qq_df = pd.DataFrame({'norm': rng.normal(size=100),
                      'unif': rng.uniform(size=100)})

fig, ax = plt.subplots(1, 2, figsize=(12,6))

_ = sm.qqplot(qq_df['norm'], line ='45', ax=ax[0])
ax[0].set_title('Normal data')

_ = sm.qqplot(qq_df['unif'], line ='45', ax=ax[1])
ax[1].set_title('Non-normal data')
_images/14-GeneralLinearModel_27_1.png
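
To make explicit what the Q-Q plot computes, here is a minimal sketch (an addition, not from the book) that plots the sorted sample values against standard normal quantiles; sm.qqplot's default plotting positions differ slightly.

plt.figure(figsize=(6, 6))
sorted_vals = np.sort(qq_df['norm'])
probs = (np.arange(1, 101) - 0.5) / 100  # one common choice of plotting positions
plt.scatter(norm.ppf(probs), sorted_vals)
plt.plot([-3, 3], [-3, 3], 'k--')  # identity line: normal data falls near it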

Table 14.2#

NHANES_child = NHANES.query('Age < 18')[['Height', 'Weight', 'TVHrsDayChild', 'HHIncomeMid', 'CompHrsDayChild', 'Age']].dropna()

# create function to sample data and compute regression on in-sample and out-of-sample data

def get_sample_predictions(sample_size=250, shuffle=False):
    # generate a sample from NHANES
    orig_sample = NHANES_child.sample(sample_size)

    # if shuffle is turned on, then randomly shuffle the weight variable
    if shuffle:
        orig_sample.Weight = rng.permuted(orig_sample.Weight)

    # compute the regression line for Weight, as a function of several
    # other variables (with all possible interactions between variables)
    heightRegressOrig = smf.ols(
        formula='Weight ~ Height * TVHrsDayChild * CompHrsDayChild * HHIncomeMid * Age',
        data=orig_sample).fit()

    # compute the predictions
    pred_orig = heightRegressOrig.predict(orig_sample)
    

    # create a new sample from the same population
    new_sample = NHANES_child.sample(sample_size)

    # use the model from the original sample to predict the
    # Weight values for the new sample
    pred_new = heightRegressOrig.predict(new_sample)

    # return r-squared and rmse for original and new data
    # (sklearn metrics expect the (y_true, y_pred) argument order)
    return([
        r2_score(orig_sample.Weight, pred_orig),
        r2_score(new_sample.Weight, pred_new),
        np.sqrt(mean_squared_error(orig_sample.Weight, pred_orig)),
        np.sqrt(mean_squared_error(new_sample.Weight, pred_new))])


# implement the function
nruns = 100
sim_results = pd.DataFrame([get_sample_predictions() for i in range(nruns)],
                           columns=['r2_orig', 'r2_new', 'RMSE_orig', 'RMSE_new'])
sim_results_shuffled = pd.DataFrame([get_sample_predictions(shuffle=True) for i in range(nruns)],
                           columns=['r2_orig', 'r2_new', 'RMSE_orig', 'RMSE_new'])

sim_results_combined = pd.DataFrame(sim_results.mean(), columns=['True data'])

sim_results_combined['Shuffled data'] = (sim_results_shuffled.mean())

sim_results_combined
           True data  Shuffled data
r2_orig     0.499525      -1.817208
r2_new      0.486593      -0.972294
RMSE_orig  20.057573      32.853990
RMSE_new   20.367269      27.822996
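
For contrast (an addition, not from the book), a much simpler model should show a smaller gap between in-sample and out-of-sample fit, since it has less capacity to overfit; the helper simple_model_gap below is hypothetical.

# a simpler model overfits less, so r2_orig and r2_new should be closer
def simple_model_gap(sample_size=250):
    samp = NHANES_child.sample(sample_size)
    fit = smf.ols('Weight ~ Height + Age', data=samp).fit()
    new = NHANES_child.sample(sample_size)
    return [r2_score(samp.Weight, fit.predict(samp)),
            r2_score(new.Weight, fit.predict(new))]

pd.DataFrame([simple_model_gap() for _ in range(100)],
             columns=['r2_orig', 'r2_new']).mean()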

Table 14.3#

# create a function to run cross-validation
# returns the metrics for the out-of-sample prediction

def compute_cv(df, sampleSize=250, nfolds=6):
    kf = KFold(n_splits=nfolds, shuffle=True)
    orig_sample = df.sample(sampleSize)
    fullsample_model = smf.ols(
        formula='Weight ~ Height * TVHrsDayChild * CompHrsDayChild * HHIncomeMid * Age',
        data=orig_sample).fit()
    new_sample = df.sample(sampleSize)
    
    # preallocate a vector to hold the cross-validated predictions
    pred = np.full(orig_sample.shape[0], np.nan)
    for train_index, test_index in kf.split(orig_sample):
        train_df = orig_sample.iloc[train_index, :]
        test_df = orig_sample.iloc[test_index, :]
        train_model = smf.ols(
            formula='Weight ~ Height * TVHrsDayChild * CompHrsDayChild * HHIncomeMid * Age',
            data=train_df).fit()
        pred[test_index] = train_model.predict(test_df)

    results = [
        # full original data (sklearn metrics expect (y_true, y_pred))
        np.sqrt(mean_squared_error(orig_sample.Weight, fullsample_model.predict())),
        r2_score(orig_sample.Weight, fullsample_model.predict()),
        # new data
        np.sqrt(mean_squared_error(new_sample.Weight, fullsample_model.predict(new_sample))),
        r2_score(new_sample.Weight, fullsample_model.predict(new_sample)),
        # cross-validated predictions
        np.sqrt(mean_squared_error(orig_sample.Weight, pred)),
        r2_score(orig_sample.Weight, pred)]
    return(results)

nruns = 1000
sim_results = pd.DataFrame([compute_cv(NHANES_child) for i in range(nruns)],
                           columns=['RMSE_insample', 'R2_insample',
                                    'RMSE_newdata', 'R2_newdata',
                                   'RMSE_CV', 'R2_CV'])
sim_results.mean()
RMSE_insample     19.981154
R2_insample        0.499827
RMSE_newdata      59.366464
R2_newdata         0.485745
RMSE_CV          355.700929
R2_CV              0.448627
dtype: float64
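
Because a handful of folds can produce extreme extrapolation errors under this heavily parameterized interaction model, the mean RMSE_CV may be dominated by outliers; looking at medians (a quick diagnostic added here, not in the book) can give a more representative picture.

# medians are less sensitive to the occasional extreme fold
sim_results[['RMSE_insample', 'RMSE_newdata', 'RMSE_CV']].median()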

Table 14.4#

df = pd.DataFrame({'studyTime': np.array([2, 3, 5, 6, 6, 8, 10, 12]) / 3,
                   'priorClass': [0, 1, 1, 0, 1, 0, 1, 0]})
df['grade'] = df.values.dot(betas) + rng.normal(size=8, loc=grade_intercept, scale=5)


df.values
array([[  0.66666667,   0.        ,  81.40234345],
       [  1.        ,   1.        ,  78.49643294],
       [  1.66666667,   1.        ,  77.8962827 ],
       [  2.        ,   0.        ,  79.66479948],
       [  2.        ,   1.        ,  86.82553731],
       [  2.66666667,   0.        ,  88.18580407],
       [  3.33333333,   1.        , 100.96621238],
       [  4.        ,   0.        ,  99.59806513]])
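
For comparison (an addition, not from the book), statsmodels can construct this kind of design matrix, including an explicit intercept column, with add_constant.

# build a design matrix with an explicit intercept ('const') column
X_full = sm.add_constant(df[['studyTime', 'priorClass']])
X_full.head()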

Table 14.5#

# compute beta estimates using linear algebra

# create Y: the 8 x 1 vector of grades
Y = df.grade

# create X: the 8 x 2 design matrix
X = np.zeros((df.shape[0], 2))

# first column: a constant of 1 for the intercept
X[:, 0] = 1

# second column: mean-centered studyTime values
X[:, 1] = df.studyTime.values - df.studyTime.values.mean()

# solve the normal equations: beta_hat = (X'X)^-1 X'Y
beta_hat = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(Y.values)

# note: because studyTime is centered, the estimated intercept equals the
# mean grade rather than the generating intercept, and the estimates also
# reflect noise and the omission of priorClass from this model
beta_df = pd.DataFrame({'True': [grade_intercept, betas[0]],
                        'Estimated': beta_hat}, index=['intercept', 'studyTime'])

beta_df
              True  Estimated
intercept       70  86.629435
studyTime        6   7.211706
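
Explicitly inverting X'X works here, but a numerically safer route (an addition, not from the book) is np.linalg.lstsq, which solves the least-squares problem directly.

# solve min ||Y - X @ beta||^2 without forming the inverse of X'X
beta_hat_lstsq, _, _, _ = np.linalg.lstsq(X, Y.values, rcond=None)
print(beta_hat_lstsq)  # should match beta_hat above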
# confirm results same as statsmodels

mod2 = sm.OLS(Y, X)
res2 = mod2.fit()
print(res2.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                  grade   R-squared:                       0.778
Model:                            OLS   Adj. R-squared:                  0.741
Method:                 Least Squares   F-statistic:                     20.99
Date:                Tue, 21 Feb 2023   Prob (F-statistic):            0.00376
Time:                        21:52:29   Log-Likelihood:                -22.569
No. Observations:                   8   AIC:                             49.14
Df Residuals:                       6   BIC:                             49.30
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         86.6294      1.659     52.213      0.000      82.570      90.689
x1             7.2117      1.574      4.582      0.004       3.360      11.063
==============================================================================
Omnibus:                        0.461   Durbin-Watson:                   1.686
Prob(Omnibus):                  0.794   Jarque-Bera (JB):                0.458
Skew:                           0.094   Prob(JB):                        0.795
Kurtosis:                       1.843   Cond. No.                         1.05
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.