Chapter 16: Multivariate statistics#
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm, t, binom, scoreatpercentile, pearsonr
from scipy.spatial.distance import cdist
import pingouin as pg
import matplotlib
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import scale
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
from scipy.cluster import hierarchy
from sklearn.decomposition import PCA
%matplotlib inline
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
pandas2ri.activate()
from rpy2.robjects.conversion import localconverter
# import NHANES package
base = importr('NHANES')
%load_ext rpy2.ipython
with localconverter(ro.default_converter + pandas2ri.converter):
    NHANES = ro.conversion.rpy2py(ro.r['NHANES'])
NHANES = NHANES.drop_duplicates(subset='ID')
NHANES_adult = NHANES.dropna(subset=['Weight']).query('Age > 17 and BPSysAve > 0')
rng = np.random.default_rng(1234567)
Multivariate data setup#
behavdata = pd.read_csv('https://raw.githubusercontent.com/statsthinking21/statsthinking21-figures-data/main/Eisenberg/meaningful_variables.csv')
behavdata.set_index('subcode', inplace=True)
demoghealthdata = pd.read_csv('https://raw.githubusercontent.com/statsthinking21/statsthinking21-figures-data/main/Eisenberg/demographic_health.csv')
demoghealthdata.set_index('subcode', inplace=True)
sexdict = {1:'Female', 0:'Male'}
demoghealthdata['Sex'] = [sexdict[i] for i in demoghealthdata['Sex']]
alldata = demoghealthdata.join(behavdata, how='inner')
rename_dict = {
    'upps_impulsivity_survey': 'UPPS',
    'sensation_seeking_survey': 'SSS',
    'dickman_survey': 'Dickman',
    'bis11_survey': 'BIS11',
    'spatial_span': 'spatial',
    'digit_span': 'digit',
    'adaptive_n_back': 'nback',
    'dospert_rt_survey': 'dospert',
    'motor_selective_stop_signal.SSRT': 'SSRT_motorsel',
    'stim_selective_stop_signal.SSRT': 'SSRT_stimsel',
    'stop_signal.SSRT_low': 'SSRT_low',
    'stop_signal.SSRT_high': 'SSRT_high'
}
impulsivity_vars = []
for column in alldata.columns:
    for k, v in rename_dict.items():
        if k in column:
            alldata.rename(columns={column: column.replace(k, v)}, inplace=True)
            impulsivity_vars.append(column.replace(k, v))
impulsivity_data = alldata[impulsivity_vars].dropna()
ssrtdata = alldata[[i for i in alldata.columns if 'SSRT_' in i and 'proactive' not in i]].dropna()
uppsdata = alldata[[i for i in alldata.columns if 'UPPS.' in i]].dropna()
for column in uppsdata.columns:
    uppsdata.rename(columns={column: column.replace('UPPS.', '')}, inplace=True)
impdata = ssrtdata.join(uppsdata, how='inner').dropna()
for column in impdata.columns:
    impdata[column] = scale(impdata[column].values)
# map the existing column names onto shorter labels for plotting
var_dict = {'SSRT_motorsel': 'SSRT_motor',
            'SSRT_stimsel': 'SSRT_stim',
            'lack_of_perseverance': 'UPPS_pers',
            'lack_of_premeditation': 'UPPS_premed',
            'negative_urgency': 'UPPS_negurg',
            'positive_urgency': 'UPPS_posurg',
            'sensation_seeking': 'UPPS_senseek'
            }
impdata.rename(columns=var_dict, inplace=True)
Figure 16.1#
g = sns.PairGrid(impdata)
def corrfunc(x, y, hue=None, ax=None, **kws):
    """Plot the correlation coefficient in the top left hand corner of a plot."""
    r, _ = pearsonr(x, y)
    ax = ax or plt.gca()
    ax.annotate(f'r = {r:.2f}', xy=(.2, .5), size=24, xycoords=ax.transAxes)
g.map_upper(corrfunc)
g.map_lower(sns.regplot, color='black')
g.map_diag(sns.kdeplot, legend=False)
plt.show()
Figure 16.2#
plt.figure(figsize=(12,12))
cc = impdata.corr()
sns.heatmap(cc, annot=True)
Figure 16.3#
ccmtx = pd.read_csv('https://raw.githubusercontent.com/statsthinking21/statsthinking21-figures-data/main/myconnectome/ccmtx_sorted.txt', header=None, delimiter=' ').values
parcel_data = pd.read_csv('https://raw.githubusercontent.com/statsthinking21/statsthinking21-figures-data/main/myconnectome/parcel_data.txt', header=None, delimiter='\t')
parcel_data.columns = ['parcelnum', 'hemis', 'X', 'Y', 'Z', 'lobe', 'region', 'network', 'yeo7network', 'yeo17network']
parcel_data = parcel_data.sort_values(by=['hemis', 'yeo7network'])
parcel_data['netdiff'] = False
parcel_data.iloc[1:, -1] = parcel_data.yeo7network[1:].values != parcel_data.yeo7network[:-1].values
hemis_to_use = 'L'
tmp = ccmtx[parcel_data.hemis == hemis_to_use, :]
ccmtx_hemis = tmp[:, parcel_data.hemis == hemis_to_use]
hemis_parcel_data = parcel_data.query(f'hemis == "{hemis_to_use}"')
sns.heatmap(ccmtx_hemis, cmap='bwr', center=0,
yticklabels=False, xticklabels=False)
Figure 16.4#
plt.plot([1, 4], [2, 3], 'k--o')
plt.xlim((0, 5))
plt.ylim((0, 4))
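This simple figure shows two points in a two-dimensional space, connected by a dashed line; a minimal sketch, assuming the figure illustrates the Euclidean distance between the two plotted points (the same distance measure that cdist computes for the clustering below):
# Euclidean distance between the two plotted points, (1, 2) and (4, 3)
p1 = np.array([1, 2])
p2 = np.array([4, 3])
print(np.sqrt(np.sum((p1 - p2) ** 2)))  # sqrt(3**2 + 1**2), approximately 3.162
print(cdist([p1], [p2]))  # cdist gives the same answer: [[3.16227766]]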
Figure 16.5#
## Figure differs slightly from book due to differences in random sampling
countries = pd.read_csv('https://raw.githubusercontent.com/statsthinking21/statsthinking21-figures-data/main/countries/country_data.csv').query('Population2020 > 500000')
latlong = countries[['latitude', 'longitude']]
# based on https://stanford.edu/~cpiech/cs221/handouts/kmeans.html
# need to clarify license!
# and https://jakevdp.github.io/PythonDataScienceHandbook/05.11-k-means.html
# (Code under MIT license)
k = 6
centroids = latlong.sample(k, random_state=rng)
iterations = 0
oldCentroids = None
MAX_ITERATIONS = 100
def shouldStop(oldCentroids, centroids, iterations):
    if iterations > MAX_ITERATIONS:
        return True
    if oldCentroids is None:
        return False
    return (centroids.values == oldCentroids.values).all()
def getLabels(dataSet, centroids):
    # For each element in the dataset, choose the closest centroid
    # and make that centroid the element's label.
    d = cdist(dataSet, centroids)
    return d.argmin(axis=1)
def getCentroids(dataSet, labels, k):
    # Each centroid is the geometric mean of the points that have that
    # centroid's label. If a centroid is empty (no points have that
    # centroid's label), re-initialize it to a random data point.
    newCentroids = []
    for i in range(k):
        labeldata = dataSet.loc[labels == i, :]
        if labeldata.shape[0] == 0:
            labeldata = dataSet.sample(1, random_state=rng)
        newCentroids.append(labeldata.mean(axis=0))
    return pd.DataFrame(newCentroids)
all_centroids = None
while not shouldStop(oldCentroids, centroids, iterations):
    # Save old centroids for the convergence test (bookkeeping)
    oldCentroids = centroids
    iterations = iterations + 1
    # Assign labels to each datapoint based on centroids
    labels = getLabels(latlong, centroids)
    # Assign centroids based on datapoint labels
    centroids = getCentroids(latlong, labels, k)
    centroids_df = centroids.copy()
    centroids_df['iteration'] = iterations
    centroids_df['cluster'] = list(range(centroids.shape[0]))
    if all_centroids is None:
        all_centroids = centroids_df
    else:
        all_centroids = pd.concat((all_centroids, centroids_df))
print(f'Completed after {iterations} iterations')
countries['cluster'] = labels
sns.scatterplot(data=countries, x='longitude', y='latitude', hue='cluster', legend=False)
sns.scatterplot(data=centroids_df, x='longitude', y='latitude', hue='cluster', s=100, palette='colorblind')
for i in range(1, iterations):
    for j in range(k):
        iter_df = all_centroids.query(f'iteration == {i + 1} and cluster == {j}')
        prev_df = all_centroids.query(f'iteration == {i} and cluster == {j}')
        plt.plot([iter_df.longitude, prev_df.longitude], [iter_df.latitude, prev_df.latitude], alpha=0.7)
sns.scatterplot(data=all_centroids.query('iteration == 1'), x='longitude', y='latitude',
                legend=False, marker='*', s=80)
Completed after 11 iterations
Table 16.1#
Comparison of k-means clustering result to actual continents.
pd.crosstab(countries.Continent, countries.cluster).T
| cluster | AF | AS | EU | OC | SA |
|---------|----|----|----|----|----|
| 0       | 0  | 22 | 0  | 0  | 0  |
| 1       | 19 | 0  | 1  | 0  | 0  |
| 2       | 0  | 0  | 0  | 0  | 11 |
| 3       | 0  | 8  | 0  | 6  | 0  |
| 4       | 22 | 0  | 0  | 0  | 0  |
| 5       | 2  | 5  | 35 | 0  | 0  |
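The clusters track the continents fairly closely. Since adjusted_rand_score is already imported, the agreement can also be quantified directly; a minimal sketch (no specific value is claimed here, as none appears in the original):
# adjusted Rand index: 1 for perfect agreement, near 0 for chance-level agreement
print(adjusted_rand_score(countries.Continent, countries.cluster))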
Figure 16.6#
scikit-learn defaults to the “k-means++” initialization scheme, which seeds the centroids to be spread far apart and thus yields more stable clusters across runs than the purely random initialization used by R’s kmeans function.
cluster_results = []
rstate = np.random.RandomState(1234567)
for i in range(10):
    # n_init is set explicitly to keep the current default and avoid a FutureWarning
    kmeans = KMeans(n_clusters=3, n_init=10, random_state=rstate).fit(impdata.T)
    cluster_results.append(kmeans.labels_)
    if i > 0 and adjusted_rand_score(cluster_results[i], cluster_results[i - 1]) == 1:
        cluster_results[i] = cluster_results[i - 1]
results_df = pd.DataFrame(cluster_results, columns=impdata.columns)
sns.heatmap(results_df)
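For contrast, a minimal sketch of the same loop using init='random' with a single restart, which approximates the random initialization in R's kmeans and should yield less consistent labelings across runs (this comparison is an addition, not part of the original analysis):
random_results = []
for i in range(10):
    # purely random initialization, one restart per run
    kmeans = KMeans(n_clusters=3, init='random', n_init=1, random_state=rstate).fit(impdata.T)
    random_results.append(kmeans.labels_)
sns.heatmap(pd.DataFrame(random_results, columns=impdata.columns))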
Figure 16.7#
# Plot the hierarchical clustering as a dendrogram.
temp = hierarchy.linkage(impdata.T, 'average')
dn = hierarchy.dendrogram(
    temp, above_threshold_color="green", color_threshold=.7,
    labels=impdata.columns, orientation='left')
cutoffs = [25, 20, 19]
colors = ['blue', 'green', 'red']
for i, c in enumerate(cutoffs):
    plt.plot([c, c], [0, 100], color=colors[i])
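Each vertical line marks a candidate cutoff; cutting the tree at a given height produces a flat cluster assignment. A minimal sketch using scipy's fcluster, taking the middle cutoff of 20 as an example:
# cut the dendrogram at height 20 to obtain discrete cluster labels
flat_clusters = hierarchy.fcluster(temp, t=20, criterion='distance')
print(dict(zip(impdata.columns, flat_clusters)))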
Figure 16.8#
N = 30  # sample size
mu = [0, 0]  # means of the two variables
c1 = .7  # correlation between the two variables
sigma = np.array([[1, c1], [c1, 1]])  # 2 x 2 covariance matrix
sim_df = pd.DataFrame(scale(rng.multivariate_normal(mu, sigma, size=N)),
columns=['Y', 'X'])
cov_eig_values, cov_eig_vectors = np.linalg.eigh(sim_df.cov())
cov_eig_slopes = np.array([
cov_eig_vectors[0, 0] / cov_eig_vectors[1, 0],
cov_eig_vectors[0, 1] / cov_eig_vectors[1, 1]])
sns.scatterplot(data=sim_df, x='X', y='Y')
plt.plot([0, cov_eig_values[0]], [0, cov_eig_slopes[0] * cov_eig_values[0]], color='red')
plt.plot([0, cov_eig_values[1]], [0, cov_eig_slopes[1] * cov_eig_values[1]], color='green')
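The plotted lines are the eigenvectors of the covariance matrix, scaled by their eigenvalues; these are exactly the principal components of the data. A minimal sketch confirming that scikit-learn's PCA recovers the same quantities (eigh returns eigenvalues in ascending order, hence the reversals; component signs are arbitrary):
pca_check = PCA().fit(sim_df)
# eigenvalues of the covariance matrix, largest first
print(np.allclose(pca_check.explained_variance_, cov_eig_values[::-1]))
# eigenvectors match the principal components up to sign
print(np.allclose(np.abs(pca_check.components_), np.abs(cov_eig_vectors[:, ::-1].T)))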
Figure 16.9#
ssrtdata = impdata[[i for i in impdata.columns if 'SSRT' in i]]
pca_ssrt = PCA()
pca_ssrt.fit(ssrtdata)
pca_ssrt_varacct = pca_ssrt.explained_variance_ratio_
print(pca_ssrt_varacct)
uppsdata = impdata[[i for i in impdata.columns if 'SSRT' not in i]]
pca_upps = PCA()
pca_upps.fit(uppsdata)
pca_upps_varacct = pca_upps.explained_variance_ratio_
sns.lineplot(x=np.arange(1, 5, dtype='int'), y=pca_ssrt_varacct, label='SSRT')
sns.lineplot(x=np.arange(1, 6, dtype='int'), y=pca_upps_varacct, label='UPPS')
plt.ylabel('Variance accounted for')
plt.xlabel('Component')
_ = plt.xticks(np.arange(1, 6, dtype='int'))
[0.60018896 0.16768757 0.14725293 0.08487054]
Correlation test for PCA components#
ssrt_pred = pca_ssrt.transform(ssrtdata)[:, 0]
upps_pred = pca_upps.transform(uppsdata)[:, 0]
pg.corr(ssrt_pred, upps_pred)
|         | n   | r         | CI95%         | p-val    | BF10  | power    |
|---------|-----|-----------|---------------|----------|-------|----------|
| pearson | 329 | -0.014819 | [-0.12, 0.09] | 0.788861 | 0.072 | 0.058214 |
Figure 16.10#
imp_pca = PCA()
imp_pca.fit(impdata)
sns.barplot(x=np.arange(1, 10), y=imp_pca.explained_variance_ratio_, color='blue')
# the bars sit at categorical positions 0-8, so the line uses those coordinates
sns.lineplot(x=np.arange(0, 9), y=imp_pca.explained_variance_ratio_, color='black')
plt.ylabel('Variance accounted for')
plt.xlabel('Component')
Figure 16.11#
loading_df = pd.DataFrame(imp_pca.components_, columns=impdata.columns)
loading_df['Component'] = np.arange(1, 10)
loading_df = loading_df.query('Component < 3')
loading_df_long = pd.melt(loading_df, id_vars=['Component'], var_name='Variable', value_name='loading')
g = sns.FacetGrid(loading_df_long, row="Component", height=4, aspect=2)
g.map_dataframe(sns.barplot, x="Variable", y="loading")
plt.xticks(rotation = 45) # Rotates X-Axis Ticks by 45-degrees
g.add_legend()
Figure 16.12#
N = 200 # sample size
mu = [0] * 3 # means
c1 = .5 # correlation b/w WM and FR
sigma = np.array([[1, c1, 0], [c1, 1, 0], [0, 0, 1]])
latent_df = pd.DataFrame(scale(rng.multivariate_normal(mu, sigma, size=N)),
columns=['WM', 'FR', 'IMP'])
tasknames = ['nback', 'dspan', 'sspan', 'ravens', 'crt', 'UPPS', 'BIS11', 'dickman']
ntasks = len(tasknames)
weights = np.zeros((3, ntasks))
weights[0, 0:3] = 1
weights[1, 3:5] = 1
weights[2, 5:8] = 1
noise_sd = .6
observed_df = pd.DataFrame(
latent_df.values.dot(weights) + rng.multivariate_normal(np.zeros(ntasks), np.eye(ntasks) * noise_sd, size=N),
columns=tasknames)
cormtx = observed_df.corr()
plt.figure(figsize=(8,8))
sns.heatmap(cormtx, annot=True)
Factor analysis output#
import semopy
fa = semopy.efa.explore_cfa_model(observed_df)
print(fa)
m = semopy.Model(fa)
m.fit(observed_df)
semopy.semplot(m, "pd.png", plot_covs=True)
eta1 =~ dspan + sspan + nback
eta2 =~ UPPS + dickman + BIS11
eta3 =~ ravens + crt
The psych package in R provides more powerful factor analysis tools, so we will use it for the following analyses.
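For readers who want to stay in Python, a rough equivalent is sketched below using the factor_analyzer package (an assumption: it is a separate install and is not imported above), though it lacks many of psych's diagnostics:
# hypothetical pure-Python alternative using the factor_analyzer package
from factor_analyzer import FactorAnalyzer
fa_py = FactorAnalyzer(n_factors=3, rotation='oblimin')
fa_py.fit(observed_df)
print(pd.DataFrame(fa_py.loadings_, index=observed_df.columns))  # variables x factors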
%%R -i observed_df
library(psych)
fa_result <- fa(observed_df, nfactors = 3)
summary(fa_result)
Factor analysis with Call: fa(r = observed_df, nfactors = 3)

Test of the hypothesis that 3 factors are sufficient.
The degrees of freedom for the model is 7 and the objective function was 0.02
The number of observations was 200 with Chi Square = 4.09 with prob < 0.77

The root mean square of the residuals (RMSA) is 0.01
The df corrected root mean square of the residuals is 0.02

Tucker Lewis Index of factoring reliability = 1.02
RMSEA index = 0 and the 10 % confidence intervals are 0 0.06
BIC = -33
With factor correlations of
     MR1  MR2  MR3
MR1 1.00 0.12 0.38
MR2 0.12 1.00 0.05
MR3 0.38 0.05 1.00
Figure 16.13#
%%R
fa.diagram(fa_result)
Figure 16.14#
%%R -o BIC_results
library(ggplot2)
BIC_results = data.frame(nfactors=seq(1, 4), SABIC=NA)
for (i in 1:nrow(BIC_results)){
    BIC_results$SABIC[i] = fa(observed_df, nfactors=BIC_results$nfactors[i])$SABIC
}
sns.lineplot(data=BIC_results, x='nfactors', y='SABIC')
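The preferred solution is the number of factors that minimizes the sample-size adjusted BIC (SABIC); a one-line sketch extracting it from the exported data frame:
# pick the factor count with the lowest SABIC
print(int(BIC_results.loc[BIC_results.SABIC.idxmin(), 'nfactors']))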
Figure 16.15#
imp_efa_df = behavdata[[
    'adaptive_n_back.mean_load',
    'bis11_survey.Nonplanning',
    'cognitive_reflection_survey.correct_proportion',
    'dickman_survey.dysfunctional',
    'digit_span.reverse_span',
    'ravens.score',
    'spatial_span.reverse_span',
    'upps_impulsivity_survey.lack_of_premeditation']]
%%R -i imp_efa_df
library(tidyverse)
library(psych)
imp_efa_df <- imp_efa_df %>%
    rename(UPPS = upps_impulsivity_survey.lack_of_premeditation,
           nback = adaptive_n_back.mean_load,
           BIS11 = bis11_survey.Nonplanning,
           dickman = dickman_survey.dysfunctional,
           dspan = digit_span.reverse_span,
           sspan = spatial_span.reverse_span,
           crt = cognitive_reflection_survey.correct_proportion,
           ravens = ravens.score)

BIC_df <- data.frame(nfactors = seq(1, 4), SABIC=NA)
for (i in 1:nrow(BIC_df)){
    fa_result <- fa(imp_efa_df, nfactors=BIC_df$nfactors[i])
    BIC_df$SABIC[i] = fa_result$SABIC
}
fa_result <- fa(imp_efa_df, nfactors=3)
fa.diagram(fa_result)