Chapter 5: Fitting models to data#

import pandas as pd
import sidetable
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from scipy.interpolate import interp1d
import geopandas

import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
pandas2ri.activate()
from rpy2.robjects.conversion import localconverter
# import NHANES package
base = importr('NHANES')

with localconverter(ro.default_converter + pandas2ri.converter):
  NHANES = ro.conversion.rpy2py(ro.r['NHANES'])

 
NHANES = NHANES.drop_duplicates(subset='ID')
NHANES['isChild'] = NHANES.Age < 18
NHANES_adult = NHANES.dropna(subset=['Height']).query('Age > 17')
NHANES_child = NHANES.dropna(subset=['Height']).query('Age < 18')

rng = np.random.RandomState(12345)

Figure 5.1#

sns.histplot(NHANES_child.Height, bins=100)

<Axes: xlabel='Height', ylabel='Count'>

Figure 5.2#

error_mean = NHANES_child.Height - NHANES_child.Height.mean()
error_mode = NHANES_child.Height - NHANES_child.Height.mode().values[0]

sns.histplot(error_mean, bins=100)
plt.xlabel('Error when predicting height with mean')

Text(0.5, 0, 'Error when predicting height with mean')

Figure 5.3#

fig, ax = plt.subplots(2, 2, figsize=(12,12))

NHANES_child['Age_jittered'] = NHANES_child.Age + rng.randn(NHANES_child.shape[0]) * 0.1

sns.regplot(data=NHANES_child, x='Age', y='Height', ax=ax[0][0], 
            x_jitter=0.2, scatter_kws={'s':2}, fit_reg=False)
ax[0][0].set_title('A: Original data')

X = NHANES_child.Age.values[:, np.newaxis]
Y = NHANES_child.Height.values[:, np.newaxis]

slope = np.linalg.inv(X.T.dot(X)).dot(X.T.dot(Y))
sns.regplot(data=NHANES_child, x='Age', y='Height', ax=ax[0][1], 
            x_jitter=0.2, scatter_kws={'s':2}, fit_reg=False)
ax[0][1].set_title('B: Age')
ax[0][1].plot([0, NHANES_child.Age.max()*1.05], [0, NHANES_child.Age.max()*1.05*slope])

sns.regplot(data=NHANES_child, x='Age', y='Height', ax=ax[1][0], 
            x_jitter=0.2, scatter_kws={'s':2}, fit_reg=True)
ax[1][0].set_title('C: Age + constant')

sns.scatterplot(data=NHANES_child, x='Age_jittered', y='Height',  hue='Gender', 
                size=2, ax=ax[1][1])
ax[1][1].set_title('C: Age + constant + gender')
h,l = ax[1][1].get_legend_handles_labels()
ax[1][1].legend(h[0:2],l[0:2])

lr = LinearRegression()
xmin = 0
xmax = 18

for gender in ['female', 'male']:
    data = NHANES_child.query(f'Gender == "{gender}"')
    lr.fit(data.Age[:, np.newaxis], data.Height)
    ax[1][1].plot([xmin, xmax], [xmin * lr.coef_[0] + lr.intercept_, xmax * lr.coef_[0] + lr.intercept_, ])
    
for i in range(2):
    for j in range(2):
        ax[i][j].set_ylabel('Height')
        ax[i][j].set_xlabel('Age')

/opt/conda/lib/python3.10/site-packages/numpy/core/shape_base.py:65: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray.
  ary = asanyarray(ary)
/tmp/ipykernel_430/1526773379.py:34: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.
  lr.fit(data.Age[:, np.newaxis], data.Height)
/tmp/ipykernel_430/1526773379.py:34: FutureWarning: Support for multi-dimensional indexing (e.g. `obj[:, None]`) is deprecated and will be removed in a future version.  Convert to a numpy array before indexing instead.
  lr.fit(data.Age[:, np.newaxis], data.Height)

Figure 5.4#

rmse = [['mode', np.sqrt(np.mean(error_mode ** 2))],
        ['mean', np.sqrt(np.mean(error_mean ** 2))]]

lr = LinearRegression()
lr.fit(NHANES_child.Age.values[:, np.newaxis], NHANES_child.Height)
pred = lr.predict(NHANES_child.Age.values[:, np.newaxis])
error_age = NHANES_child.Height - pred
rmse.append(['constant+age', np.sqrt(np.mean(error_age ** 2))])

NHANES_child['Gender_num'] = (NHANES_child.Gender == 'female').astype(int)

lr.fit(NHANES_child[['Age', 'Gender_num']].values, NHANES_child.Height)
pred = lr.predict(NHANES_child[['Age', 'Gender_num']].values)
error_age_gender = NHANES_child.Height - pred
rmse.append(['constant+age+gender', np.sqrt(np.mean(error_age_gender ** 2))])

rmse_df = pd.DataFrame(rmse, columns=['model', 'RMSE']).sort_values(by='RMSE')
sns.barplot(data=rmse_df, x='RMSE', y='model')

<Axes: xlabel='RMSE', ylabel='model'>

Figure 5.5#

npts = 100
dataDf = pd.DataFrame({'BAC': rng.uniform(size=npts) * 0.3})
dataDf['ReactionTime'] = dataDf.BAC + 1 + rng.random(npts) * .01
dataDf['ReactionTimeNoisy'] = dataDf.BAC + 2 + rng.random(npts) * 0.2

fig, ax = plt.subplots(2, 2, figsize=(12,12))

sns.regplot(data=dataDf, x='BAC', y='ReactionTime', ax=ax[0][0], ci=None)

sns.regplot(data=dataDf, x='BAC', y='ReactionTimeNoisy', ax=ax[0][1], ci=None)
ax[0][1].set_ylabel('ReactionTime')


dataDf = pd.DataFrame({'caffeineLevel': rng.uniform(size=npts) * 10})
dataDf['caffeineLevelInvertedU'] = (dataDf.caffeineLevel - dataDf.caffeineLevel.mean()) ** 2
dataDf['testPerformance'] = -1 * dataDf.caffeineLevelInvertedU + rng.randn(100) * 0.5

sns.regplot(data=dataDf, x='caffeineLevel', y='testPerformance', ax=ax[1][0], ci=None)

ax[1][1].remove()

plt.tight_layout()

Figure 5.6#

sampleSize = 16
rng = np.random.RandomState(123456)

simData = pd.DataFrame({'X': rng.randn(sampleSize)}).sort_values(by='X')
simData['Y'] = simData['X'] + rng.randn(sampleSize)
simData['Ynew'] = simData['X'] + rng.randn(sampleSize)

Xmin = simData.X.min()
Xmax = simData.X.max()

simpleModel = LinearRegression()
simpleModel.fit(simData.X.values[:, np.newaxis], simData.Y)
simData['simplePred'] = simpleModel.predict(simData.X.values[:, np.newaxis])
simple_rmse = mean_squared_error(simData['simplePred'], simData.Y)
simple_rmse_new = mean_squared_error(simData['simplePred'], simData.Ynew)

complexModel = LinearRegression()
X_complex = PolynomialFeatures(degree=8).fit_transform(simData.X.values[:, np.newaxis])
complexModel.fit(X_complex, simData.Y)
simData['complexPred'] = complexModel.predict(X_complex)
complex_rmse = mean_squared_error(simData['complexPred'], simData.Y, squared=False)
complex_rmse_new = mean_squared_error(simData['complexPred'], simData.Ynew, squared=False)

fig, ax = plt.subplots(1, 2, figsize=(12,6))
sns.scatterplot(data=simData, x='X', y='Y', ax=ax[0])
ax[0].plot(simData.X, simData.simplePred)
f_cubic = interp1d(simData.X, simData.complexPred, kind='cubic')
xnew = np.linspace(Xmin, Xmax, num=100, endpoint=True)
ax[0].plot(xnew, f_cubic(xnew), color='r')
ax[0].set_ylim((-3, 3))
ax[0].annotate(f'RMSE = {simple_rmse:0.2}', xy=[-2, 2.5], color='blue', size=14)
ax[0].annotate(f'RMSE = {complex_rmse:0.2}', xy=[-2, 2], color='red', size=14)
ax[0].set_title('Original data')

sns.scatterplot(data=simData, x='X', y='Ynew', ax=ax[1])
ax[1].plot(simData.X, simData.simplePred)
ax[1].plot(xnew, f_cubic(xnew), color='r')
ax[1].set_ylim((-3, 3))
ax[1].annotate(f'RMSE = {simple_rmse_new:0.2}', xy=[-2, 2.5], color='blue', size=14)
ax[1].annotate(f'RMSE = {complex_rmse_new:0.2}', xy=[-2, 2], color='red', size=14)
ax[1].set_title('New data')

Text(0.5, 1.0, 'New data')

Figure 5.7#

df_error = pd.DataFrame({'val': np.arange(100, 175, 0.05), 'SSE': None})

for i, v in enumerate(df_error.val):
    err = NHANES_child.Height - v
    df_error.loc[i, 'SSE'] = np.sum(err ** 2)
    
plt.plot(df_error.val, df_error.SSE, color='k')
plt.plot([NHANES_child.Height.mean(), NHANES_child.Height.mean()], [1e6, 4e6], color='blue')
plt.annotate('mean', xy=[140, 3.75e6], color='blue')

Text(140, 3750000.0, 'mean')

Table 5.1#

incomeDf = pd.DataFrame({'income': [48000, 64000, 58000, 72000, 66000],
                         'person': ["Joe", "Karen", "Mark", "Andrea", "Pat"]})
incomeDf

	income	person
0	48000	Joe
1	64000	Karen
2	58000	Mark
3	72000	Andrea
4	66000	Pat

Table 5.2#

incomeDf.loc[5, :] = [54000000, "Beyonce"]
incomeDf

	income	person
0	48000.0	Joe
1	64000.0	Karen
2	58000.0	Mark
3	72000.0	Andrea
4	66000.0	Pat
5	54000000.0	Beyonce

Table 5.3#

population_variance = NHANES_child.Height.var(ddof=0)

sampsize = 50
nsamp = 10000

varhat_n = []
varhat_nm1 = []

for i in range(nsamp):
    samp = NHANES_child.sample(sampsize)
    sampmean = samp.Height.mean()
    sse = np.sum((samp.Height - sampmean) ** 2)
    varhat_n.append(sse / sampsize)
    varhat_nm1.append(sse / (sampsize - 1))
    

summary_df = pd.DataFrame(
    {'Estimate': ["Population variance",
                  "Variance estimate using n",
                  "Variance estimate using n-1"],
     'Value': [population_variance,
               np.mean(varhat_n),
               np.mean(varhat_nm1)]})
summary_df

	Estimate	Value
0	Population variance	724.238191
1	Variance estimate using n	708.565125
2	Variance estimate using n-1	723.025638

Figure 5.8#

crimeData = pd.read_csv("https://raw.githubusercontent.com/statsthinking21/statsthinking21-figures-data/main//CrimeOneYearofData_clean.csv", index_col=0)

for state in ['District of Columbia', 'Hawaii', 'Alaska']:
    crimeData = crimeData.query(f"State != '{state}'")

caCrimeData = crimeData.query('State == "California"')

fig, ax = plt.subplots(1, 2, figsize=(12,6))

sns.histplot(crimeData['Violent crime total'], ax=ax[0])

# based on: https://melaniesoek0120.medium.com/data-visualization-how-to-plot-a-map-with-geopandas-in-python-73b10dcd4b4b

# load the shape file using geopandas
geo_usa = geopandas.read_file('cb_2018_us_state_20m/cb_2018_us_state_20m.shp')

# merge usa_state data and geo_usa shapefile
crimeData['NAME'] = crimeData.State
geo_merge=geo_usa.merge(crimeData,on='NAME')# plot USA map 
geo_merge.plot(column='Violent crime total', 
               figsize=(25, 15),
               legend=True,cmap='coolwarm', ax=ax[1])
plt.tight_layout()

Figure 5.9#

fig, ax = plt.subplots(1, 2, figsize=(12,6))

sns.scatterplot(data=crimeData, x='Population', y='Violent crime total', 
                ax=ax[0])
ax[0].annotate('CA', [3.6e7, 152000], color='red', size=12)
ax[0].set_ylabel("Number of violent crimes in 2014")

sns.histplot(crimeData['Violent Crime rate'], binwidth=80, color='gray', ax=ax[1])
plt.plot([caCrimeData['Violent Crime rate'].values[0], caCrimeData['Violent Crime rate'].values[0]],
         [0, 15], color='blue')
ax[1].set_xlabel("Rate of violent crimes per 100,000 people")
ax[1].annotate('CA', [405, 14.5], color='blue', size=12)
plt.tight_layout()

Figure 5.10#

from sklearn.preprocessing import scale

crimeData['ViolentCrimeRateZscore'] = scale(crimeData['Violent Crime rate'])
sns.scatterplot(data=crimeData, x='Violent Crime rate', y='ViolentCrimeRateZscore')

<Axes: xlabel='Violent Crime rate', ylabel='ViolentCrimeRateZscore'>

Figure 5.11#

# merge usa_state data and geo_usa shapefile
geo_merge=geo_usa.merge(crimeData,on='NAME')# plot USA map 
geo_merge.plot(column='ViolentCrimeRateZscore', 
               figsize=(25, 15),
               legend=True,cmap='coolwarm')
plt.axis('off')

(-127.6146362, -64.0610978, 23.253819649999997, 50.628669349999996)

Figure 5.12#

fig, ax = plt.subplots(2, 2, figsize=(12,12))


def plot_zscore(sd, ax, zmean=0, zsd=1, zmin=-4, zmax=4):
    dist_df = pd.DataFrame({'Z score': np.arange(zmin, zmax, .01)})
    dist_df['density'] = norm.pdf(dist_df['Z score'].values, loc=zmean, scale=zsd)
    area = (norm.cdf(sd) - norm.cdf(-sd)) * 100
    tail_area = (100 - area) / 2
    
    ax.plot(dist_df['Z score'], dist_df.density, color='red')
    ax.fill_between(x=dist_df['Z score'], y1=dist_df.density, color='green')
    ax.fill_between(x=dist_df['Z score'], y1=dist_df.density, color='orange',
                         where=np.logical_and(dist_df['Z score'].values > -sd, dist_df['Z score'].values < sd))
    ax.annotate(f'{area:.1f}%', [zmean - 0.3, 0.5 * norm.pdf(zmean, loc=zmean, scale=zsd)])
    ax.annotate(f'{tail_area:.1f}%', [-sd - .5, 1.1 * norm.pdf(-sd, loc=zmean, scale=zsd)])
    ax.annotate(f'{tail_area:.1f}%', [sd - .4, 1.1 * norm.pdf(sd, loc=zmean, scale=zsd)])
    ax.set_xlabel('Z score')
    ax.set_ylabel('Density')

plot_zscore(1, ax[0][0])
plot_zscore(2, ax[0][1])

def plot_cdf(sd, ax, zmean=0, zsd=1, zmin=-4, zmax=4):
    dist_df = pd.DataFrame({'Z score': np.arange(zmin, zmax, .01)})
    dist_df['cdf'] = norm.cdf(dist_df['Z score'].values, loc=zmean, scale=zsd)
    ax.plot(dist_df['Z score'], dist_df.cdf, color='black')
    ax.plot([zmin, -sd], [norm.cdf(-sd, loc=zmean, scale=zsd), norm.cdf(-sd, loc=zmean, scale=zsd)], 
            color='k', linestyle='dashed')
    ax.plot([-sd, -sd], [0, norm.cdf(-sd, loc=zmean, scale=zsd)], 
            color='k', linestyle='dashed')
    ax.plot([zmin, sd], [norm.cdf(sd, loc=zmean, scale=zsd), norm.cdf(sd, loc=zmean, scale=zsd)], 
            color='k', linestyle='dashed')
    ax.plot([sd, sd], [0, norm.cdf(sd, loc=zmean, scale=zsd)], 
            color='k', linestyle='dashed')
    ax.set_xlabel('Z score')
    ax.set_ylabel('Cumulative Density')
  
plot_cdf(1, ax[1][0])
plot_cdf(2, ax[1][1])

Figure 5.13#

crimeData['ViolentCrimeRateStdScore'] = crimeData.ViolentCrimeRateZscore * 10 + 100

caCrimeData = crimeData.query('State == "California"')

sns.histplot(crimeData['ViolentCrimeRateStdScore'], binwidth=5, color='gray')
plt.plot([caCrimeData['ViolentCrimeRateStdScore'].values[0], caCrimeData['ViolentCrimeRateStdScore'].values[0]],
         [0, 10], color='blue')
plt.xlabel("Standardized rate of violent crimes")
plt.annotate('CA', [104.364111, 9.5], color='blue', size=12)
plt.tight_layout()

Figure 5.14#

crimeData['PropertyCrimeRateZscore'] = scale(crimeData['Property crime rate'])
caCrimeData = crimeData.query('State == "California"')

fig, ax = plt.subplots(1, 2, figsize=(12,6))

sns.scatterplot(data=crimeData, x='Violent Crime rate', y='Property crime rate', ax=ax[0], color='black')
ax[0].scatter([caCrimeData['Violent Crime rate']], [caCrimeData['Property crime rate']], 
                 color='blue', s=100)
ax[0].annotate('California', [caCrimeData['Violent Crime rate'] + 20, 
                                 caCrimeData['Property crime rate'] - 50], color='blue')
ax[0].set_xlabel('Violent crime rate (per 100,000)')
ax[0].set_ylabel('Property crime rate (per 100,000)')


sns.scatterplot(data=crimeData, x='ViolentCrimeRateZscore', y='PropertyCrimeRateZscore', ax=ax[1], color='black')
ax[1].scatter([caCrimeData['ViolentCrimeRateZscore']], [caCrimeData['PropertyCrimeRateZscore']], 
                 color='blue', s=100)
ax[1].annotate('California', [caCrimeData['ViolentCrimeRateZscore'] + .1, 
                                 caCrimeData['PropertyCrimeRateZscore'] - .1], color='blue')
ax[1].set_xlabel('Z-scored rate of violent crimes')
ax[1].set_ylabel('Z-scored rate of property crimes')

Text(0, 0.5, 'Z-scored rate of property crimes')

Figure 5.15#

fig, ax = plt.subplots(1, 2, figsize=(12,6))

crimeData['Violence Difference'] =  crimeData.ViolentCrimeRateZscore - crimeData.PropertyCrimeRateZscore

sns.scatterplot(data=crimeData, x='ViolentCrimeRateZscore', y='PropertyCrimeRateZscore', 
                ax=ax[0], color='black', size=crimeData.Population, sizes=(10, 100))
ax[0].scatter([caCrimeData['ViolentCrimeRateZscore']], [caCrimeData['PropertyCrimeRateZscore']], 
                 color='blue')
ax[0].annotate('California', [caCrimeData['ViolentCrimeRateZscore'] + .1, 
                                 caCrimeData['PropertyCrimeRateZscore'] - .1], color='blue')
ax[0].set_xlabel('Z-scored rate of violent crimes')
ax[0].set_ylabel('Z-scored rate of property crimes')

sns.scatterplot(data=crimeData, x='Population', y='Violence Difference', ax=ax[1])

<Axes: xlabel='Population', ylabel='Violence Difference'>

Code and figures for Statistical Thinking: Analyzing Data in an Uncertain World

Chapter 5: Fitting models to data

Contents

Chapter 5: Fitting models to data#

Figure 5.1#

Figure 5.2#

Figure 5.3#

Figure 5.4#

Figure 5.5#

Figure 5.6#

Figure 5.7#

Table 5.1#

Table 5.2#

Table 5.3#

Figure 5.8#

Figure 5.9#

Figure 5.10#

Figure 5.11#

Figure 5.12#

Figure 5.13#

Figure 5.14#

Figure 5.15#