Chapter 7: Sampling#

import pandas as pd
import sidetable
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm

import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
pandas2ri.activate()
from rpy2.robjects.conversion import localconverter
# import NHANES package
base = importr('NHANES')

with localconverter(ro.default_converter + pandas2ri.converter):
  NHANES = ro.conversion.rpy2py(ro.r['NHANES'])

 
NHANES = NHANES.drop_duplicates(subset='ID')
NHANES_adult = NHANES.dropna(subset=['Height']).query('Age > 17')

rng = np.random.RandomState(12345)

Table 7.1#

NHANES_height = NHANES_adult.dropna(subset=['Height'])

sample_stats = []
sampsize = 50

for i in range(5):
    samp = NHANES_adult.sample(sampsize, random_state=rng)['Height']
    sample_stats.append([samp.mean(), samp.std()])

sample_stats_df = pd.DataFrame(sample_stats, columns=['mean', 'sd'])

sample_stats_df
mean sd
0 166.790 9.942985
1 168.344 9.248589
2 170.096 9.579143
3 169.114 8.705499
4 169.002 10.450593

Figure 7.1#

sampSize = 50 # size of sample
nsamps = 5000 # number of samples we will take

sampMeans = []
for i in range(nsamps):
    samp = NHANES_adult.sample(sampsize, random_state=rng)['Height']
    sampMeans.append([samp.mean()])

sampMeansDf = pd.DataFrame(sampMeans, columns=['sampleMean'])
sns.histplot(NHANES_adult.Height, color='gray', alpha=0.1, stat='density', bins=100)
sns.histplot(sampMeansDf.sampleMean, stat='density', bins=100)
plt.plot([NHANES_adult.Height.mean(), NHANES_adult.Height.mean()], [0, 0.3], color='black')
plt.annotate('Population mean', [NHANES_adult.Height.mean() + 2, 0.3])
Text(170.3497077244259, 0.3, 'Population mean')
_images/07-Sampling_6_1.png

Figure 7.2#

NHANES_clean = NHANES_adult.query('AlcoholYear >= 0').dropna(subset=['AlcoholYear'])

fig, ax = plt.subplots(1, 2, figsize=(12,6))

sns.histplot(NHANES_clean.AlcoholYear, ax=ax[0])

def get_sampling_dist(sampSize=50, nsamps=2500):
    sampMeans = []
    for i in range(nsamps):
        samp = NHANES_clean.sample(sampsize, random_state=rng)['AlcoholYear']
        sampMeans.append([samp.mean()])
    return(pd.DataFrame(sampMeans, columns=['sampleMean']))

sampDist = get_sampling_dist()
sns.histplot(sampDist.sampleMean, bins=100, stat='density', element='poly', fill=None)
x = np.arange(sampDist.sampleMean.min(), sampDist.sampleMean.max(), 0.1)
y = norm.pdf(x, loc=sampDist.sampleMean.mean(), scale=sampDist.sampleMean.std())
plt.plot(x,y, color='red')
[<matplotlib.lines.Line2D at 0x7faacb5001c0>]
_images/07-Sampling_8_1.png