Contents

Chapter 3: Summarizing data

Contents

Chapter 3: Summarizing data#

import pandas as pd
import sidetable
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm


import rpy2.robjects as ro
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
pandas2ri.activate()
from rpy2.robjects.conversion import localconverter
# import NHANES package
base = importr('NHANES')

with localconverter(ro.default_converter + pandas2ri.converter):
  NHANES = ro.conversion.rpy2py(ro.r['NHANES'])

 
NHANES = NHANES.drop_duplicates(subset='ID')

Table 3.1#

pd.DataFrame(NHANES.PhysActive.value_counts(dropna=False))

	PhysActive
Yes	2972
No	2473
NaN	1334

Table 3.2#

table_df = NHANES.stb.freq(['PhysActive']).drop(
    ['cumulative_count', 'cumulative_percent'], axis=1)
table_df['RelativeFrequency'] = table_df.percent / 100
table_df = table_df.rename(columns={
    'count': 'AbsoluteFrequency',
    'percent': 'Percentage'})
table_df.style.hide(axis="index")

PhysActive	AbsoluteFrequency	Percentage	RelativeFrequency
Yes	2972	54.582185	0.545822
No	2473	45.417815	0.454178

Table 3.3#

NHANES_sleep = NHANES.query('SleepHrsNight > 0')
table_df = NHANES_sleep.stb.freq(
    ['SleepHrsNight'])
table_df['RelativeFrequency'] = table_df.percent / 100
table_df = table_df.rename(columns={
    'count': 'AbsoluteFrequency',
    'percent': 'Percentage'}).sort_values(by='SleepHrsNight').drop(
    ['cumulative_count', 'cumulative_percent'], axis=1)

table_df.style.hide(axis="index")

SleepHrsNight	AbsoluteFrequency	Percentage	RelativeFrequency
2	9	0.178749	0.001787
3	49	0.973188	0.009732
4	200	3.972195	0.039722
5	406	8.063555	0.080636
6	1172	23.277061	0.232771
7	1394	27.686197	0.276862
8	1405	27.904667	0.279047
9	271	5.382324	0.053823
10	97	1.926514	0.019265
11	15	0.297915	0.002979
12	17	0.337637	0.003376

Figure 3.2#

fig, ax = plt.subplots(1, 2, figsize=(12,8))

sns.histplot(NHANES_sleep.SleepHrsNight, ax=ax[0], discrete=True)
sns.histplot(NHANES_sleep.SleepHrsNight, ax=ax[1], stat='density', discrete=True)

<Axes: xlabel='SleepHrsNight', ylabel='Density'>

_images/03-SummarizingData_9_1.png

Figure 3.3#

fig, ax = plt.subplots(1, 2, figsize=(12,8))

sns.histplot(NHANES_sleep.SleepHrsNight, ax=ax[0],
             element="step", fill=False, cumulative=True, )
sns.histplot(NHANES_sleep.SleepHrsNight,  ax=ax[0], discrete=True)

sns.histplot(NHANES_sleep.SleepHrsNight, ax=ax[1], stat='density',
             element="step", fill=False, cumulative=True, )
sns.histplot(NHANES_sleep.SleepHrsNight, ax=ax[1], stat='density', discrete=True)

<Axes: xlabel='SleepHrsNight', ylabel='Density'>

_images/03-SummarizingData_11_1.png

Figure 3.4#

fig, ax = plt.subplots(1, 2, figsize=(12,8))

sns.histplot(NHANES.Age, binwidth=1, ax=ax[0])

sns.histplot(NHANES.Height, binwidth=1, ax=ax[1])

<Axes: xlabel='Height', ylabel='Count'>

_images/03-SummarizingData_13_1.png

Figure 3.5#

fig, ax = plt.subplots(2, 2, figsize=(12,8))

NHANES['isChild'] = NHANES.Age < 18
NHANES_adult = NHANES.dropna(subset=['Age', 'Height']).query('Age > 17')

sns.histplot(NHANES, x='Height', hue='isChild', binwidth=1, ax=ax[0][0])
ax[0][0].set_title('A. All individuals')

sns.histplot(NHANES_adult, x='Height',  binwidth=1, ax=ax[0][1])
ax[0][0].set_title('B. Adults only')

sns.histplot(NHANES_adult, x='Height',  binwidth=.1, ax=ax[1][0])
ax[1][0].set_title('B. Adults only (bin width=.1)')

Text(0.5, 1.0, 'B. Adults only (bin width=.1)')

_images/03-SummarizingData_15_1.png

Table 3.5#

NHANES_adult['Height'][44:50]

   169.6
   169.8
  167.5
  155.2
  173.8
  174.5
Name: Height, dtype: float64

Figure 3.6#

# first update the summary to include the mean and standard deviation of each
# dataset
# NOTE: some NA values get turned into large negative numbers in the conversion from R, so remove those

pulsedata = NHANES_adult.dropna(subset=['Pulse']).query('Pulse > 0')[['Pulse']]
heightdata = NHANES_adult.dropna(subset=['Height']).query('Height > 0')[['Height']]

pulse_summary = pulsedata.describe()
pulse_summary.loc['binwidth'] = np.diff(np.histogram_bin_edges(pulsedata.Pulse, 'fd'))[0]

height_summary = heightdata.describe()
height_summary.loc['binwidth'] = np.diff(np.histogram_bin_edges(heightdata.Height, 'fd'))[0]

heightDist = pd.DataFrame({'x': np.arange(height_summary.loc['min'][0], height_summary.loc['max'][0], .1)})
heightDist['norm'] = norm.pdf(
    heightDist.x, height_summary.loc['mean'][0], height_summary.loc['std'])

pulseDist = pd.DataFrame({'x': np.arange(pulse_summary.loc['min'][0], pulse_summary.loc['max'][0], .1)})
pulseDist['norm'] = norm.pdf(
    pulseDist.x, pulse_summary.loc['mean'][0], pulse_summary.loc['std'])

fig, ax = plt.subplots(1, 2, figsize=(12,8))

sns.histplot(heightdata.Height, stat='density', binwidth=height_summary.loc['binwidth'][0], color='gray', ax=ax[0])
ax[0].plot(heightDist.x, heightDist.norm, lw=2, color='b')

sns.histplot(pulsedata.Pulse, stat='density', binwidth=pulse_summary.loc['binwidth'][0], color='gray', ax=ax[1])
ax[1].plot(pulseDist.x, pulseDist.norm, lw=2, color='b')

[<matplotlib.lines.Line2D at 0x7f56a7c1d930>]

_images/03-SummarizingData_20_1.png

Figure 3.7#

waittimes = pd.read_csv('https://raw.githubusercontent.com/statsthinking21/statsthinking21-figures-data/main/04/sfo_wait_times_2017.csv')

fig, ax = plt.subplots(1, 2, figsize=(12,8))
sns.histplot(waittimes.waittime, binwidth=1, ax=ax[0])

fbdata = pd.read_csv('https://raw.githubusercontent.com/statsthinking21/statsthinking21-figures-data/main/04/facebook_combined.txt', names=['V1', 'V2'], delimiter=' ')
friends_table = fbdata.groupby('V1').count()
sns.histplot(friends_table.V2, binwidth=2, ax=ax[1])
plt.plot([friends_table.max()['V2']], [5], marker='.', markersize=14, color='k')

[<matplotlib.lines.Line2D at 0x7f56a6236e00>]

_images/03-SummarizingData_22_1.png