Chapter 2: Working with data

# Load the tidyverse (ggplot2, tibble, dplyr, ...) and cowplot, whose
# plot_grid() is used to arrange the multi-panel figure.
library(tidyverse)
library(cowplot)
# Make theme_minimal() with a 14 pt base font the default ggplot2 theme.
theme_set(theme_minimal(base_size = 14))
── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
 ggplot2 3.4.1      purrr   1.0.1
 tibble  3.1.8      dplyr   1.1.0
 tidyr   1.3.0      stringr 1.5.0
 readr   2.1.4      forcats 1.0.0
── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
 dplyr::filter() masks stats::filter()
 dplyr::lag()    masks stats::lag()

Table 2.1

# Frequency table of student answers to "Why are you taking this class?":
# one row per answer option (`why`) with the number of students who chose it
# (`nResponses`).
classData <- data.frame(
  why = c(
    "It fulfills a degree plan requirement",
    "It fulfills a General Education Breadth Requirement",
    "It is not required but I am interested in the topic",
    "Other"
  ),
  nResponses = c(105, 32, 11, 4)
)

# Render the response counts as a captioned table, replacing the data frame's
# column names with reader-friendly headers.
knitr::kable(
  classData,
  booktabs = TRUE,
  caption = 'Counts of the prevalence of different responses to the question "Why are you taking this class?"',
  col.names = c("Why are you taking this class?", "Number of students")
)
Table: Counts of the prevalence of different responses to the question "Why are you taking this class?"

|Why are you taking this class?                      | Number of students|
|:---------------------------------------------------|------------------:|
|It fulfills a degree plan requirement               |                105|
|It fulfills a General Education Breadth Requirement |                 32|
|It is not required but I am interested in the topic |                 11|
|Other                                               |                  4|

Figure 2.1

# Figure inspired by https://commons.wikimedia.org/wiki/File:Reliability_and_validity.svg

# The circle-drawing helper gg_circle() below is taken from
# https://stackoverflow.com/questions/6862742/draw-a-circle-with-ggplot2

# Fix the random seed so the rnorm() draws used for the plotted points are
# reproducible from run to run.
set.seed(12345)
# Build a ggplot2 "ribbon" annotation that traces a circle of radius `r`
# centred on (`xc`, `yc`).
#   color: outline colour (default "black")
#   fill:  interior fill (default NA)
#   ...:   further arguments forwarded unchanged to annotate()
# The circle is sampled at 100 angles between 0 and pi; at each resulting x
# position the ribbon runs from the lower half-circle (ymin) to the upper
# half-circle (ymax).
gg_circle <- function(r, xc, yc, color="black", fill=NA, ...) {
    theta <- seq(0, pi, length.out = 100)
    annotate(
        "ribbon",
        x = xc + r * cos(theta),
        ymin = yc + r * sin(-theta),
        ymax = yc + r * sin(theta),
        color = color,
        fill = fill,
        ...
    )
}


# Simulation settings: number of points per panel, and the multipliers applied
# to standard-normal draws for the tight ("reliable") and spread-out
# ("unreliable") point clouds.
npoints <- 16
rel_mult <- 0.75
unrel_mult <- 2.5

# One X/Y column pair per panel. "Val" clouds are centred on the origin,
# "Inval" clouds are shifted by +2 on both axes. The columns stay in this
# order so the random draws are consumed in the same sequence.
plotDf <- tibble(
  X_RelVal     = rel_mult * rnorm(npoints),
  Y_RelVal     = rel_mult * rnorm(npoints),
  X_RelInval   = 2 + rel_mult * rnorm(npoints),
  Y_RelInval   = 2 + rel_mult * rnorm(npoints),
  X_UnrelInval = 2 + unrel_mult * rnorm(npoints),
  Y_UnrelInval = 2 + unrel_mult * rnorm(npoints),
  X_UnrelVal   = unrel_mult * rnorm(npoints),
  Y_UnrelVal   = unrel_mult * rnorm(npoints)
)

# Point size and ring line width shared by all four panels.
pointsize <- 3
linesize <- 2

# Draw one panel of the figure: a red target centred on the origin with the
# points from columns `x` and `y` of `df` plotted on top.
#   df:         data frame holding the point coordinates
#   x, y:       unquoted column names in `df` (forwarded to aes() via {{ }})
#   point_size: size passed to geom_point()
#   line_width: linewidth passed to gg_circle() for every ring
# Returns a ggplot object. Layer order is: rings of radius 4, 3, 2, the filled
# bullseye of radius 1, then the points; both axes are fixed to [-10, 10] and
# theme_void() removes all axes and background.
target_plot <- function(df, x, y, point_size = 3, line_width = 2) {
  # Unfilled outer rings, outermost first.
  rings <- lapply(c(4, 3, 2), function(radius) {
    gg_circle(r = radius, xc = 0.0, yc = 0.0,
              linewidth = line_width, color = "red")
  })
  ggplot(df, aes({{ x }}, {{ y }})) +
    rings +
    gg_circle(r = 1, xc = 0.0, yc = 0.0,
              linewidth = line_width, color = "red", fill = "red") +
    geom_point(size = point_size) +
    xlim(-10, 10) + ylim(-10, 10) +
    theme_void()
}

# The four panels differ only in which pair of columns is plotted.
p1 <- target_plot(plotDf, X_RelVal, Y_RelVal, pointsize, linesize)
p2 <- target_plot(plotDf, X_UnrelVal, Y_UnrelVal, pointsize, linesize)
p3 <- target_plot(plotDf, X_RelInval, Y_RelInval, pointsize, linesize)
p4 <- target_plot(plotDf, X_UnrelInval, Y_UnrelInval, pointsize, linesize)

# Arrange the panels in a 2 x 2 grid, labelled to match the plotted columns.
plot_grid(p1, p2, p3, p4, ncol = 2, label_size = 12,
          labels = c('A: Reliable and valid',
                     'B: Unreliable but valid',
                     'C: Reliable but invalid',
                     'D: Unreliable and invalid'))
_images/02-Data_5_0.svg

Table 2.2

# Which numeric operations each scale of measurement admits: one row per
# scale, one column per operation, "OK" where the operation is permitted and
# an empty string where it is not.
measTypes <- data.frame(
  equal   = c("OK", "OK", "OK", "OK"),
  gtlt    = c("",   "OK", "OK", "OK"),
  addsub  = c("",   "",   "OK", "OK"),
  multdiv = c("",   "",   "",   "OK"),
  row.names = c("Nominal", "Ordinal", "Interval", "Ratio")
)
# Render the scale-by-operation grid as a captioned table, using operator
# symbols as the column headers.
knitr::kable(
  measTypes,
  booktabs = TRUE,
  caption = 'Different scales of measurement admit different types of numeric operations',
  col.names = c("Equal/not equal", ">/<", "+/-", "Multiply/divide")
)
Table: Different scales of measurement admit different types of numeric operations

|         |Equal/not equal |>/< |+/- |Multiply/divide |
|:--------|:---------------|:---|:---|:---------------|
|Nominal  |OK              |    |    |                |
|Ordinal  |OK              |OK  |    |                |
|Interval |OK              |OK  |OK  |                |
|Ratio    |OK              |OK  |OK  |OK              |