Chapter 2: Working with data
Contents
Chapter 2: Working with data
library(tidyverse)
library(cowplot)
# Make ggplot2's minimal theme, with a 14 pt base font size, the default theme
# for plots created after this point; a plot can still override it by adding
# its own theme.
theme_set(theme_minimal(base_size = 14))
── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.4.1 ✔ purrr 1.0.1
✔ tibble 3.1.8 ✔ dplyr 1.1.0
✔ tidyr 1.3.0 ✔ stringr 1.5.0
✔ readr 2.1.4 ✔ forcats 1.0.0
── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
Table 2.1
# Table 2.1: how many students gave each answer to the survey question
# "Why are you taking this class?".
# `why` holds the answer text and `nResponses` the number of students who
# chose that answer; the two vectors are aligned element by element.
classData <- data.frame(
  why = c(
    "It fulfills a degree plan requirement",
    "It fulfills a General Education Breadth Requirement",
    "It is not required but I am interested in the topic",
    "Other"
  ),
  nResponses = c(105, 32, 11, 4)
)

# Render the counts as a formatted table, replacing the short column names
# with reader-friendly headers.
knitr::kable(
  x = classData,
  booktabs = TRUE,
  caption = 'Counts of the prevalence of different responses to the question "Why are you taking this class?"',
  col.names = c("Why are you taking this class?", "Number of students")
)
Table: Counts of the prevalence of different responses to the question "Why are you taking this class?"
|Why are you taking this class? | Number of students|
|:---------------------------------------------------|------------------:|
|It fulfills a degree plan requirement | 105|
|It fulfills a General Education Breadth Requirement | 32|
|It is not required but I am interested in the topic | 11|
|Other | 4|
Figure 2.1
# Figure inspired by https://commons.wikimedia.org/wiki/File:Reliability_and_validity.svg
# The circle-drawing helper gg_circle() defined below is taken from
# https://stackoverflow.com/questions/6862742/draw-a-circle-with-ggplot2
# Fix the random seed so the simulated points are identical on every run.
set.seed(12345)
# Build a circle as a ggplot2 annotation.
#
# The circle is described as a "ribbon": 100 x positions sweep from
# xc + r to xc - r, the upper half-circle provides ymax and the lower
# half-circle provides ymin at each of those positions.
#
# Arguments:
#   r       radius of the circle
#   xc, yc  x and y coordinates of the centre
#   color   outline colour (default "black")
#   fill    fill colour of the interior (default NA)
#   ...     any further arguments, forwarded unchanged to annotate()
#
# Returns whatever annotate("ribbon", ...) returns, so the result can be
# added to a plot with `+`.
gg_circle <- function(r, xc, yc, color = "black", fill = NA, ...) {
  # Angles for the upper half-circle; negating them traces the lower half.
  theta <- seq(0, pi, length.out = 100)

  annotate(
    "ribbon",
    x = xc + r * cos(theta),
    ymin = yc + r * sin(-theta),
    ymax = yc + r * sin(theta),
    color = color,
    fill = fill,
    ...
  )
}
# Figure 2.1: four clouds of simulated measurements drawn over a bullseye,
# illustrating reliability (tight vs. wide scatter) and validity (centred
# on the bullseye vs. shifted away from it), arranged in a 2 x 2 grid.

npoints <- 16
rel_mult <- 0.75 # spread multiplier for the "reliable" clouds
unrel_mult <- 2.5 # spread multiplier for the "unreliable" clouds

# "Val" clouds are centred on the origin, where the bullseye is drawn;
# "Inval" clouds are shifted by +2 in both x and y.
# NOTE: keep the rnorm() calls in this order -- reordering them would change
# which random draws land in which column and therefore alter the figure.
plotDf <- tibble(
  X_RelVal = rnorm(npoints) * rel_mult,
  Y_RelVal = rnorm(npoints) * rel_mult,
  X_RelInval = rnorm(npoints) * rel_mult + 2,
  Y_RelInval = rnorm(npoints) * rel_mult + 2,
  X_UnrelInval = rnorm(npoints) * unrel_mult + 2,
  Y_UnrelInval = rnorm(npoints) * unrel_mult + 2,
  X_UnrelVal = rnorm(npoints) * unrel_mult,
  Y_UnrelVal = rnorm(npoints) * unrel_mult
)

pointsize <- 3
linesize <- 2

# Everything the four panels have in common, built once and reused:
# three red outline rings (radius 4, 3, 2), a filled red centre (radius 1),
# the data points, fixed axis limits, and an empty theme.
# Adding a list to a ggplot adds its elements one after another, so the
# drawing order is the order of this list.
target_layers <- c(
  lapply(c(4, 3, 2), function(radius) {
    gg_circle(r = radius, xc = 0.0, yc = 0.0, linewidth = linesize, color = "red")
  }),
  list(
    gg_circle(
      r = 1, xc = 0.0, yc = 0.0,
      linewidth = linesize, color = "red", fill = "red"
    ),
    geom_point(size = pointsize),
    xlim(-10, 10),
    ylim(-10, 10),
    theme_void()
  )
)

# One panel per cloud; only the plotted x/y columns differ between panels.
p1 <- ggplot(plotDf, aes(X_RelVal, Y_RelVal)) + target_layers
p2 <- ggplot(plotDf, aes(X_UnrelVal, Y_UnrelVal)) + target_layers
p3 <- ggplot(plotDf, aes(X_RelInval, Y_RelInval)) + target_layers
p4 <- ggplot(plotDf, aes(X_UnrelInval, Y_UnrelInval)) + target_layers

# Arrange the panels in two columns; labels follow the panel order p1..p4.
plot_grid(
  p1, p2, p3, p4,
  ncol = 2,
  label_size = 12,
  labels = c(
    "A: Reliable and valid",
    "B: Unreliable but valid",
    "C: Reliable but invalid",
    "D: Unreliable and invalid"
  )
)
Table 2.2
# Table 2.2: which kinds of numeric operation are marked "OK" for each
# scale of measurement. Rows are the scales; each column is one kind of
# operation, and an empty string leaves the corresponding cell blank.
measTypes <- data.frame(
  equal = rep("OK", 4),
  gtlt = rep(c("", "OK"), times = c(1, 3)),
  addsub = rep(c("", "OK"), times = c(2, 2)),
  multdiv = rep(c("", "OK"), times = c(3, 1)),
  row.names = c("Nominal", "Ordinal", "Interval", "Ratio")
)

# Render the table, replacing the short column names with the operation
# symbols shown to the reader.
knitr::kable(
  x = measTypes,
  booktabs = TRUE,
  caption = "Different scales of measurement admit different types of numeric operations",
  col.names = c("Equal/not equal", ">/<", "+/-", "Multiply/divide")
)
Table: Different scales of measurement admit different types of numeric operations
| |Equal/not equal |>/< |+/- |Multiply/divide |
|:--------|:---------------|:---|:---|:---------------|
|Nominal |OK | | | |
|Ordinal |OK |OK | | |
|Interval |OK |OK |OK | |
|Ratio |OK |OK |OK |OK |