This post is a small companion to Why are Charts an Important Part of Statistical Analysis? In it, you’ll find the R scripts for exploring Anscombe’s Quartet and reproducing the processes you saw in that post.

I recommend working in RStudio. You can find the instructions for that here if you don’t have it already.

## Set Up Your Environment and Get the Anscombe Data

```r
library(stats)
library(graphics)
library(ggplot2)
library(plyr)
library(datasets)

# Convert anscombe into a set of data frames
# Builder: pull one x column and one y column out of the wide anscombe table,
# rename them to x and y, and tag every row with the name of its set.
anscombe.set <- function(x.col, y.col, label) {
  data.frame(
    x = anscombe[[x.col]],
    y = anscombe[[y.col]],
    Set = label
  )
}

## Anscombe 1
anscombe.1 <- anscombe.set("x1", "y1", "Anscombe Set 1")

## Anscombe 2
anscombe.2 <- anscombe.set("x2", "y2", "Anscombe Set 2")

## Anscombe 3
anscombe.3 <- anscombe.set("x3", "y3", "Anscombe Set 3")

## Anscombe 4
anscombe.4 <- anscombe.set("x4", "y4", "Anscombe Set 4")

## Collapse all the frames into one frame
anscombe.data <- rbind(anscombe.1, anscombe.2, anscombe.3, anscombe.4)
```

## Get Some Summary Statistics for all the Distributions

### Method 1: A Few Stats One at a Time

```r
# Means
# aggregate() applies mean() to x and y within each Set; the names given
# inside cbind() become the output column names.
aggregate(
  cbind(mean_x = x, mean_y = y) ~ Set,
  data = anscombe.data,
  FUN = mean
)

# SDs
# Same grouping, with sd() as the summary function.
aggregate(
  cbind(sd_x = x, sd_y = y) ~ Set,
  data = anscombe.data,
  FUN = sd
)

# Correlations
# Correlation between the x and y columns of one data frame.
#   data: a data frame with numeric columns named x and y.
#   Returns a one-row data frame with a single column, r, holding cor(x, y).
correlation <- function(data) {
  # `[[` extracts each column by its exact name; a backslash-escaped dollar
  # sign is not valid R outside a string and would stop the script from parsing.
  data.frame(r = cor(data[["x"]], data[["y"]]))
}
# One correlation per set: split anscombe.data on Set, apply correlation()
# to each piece, and stack the one-row results.
ddply(anscombe.data, "Set", correlation)

## Alternatively, you can get correlations and variance straight from the
## wide anscombe table.
## Variance of each of its eight columns:
vapply(seq_len(8), function(j) var(anscombe[[j]]), numeric(1))
## Correlation of columns 1-4 with columns 5-8, pairing column j with j + 4:
vapply(
  seq_len(4),
  function(j) cor(anscombe[[j]], anscombe[[j + 4]]),
  numeric(1)
)
```

### Method 2: A Bunch of Stats All at Once

```r
# Generate summary stats using the fBasics package
# Attach fBasics for this session
library(fBasics)
# Print the table of summary statistics for every column of anscombe.
# The call is package-qualified, so it resolves to fBasics' basicStats()
# regardless of what else is attached.
fBasics::basicStats(anscombe)

# Output:
> fBasics::basicStats(anscombe)
                   x1        x2        x3        x4        y1        y2        y3        y4
nobs        11.000000 11.000000 11.000000 11.000000 11.000000 11.000000 11.000000 11.000000
NAs          0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
Minimum      4.000000  4.000000  4.000000  8.000000  4.260000  3.100000  5.390000  5.250000
Maximum     14.000000 14.000000 14.000000 19.000000 10.840000  9.260000 12.740000 12.500000
1. Quartile  6.500000  6.500000  6.500000  8.000000  6.315000  6.695000  6.250000  6.170000
3. Quartile 11.500000 11.500000 11.500000  8.000000  8.570000  8.950000  7.980000  8.190000
Mean         9.000000  9.000000  9.000000  9.000000  7.500909  7.500909  7.500000  7.500909
Median       9.000000  9.000000  9.000000  8.000000  7.580000  8.140000  7.110000  7.040000
Sum         99.000000 99.000000 99.000000 99.000000 82.510000 82.510000 82.500000 82.510000
SE Mean      1.000000  1.000000  1.000000  1.000000  0.612541  0.612568  0.612196  0.612242
LCL Mean     6.771861  6.771861  6.771861  6.771861  6.136083  6.136024  6.135943  6.136748
UCL Mean    11.228139 11.228139 11.228139 11.228139  8.865735  8.865795  8.864057  8.865070
Variance    11.000000 11.000000 11.000000 11.000000  4.127269  4.127629  4.122620  4.123249
Stdev        3.316625  3.316625  3.316625  3.316625  2.031568  2.031657  2.030424  2.030579
Skewness     0.000000  0.000000  0.000000  2.466911 -0.048374 -0.978693  1.380120  1.120774
Kurtosis    -1.528926 -1.528926 -1.528926  4.520661 -1.199123 -0.514319  1.240044  0.628751
```

## Generate the `ggplots` and Produce a Simple Visualization

```r
# Get ggpubr
# Install ggpubr from GitHub only when it is not already available, so that
# re-running the script does not hit the network and reinstall it each time.
# requireNamespace() checks for a package without attaching it; devtools is
# only needed for the install step, so it is never attached.
if (!requireNamespace("ggpubr", quietly = TRUE)) {
  if (!requireNamespace("devtools", quietly = TRUE)) {
    install.packages("devtools")
  }
  devtools::install_github("kassambara/ggpubr")
}
library(ggpubr)

# Generate individual ggplots for each distribution
# Every panel uses the same two layers (the raw points, plus a linear-model
# smoother with a shaded band), so they are defined once and added to each plot.
plot.color <- "#2E8697"
fit.layers <- list(
  geom_point(size = 3, color = plot.color),
  geom_smooth(color = plot.color, method = lm, fill = plot.color, alpha = 0.3)
)

ans1 <- ggplot(anscombe, aes(x = x1, y = y1)) + fit.layers
ans2 <- ggplot(anscombe, aes(x = x2, y = y2)) + fit.layers
ans3 <- ggplot(anscombe, aes(x = x3, y = y3)) + fit.layers
ans4 <- ggplot(anscombe, aes(x = x4, y = y4)) + fit.layers

# Arrange all the plots in a single frame
ggarrange(ans1,ans2,ans3,ans4 + rremove("x.text"), labels=c("A", "B", "C", "D"), ncol=2, nrow=2)
```
I highly recommend checking out STHDA's outstanding coverage of `ggplot2` scatterplot formatting if you need some help with that.