This post is a small companion to “Why are Charts an Important Part of Statistical Analysis?” In it, you’ll find the R scripts for exploring Anscombe’s Quartet and reproducing the processes you saw in that post.

I recommend working in RStudio. You can find the instructions for that here if you don’t have it already.

Set Up Your Environment and Get the Anscombe Data

library(stats)
library(graphics)
library(ggplot2)
library(plyr)
library(datasets)

# Split the wide anscombe table into one (x, y, Set) data frame per pair
## Set 1: columns x1 / y1
anscombe.1 <- with(
  anscombe,
  data.frame(x = x1, y = y1, Set = "Anscombe Set 1")
)

## Set 2: columns x2 / y2
anscombe.2 <- with(
  anscombe,
  data.frame(x = x2, y = y2, Set = "Anscombe Set 2")
)

## Set 3: columns x3 / y3
anscombe.3 <- with(
  anscombe,
  data.frame(x = x3, y = y3, Set = "Anscombe Set 3")
)

## Set 4: columns x4 / y4
anscombe.4 <- with(
  anscombe,
  data.frame(x = x4, y = y4, Set = "Anscombe Set 4")
)

## Stack the four frames row-wise into a single long data frame
anscombe.data <- do.call(
  rbind,
  list(anscombe.1, anscombe.2, anscombe.3, anscombe.4)
)

Get Some Summary Statistics for all the Distributions

Method 1: A Few Stats One at a Time

# Mean of x and of y within each Set (one output row per Set)
aggregate(
  cbind(mean_x = x, mean_y = y) ~ Set,
  data = anscombe.data,
  FUN = mean
)

# Standard deviation of x and of y within each Set
aggregate(
  cbind(sd_x = x, sd_y = y) ~ Set,
  data = anscombe.data,
  FUN = sd
)

# Correlations
# Correlation between the `x` and `y` columns of `data`, returned as a
# one-row data frame with a single column `r`.
correlation <- function(data) {
  r_xy <- cor(data$x, data$y)
  data.frame(r = r_xy)
}
# Apply correlation() to the rows of each Set and stack the per-set results
ddply(anscombe.data, "Set", .fun = correlation)

## Alternatively, work on the wide anscombe table directly:
## first the variance of each of its eight columns, then the correlation of
## column i with column i + 4 (x1 with y1, x2 with y2, ...) for i = 1..4
vapply(seq_len(8), function(col) var(anscombe[[col]]), numeric(1))
vapply(
  seq_len(4),
  function(i) cor(anscombe[[i]], anscombe[[i + 4]]),
  numeric(1)
)

Method 2: A Bunch of Stats All at Once

# Generate summary stats using the fBasics package
# (fBasics must already be installed, otherwise library() stops with an error)
library(fBasics)
# One call summarises every column of anscombe at once; the table it prints
# is reproduced in the output below.
fBasics::basicStats(anscombe)

# Output:
> fBasics::basicStats(anscombe)
                   x1        x2        x3        x4        y1        y2        y3        y4
nobs        11.000000 11.000000 11.000000 11.000000 11.000000 11.000000 11.000000 11.000000
NAs          0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000
Minimum      4.000000  4.000000  4.000000  8.000000  4.260000  3.100000  5.390000  5.250000
Maximum     14.000000 14.000000 14.000000 19.000000 10.840000  9.260000 12.740000 12.500000
1. Quartile  6.500000  6.500000  6.500000  8.000000  6.315000  6.695000  6.250000  6.170000
3. Quartile 11.500000 11.500000 11.500000  8.000000  8.570000  8.950000  7.980000  8.190000
Mean         9.000000  9.000000  9.000000  9.000000  7.500909  7.500909  7.500000  7.500909
Median       9.000000  9.000000  9.000000  8.000000  7.580000  8.140000  7.110000  7.040000
Sum         99.000000 99.000000 99.000000 99.000000 82.510000 82.510000 82.500000 82.510000
SE Mean      1.000000  1.000000  1.000000  1.000000  0.612541  0.612568  0.612196  0.612242
LCL Mean     6.771861  6.771861  6.771861  6.771861  6.136083  6.136024  6.135943  6.136748
UCL Mean    11.228139 11.228139 11.228139 11.228139  8.865735  8.865795  8.864057  8.865070
Variance    11.000000 11.000000 11.000000 11.000000  4.127269  4.127629  4.122620  4.123249
Stdev        3.316625  3.316625  3.316625  3.316625  2.031568  2.031657  2.030424  2.030579
Skewness     0.000000  0.000000  0.000000  2.466911 -0.048374 -0.978693  1.380120  1.120774
Kurtosis    -1.528926 -1.528926 -1.528926  4.520661 -1.199123 -0.514319  1.240044  0.628751

Generate the ggplots and Produce a Simple Visualization

# Get ggpubr
# Install only when ggpubr is missing, so re-running the script does not
# reinstall it from GitHub every time. requireNamespace() checks whether a
# package is available without attaching it; devtools is only needed for the
# single namespaced install_github() call, so it is never attached.
if (!requireNamespace("ggpubr", quietly = TRUE)) {
  if (!requireNamespace("devtools", quietly = TRUE)) {
    install.packages("devtools")
  }
  devtools::install_github("kassambara/ggpubr")
}
library(ggpubr)

# Generate individual ggplots for each distribution: a scatterplot of each
# x/y pair with an lm smoother on top, all four using the same colour
ans1 <- ggplot(anscombe, aes(x = x1, y = y1)) +
  geom_point(colour = "#2E8697", size = 3) +
  geom_smooth(method = lm, colour = "#2E8697", fill = "#2E8697", alpha = 0.3)

ans2 <- ggplot(anscombe, aes(x = x2, y = y2)) +
  geom_point(colour = "#2E8697", size = 3) +
  geom_smooth(method = lm, colour = "#2E8697", fill = "#2E8697", alpha = 0.3)

ans3 <- ggplot(anscombe, aes(x = x3, y = y3)) +
  geom_point(colour = "#2E8697", size = 3) +
  geom_smooth(method = lm, colour = "#2E8697", fill = "#2E8697", alpha = 0.3)

ans4 <- ggplot(anscombe, aes(x = x4, y = y4)) +
  geom_point(colour = "#2E8697", size = 3) +
  geom_smooth(method = lm, colour = "#2E8697", fill = "#2E8697", alpha = 0.3)

# Arrange all the plots in a single frame: a 2 x 2 grid labelled A to D.
# NOTE(review): rremove("x.text") is added to ans4 only, so only the last
# panel has it applied -- confirm that is intended.
ggarrange(
  plotlist = list(ans1, ans2, ans3, ans4 + rremove("x.text")),
  labels = c("A", "B", "C", "D"),
  ncol = 2,
  nrow = 2
)
Exploring Anscombe's Quartet with a ggplot2 visualization produced in R
A simple Anscombe's Quartet visualization produced in R using ggplot2.
License: 2020 Xyzology | CC-BY-SA | This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.

You can obviously make adjustments as you see fit for each of those plots. For example, in “Why are Charts an Important Part of Statistical Analysis?” I used part of the Xyzology branding palette to keep things in line with this site.

I highly recommend checking out STHDA's outstanding coverage of ggplot2 scatterplot formatting if you need some help with that.

Advertisements Disclosure

I will always make it clear if I am writing to endorse or recommend a specific product(s) or service(s). I hate it when I visit a site only to find out that the article is just one big ad.

Various ads may be displayed on this post to help defray the operating cost of this blog. I may make a small commission on any purchases you make by clicking on those advertisements. Thank you for supporting my work bringing you accurate and actionable information on data literacy, analytics, and engineering.