ggplot2 ecosystem
& designing visualizations

Lecture 11

Dr. Colin Rundel

The wider ggplot2 ecosystem

ggthemes

ggplot2 themes

# Base boxplot reused by the theme examples: body mass per species, with the
# box fill mapped to species as well.
g = ggplot(
  data = palmerpenguins::penguins,
  mapping = aes(x = species, y = body_mass_g, fill = species)
) +
  geom_boxplot()
g

# The same plot under three of ggplot2's built-in themes.
g + theme_dark()

g + theme_minimal()

g + theme_void()

ggthemes

# Each example pairs a ggthemes theme with the fill scale of the same name,
# applied to the base boxplot `g`.
g + ggthemes::theme_economist() + 
  ggthemes::scale_fill_economist()

g + ggthemes::theme_fivethirtyeight() + 
  ggthemes::scale_fill_fivethirtyeight()

g + ggthemes::theme_gdocs() +
  ggthemes::scale_fill_gdocs()

g + ggthemes::theme_wsj() +
  ggthemes::scale_fill_wsj()

And for those who miss Excel

# Two Excel-styled theme / fill-scale pairs from ggthemes applied to `g`:
# theme_excel() and theme_excel_new().
g + ggthemes::theme_excel() +
  ggthemes::scale_fill_excel()

g + ggthemes::theme_excel_new() +
  ggthemes::scale_fill_excel_new()

# Subset of mtcars (car name, weight, mpg) restricted to cars whose weight is
# strictly between 2.75 and 3.45.
d = filter(
  tibble(car = rownames(mtcars), weight = mtcars$wt, mpg = mtcars$mpg),
  weight > 2.75,
  weight < 3.45
)

# Plain geom_text(): labels are drawn at the point coordinates, with no
# overlap avoidance.
ggplot(d, aes(x = weight, y = mpg)) +
  geom_point(color = "red") +
  geom_text(aes(label = car))

# Same scatterplot, but labels are placed by ggrepel with its default settings.
ggplot(d, aes(x = weight, y = mpg)) +
  geom_point(color = "red") +
  ggrepel::geom_text_repel(aes(label = car))

# Tuned version: nudge labels along x, add padding around label boxes and
# points, and draw faint arrow segments from each label to its point.
ggplot(d, aes(x = weight, y = mpg)) +
  geom_point(color = "red") +
  ggrepel::geom_text_repel(
    aes(label = car),
    nudge_x = 0.1,
    box.padding = 1,
    point.padding = 0.6,
    arrow = arrow(length = unit(0.02, "npc")),
    segment.alpha = 0.25
  )

ggplot objects

library(patchwork)

# Four penguin plots used as building blocks for the layout examples.

# Body mass by island.
p1 = ggplot(data = palmerpenguins::penguins) +
  geom_boxplot(mapping = aes(x = island, y = body_mass_g))

# Body mass by species.
p2 = ggplot(data = palmerpenguins::penguins) +
  geom_boxplot(mapping = aes(x = species, y = body_mass_g))

# Body mass against flipper length, colored by sex.
p3 = ggplot(data = palmerpenguins::penguins) +
  geom_point(
    mapping = aes(x = flipper_length_mm, y = body_mass_g, color = sex)
  )

# Body mass against bill length, colored by sex.
p4 = ggplot(data = palmerpenguins::penguins) +
  geom_point(
    mapping = aes(x = bill_length_mm, y = body_mass_g, color = sex)
  )

# Show the class vector of a ggplot object.
class(p1)
[1] "ggplot2::ggplot" "ggplot"          "ggplot2::gg"     "S7_object"      
[5] "gg"             

# With patchwork loaded, `+` combines ggplot objects into a single layout.
p1 + p2 + p3 + p4

# Same four plots, requesting a single row.
p1 + p2 + p3 + p4 + plot_layout(nrow=1)

# `/` stacks: p1 on top, the parenthesised group of three underneath.
p1 / (p2 + p3 + p4)

# Add an overall title and automatic panel tags starting from "A".
p1 + p2 + p3 + p4 + 
  plot_annotation(title = "Palmer Penguins", tag_levels = c("A"))

# Nested layout: the braces group sub-layouts. The innermost group (p3, p4) is
# laid out in one column and starts a new tag level, so tag_levels
# c("1", "a") applies "1"-style tags at the top level and "a"-style tags to
# the nested level, each prefixed with "Fig ".
p1 + {
  p2 + {
    p3 + p4 + plot_layout(ncol = 1) + plot_layout(tag_level = 'new')
  }
} + 
  plot_layout(ncol = 1) +
  plot_annotation(tag_levels = c("1","a"), tag_prefix = "Fig ")

GGally

# Pairwise plot matrix over the columns of the penguins data.
GGally::ggpairs(palmerpenguins::penguins)

# Work on a copy of airquality and replace the numeric month with its name
# (month.name is indexed by the month number).
airq = airquality
airq$Month = month.name[airq$Month]

# Animated temperature lines, one per month, revealed along Day.
# Each line's current point is joined by a dashed grey segment to x = 31, and
# the month label is drawn just past it at x = 31.1. clip = "off" together with
# the 40-unit right margin lets those labels be drawn outside the panel.
ggplot(airq, aes(Day, Temp, group = Month)) +
  geom_line() +
  geom_segment(aes(xend = 31, yend = Temp), linetype = 2, colour = "grey") +
  geom_point(size = 2) +
  geom_text(aes(x = 31.1, label = Month), hjust = 0) +
  gganimate::transition_reveal(Day) +
  coord_cartesian(clip = "off") +
  labs(title = "Temperature in New York", y = "Temperature (°F)") +
  theme_minimal() +
  theme(plot.margin = margin(5.5, 40, 5.5, 5.5))

Some other notable packages

  • marquee - add rendered markdown to your plots

  • thematic & brand.yml - automatic theming of plots to match your app / site

  • ggridges - creates ridgeline plots (stacked density plots)

  • ggdist - visualizations and utilities for distributions and uncertainty (think bayesian model output)

  • legendary - adds additional guides (legends) to ggplot2

More extensions

Why do we visualize?

Anscombe’s Quartet

# Print the built-in anscombe data (columns x1..x4, y1..y4) as a tibble.
datasets::anscombe |> as_tibble()
# A tibble: 11 × 8
      x1    x2    x3    x4    y1    y2    y3    y4
   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
 1    10    10    10     8  8.04  9.14  7.46  6.58
 2     8     8     8     8  6.95  8.14  6.77  5.76
 3    13    13    13     8  7.58  8.74 12.7   7.71
 4     9     9     9     8  8.81  8.77  7.11  8.84
 5    11    11    11     8  8.33  9.26  7.81  8.47
 6    14    14    14     8  9.96  8.1   8.84  7.04
 7     6     6     6     8  7.24  6.13  6.08  5.25
 8     4     4     4    19  4.26  3.1   5.39 12.5 
 9    12    12    12     8 10.8   9.13  8.15  5.56
10     7     7     7     8  4.82  7.26  6.42  7.91
11     5     5     5     8  5.68  4.74  5.73  6.89

Tidy anscombe

# Reshape the wide anscombe data (x1..x4, y1..y4) into one row per observation
# with columns group, x and y.
#
# names_sep = 1 splits each column name after its first character, so "x1"
# becomes ("x", "1"). The ".value" sentinel turns the first piece into the
# output column (x or y) and the second piece into `group`. This does in one
# step what previously took pivot_longer -> pivot_wider(values_fn = list) ->
# unnest. cols_vary = "slowest" keeps all rows of a group together (group 1
# first), which matches the row order of the earlier implementation.
#
# The outer parentheses print the result in addition to assigning it.
(tidy_anscombe = datasets::anscombe |>
  pivot_longer(
    everything(),
    names_to = c(".value", "group"),
    names_sep = 1,
    cols_vary = "slowest"
  ))
# A tibble: 44 × 3
   group     x     y
   <chr> <dbl> <dbl>
 1 1        10  8.04
 2 1         8  6.95
 3 1        13  7.58
 4 1         9  8.81
 5 1        11  8.33
 6 1        14  9.96
 7 1         6  7.24
 8 1         4  4.26
 9 1        12 10.8 
10 1         7  4.82
# ℹ 34 more rows

# Per-group means, standard deviations and x-y correlation; the result is
# returned ungrouped.
tidy_anscombe |>
  group_by(group) |>
  summarize(
    mean_x = mean(x),
    mean_y = mean(y),
    sd_x = sd(x),
    sd_y = sd(y),
    cor = cor(x, y),
    .groups = "drop"
  )
# A tibble: 4 × 6
  group mean_x mean_y  sd_x  sd_y   cor
  <chr>  <dbl>  <dbl> <dbl> <dbl> <dbl>
1 1          9   7.50  3.32  2.03 0.816
2 2          9   7.50  3.32  2.03 0.816
3 3          9   7.5   3.32  2.03 0.816
4 4          9   7.50  3.32  2.03 0.817

# One panel per group with its points and a fitted straight line; the color
# legend is hidden because the facets already identify the group.
ggplot(
  data = tidy_anscombe,
  mapping = aes(x = x, y = y, color = as.factor(group))
) +
  geom_point(size = 2) +
  facet_wrap(~group) +
  geom_smooth(
    method = "lm",
    se = FALSE,
    fullrange = TRUE,
    formula = y ~ x
  ) +
  guides(color = "none")

DatasauRus

# Scatterplot of every dataset in the datasaurus dozen, five panels per row.
ggplot(
  data = datasauRus::datasaurus_dozen,
  mapping = aes(x = x, y = y)
) +
  geom_point() +
  facet_wrap(~dataset, ncol = 5)

# Print the underlying long-format data.
datasauRus::datasaurus_dozen
# A tibble: 1,846 × 3
   dataset     x     y
   <chr>   <dbl> <dbl>
 1 dino     55.4  97.2
 2 dino     51.5  96.0
 3 dino     46.2  94.5
 4 dino     42.8  91.4
 5 dino     40.8  88.3
 6 dino     38.7  84.9
 7 dino     35.6  79.9
 8 dino     33.1  77.6
 9 dino     29.0  74.5
10 dino     26.2  71.4
# ℹ 1,836 more rows
# Per-dataset means, standard deviations and x-y correlation; the result is
# returned ungrouped.
datasauRus::datasaurus_dozen |>
  group_by(dataset) |>
  summarize(
    mean_x = mean(x),
    mean_y = mean(y),
    sd_x = sd(x),
    sd_y = sd(y),
    cor = cor(x, y),
    .groups = "drop"
  )
# A tibble: 13 × 6
   dataset    mean_x mean_y  sd_x  sd_y     cor
   <chr>       <dbl>  <dbl> <dbl> <dbl>   <dbl>
 1 away         54.3   47.8  16.8  26.9 -0.0641
 2 bullseye     54.3   47.8  16.8  26.9 -0.0686
 3 circle       54.3   47.8  16.8  26.9 -0.0683
 4 dino         54.3   47.8  16.8  26.9 -0.0645
 5 dots         54.3   47.8  16.8  26.9 -0.0603
 6 h_lines      54.3   47.8  16.8  26.9 -0.0617
 7 high_lines   54.3   47.8  16.8  26.9 -0.0685
 8 slant_down   54.3   47.8  16.8  26.9 -0.0690
 9 slant_up     54.3   47.8  16.8  26.9 -0.0686
10 star         54.3   47.8  16.8  26.9 -0.0630
11 v_lines      54.3   47.8  16.8  26.9 -0.0694
12 wide_lines   54.3   47.8  16.8  26.9 -0.0666
13 x_shape      54.3   47.8  16.8  26.9 -0.0656

Simpson’s Paradox

# Pooled fit: regress y on x alone, with no group term in the model.
lm(y~x, data=simpsons) |>
  summary()

Call:
lm(formula = y ~ x, data = simpsons)

Residuals:
    Min      1Q  Median      3Q     Max 
-38.988 -10.208  -0.707   9.874  42.642 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) -41.20220    3.51007  -11.74   <2e-16 ***
x             1.81324    0.06993   25.93   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 13.93 on 215 degrees of freedom
Multiple R-squared:  0.7577,    Adjusted R-squared:  0.7566 
F-statistic: 672.2 on 1 and 215 DF,  p-value: < 2.2e-16

Simpson’s Paradox Visually

Simpson’s Paradox with groups

Revised model

# x * group expands to x + group + x:group, so each group gets its own slope
# adjustment. The "- 1" drops the shared intercept, so every group level gets
# its own intercept coefficient (group1 ... group5 in the summary).
lm(y~x*group-1, data=simpsons) |>
  summary()

Call:
lm(formula = y ~ x * group - 1, data = simpsons)

Residuals:
     Min       1Q   Median       3Q      Max 
-15.4264  -0.6137   0.0811   1.0448   5.0613 

Coefficients:
          Estimate Std. Error t value Pr(>|t|)    
x         -0.62658    0.07987  -7.846 2.27e-13 ***
group1    32.50512    2.61640  12.424  < 2e-16 ***
group2    67.38858    3.47010  19.420  < 2e-16 ***
group3    99.63330    3.34565  29.780  < 2e-16 ***
group4   132.39316    4.76158  27.804  < 2e-16 ***
group5   146.36456    6.78530  21.571  < 2e-16 ***
x:group2  -0.38394    0.11747  -3.268 0.001267 ** 
x:group3  -0.36743    0.10440  -3.519 0.000532 ***
x:group4  -0.36425    0.11146  -3.268 0.001268 ** 
x:group5  -0.25654    0.12950  -1.981 0.048917 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.474 on 207 degrees of freedom
Multiple R-squared:  0.998, Adjusted R-squared:  0.9979 
F-statistic: 1.044e+04 on 10 and 207 DF,  p-value: < 2.2e-16

Designing effective visualizations

Gapminder

Keep it simple



Judging relative area

Use color to draw attention



Tell a story


Leave out non-story details



Ordering matters

Clearly indicate missing data


Reduce cognitive load


Use descriptive titles

Annotate figures


All of the data doesn’t tell a story

All of the data doesn’t tell a story

All of the data doesn’t tell a story

Chart Remakes / Makeovers

The Why Axis - Gender Gap

The Why Axis - BLS

Other Resources

  • Duke Library - Center for Data and Visualization Sciences - https://library.duke.edu/data/

  • Tidy tuesday - https://github.com/rfordatascience/tidytuesday

  • Twitter / Bluesky / Mastodon - #dataviz, #tidytuesday

  • Books:

    • Wickham, Navarro, Pedersen. ggplot2: Elegant Graphics for Data Analysis. 3rd edition. Faller, 2021.
    • Wilke. Fundamentals of Data Visualization. O’Reilly Media, 2019.
    • Healy. Data Visualization: A Practical Introduction. Princeton University Press, 2018.
    • Tufte. The visual display of quantitative information. 2nd edition. Connecticut Graphics Press, 2015.

Acknowledgments

Above materials are derived in part from the following sources: