Example code used in class discussion.
First, let’s load libraries to use data transformation and graphing functions.
library(tidyverse) # for data transformations and grammar of graphics
library(ggthemes) # for setting defaults on non-data ink
library(tidycensus) # for gathering US census county ids and populations
theme_set( theme_void() + theme(legend.position = "") )
Next, we’ll pull census data, including county ids and population thereof.
CENSUS_API_KEY <- Sys.getenv("CENSUS_API_KEY")
county_pop <- get_estimates(
geography = "county",
product = "population",
year = 2019,
key = CENSUS_API_KEY)
To set up a simulation study, we’ll simulate the rate of cancer by county:
data <-
county_pop %>%
pivot_wider(
names_from = "variable",
values_from = "value") %>%
left_join(county_laea) %>%
rowwise() %>%
mutate(
rate_cases = rbinom(n = 1, size = POP, prob = 0.01) / POP * 1000
)
Now, we’ll draw the county map:
ggplot() +
theme_void() +
theme(legend.position = "") +
geom_sf(
data = data,
mapping = aes(
geometry = geometry),
lwd = 0.2,
color = '#dddddd',
fill = NA
) +
geom_sf(
data = state_laea,
fill = NA
)
Encoding directly into the polygons of each county, we’ll encode the fill color of the highest 10 percent of rates:
ggplot() +
geom_sf(
data = data,
mapping = aes(
geometry = geometry,
fill = rate_cases > quantile(rate_cases, probs = 0.9)
),
lwd = 0.2,
color = '#dddddd'
) +
geom_sf(
data = state_laea,
fill = NA
) +
scale_fill_manual(
values = c("#ffffff", "#CD9DA7")
)
Now, we’ll do the same, but for those with the lowest rates:
ggplot() +
geom_sf(
data = data,
mapping = aes(
geometry = geometry,
fill = rate_cases < quantile(rate_cases, probs = 0.1)
),
lwd = 0.2,
color = '#dddddd'
) +
geom_sf(
data = state_laea,
fill = NA
) +
scale_fill_manual(
values = c("#ffffff", "#6695BC")
)
Let’s combine both encodings to compare where the lowest and highest counties are:
ggplot() +
scale_fill_identity() +
geom_sf(
data = data,
mapping = aes(
geometry = geometry,
fill = case_when(
rate_cases > quantile(rate_cases, probs = 0.9) ~ "#CD9DA7",
rate_cases < quantile(rate_cases, probs = 0.1) ~ "#6695BC",
TRUE ~ "#ffffff")
),
lwd = 0.2,
color = '#dddddd'
) +
geom_sf(
data = state_laea,
fill = NA
)
Let’s now consider where the highest populations are across counties:
ggplot() +
theme_void() +
geom_sf(
data = data,
mapping = aes(
geometry = geometry,
fill = POP ),
lwd = 0.2,
color = NA) +
geom_sf(
data = state_laea,
fill = NA,
color = '#ffffff',
lwd = 0.2
) +
scale_fill_gradient(
low = '#000000',
high = '#ffffff',
name = 'County Population\nin thousands',
trans = 'log10',
breaks = c(0, 1e3, 1e4, 1e5, 1e6, 1e7),
labels = c(0, "1", "10", "100", "1,000", "10,000")
) +
theme(
legend.position = 'left',
legend.text.align = 1,
legend.title = element_text(hjust = 0),
legend.key = element_rect(color = '#000000')
) +
guides(
fill = guide_legend(
title.position = "top",
title.hjust = 0.5,
label.position = "left"
)
)
Finally, let’s compare county population to cancer rate:
ggplot(data) +
theme_tufte(base_family = 'sans') +
geom_point(
mapping = aes(
x = POP,
y = rate_cases,
color = case_when(
rate_cases > quantile(rate_cases, probs = 0.9) ~ '#CD9DA7',
rate_cases < quantile(rate_cases, probs = 0.1) ~ '#6695BC',
TRUE ~ '#000000'
)
),
size = 1.5,
alpha = 0.6,
shape = 21
) +
geom_hline(
mapping = aes(
yintercept = quantile(rate_cases, probs = 0.9)
),
linetype = 'dotted',
) +
annotate(
'text',
x = 1e6,
y = quantile(data$rate_cases, probs = 0.9) + 0.2,
label = "↑ 90th Percentile",
hjust = 0,
vjust = 0,
color = '#CD9DA7'
) +
geom_hline(
mapping = aes(
yintercept = quantile(rate_cases, probs = 0.1)
),
linetype = 'dotted'
) +
annotate(
'text',
x = 1e6,
y = quantile(data$rate_cases, probs = 0.1) - 0.2,
label = "↓ 10th Percentile",
hjust = 0,
vjust = 1,
color = '#6695BC'
) +
scale_x_log10(
breaks = c(0, 1e3, 1e4, 1e5, 1e6, 1e7),
labels = c(0, "1", "10", "100", "1,000", "10,000")) +
scale_color_identity() +
theme(legend.position = '') +
labs(
x = "County populations in thousands",
y = "Age-adjusted rates of cancer per 1,000 people"
)
People can generally reason about things better when thinking discretely than when confronted with continuous information. Let’s consider how this plays out when reasoning about a distribution of information.
Let’s say we have a population exhibiting a log normal distribution with a log mean of log(11) and a log standard deviation of 0.2:
set.seed(8)
# population information
mu <- log(11)
sigma <- 0.2
Here’s the population distribution:
p1 <-
ggplot() +
stat_function(
fun = dlnorm,
args = c(meanlog = mu, sdlog = sigma),
xlim = c(0, 30),
geom = "density",
fill = "lightblue",
lwd = 0.1
)
p1
Now, to discretize the distribution, we might take samples. But, as shown below, the samples can be noisy, not visually appearing as the population distribution and inconsistent across samples. To see this, let’s create three experiments of 50 samples:
runs <- 3L
n <- 50L
d <-
tibble(
idx = rep( seq(runs), each = n ),
x = rlnorm( n = n * runs, mean = mu, sd = sigma )
)
p2 <-
ggplot(d) +
theme(strip.text = element_blank()) +
geom_dotplot(
mapping = aes(
x = x
),
fill = 'lightblue',
dotsize = 0.8,
stroke = 0.2
) +
facet_wrap(
~ idx,
nrow = 3
) +
scale_x_continuous(
name = 'x',
limits = c(0, 30)
) +
scale_y_continuous(
name = 'p(x)',
breaks = seq(0, 1, by = 0.25)
)
p2
Instead, we can represent the population discretely in another way.
First, I’ll show you manually doing everything. But there’s a function
in the package ggdist
that does this for you:
geom_dots()
.
q <-
tibble(
p_x =
seq(
from = 1 / n / 2,
to = 1 - (1 / n / 2),
length = n
),
x = qlnorm(
p = p_x,
meanlog = mu,
sdlog = sigma
)
)
# shape of the quantiles
p3 <-
ggplot(q) +
# draw lines from axes to CDF
geom_segment(
mapping = aes(
x = 0, xend = x,
y = p_x, yend = p_x
),
color = 'lightgray',
lwd = 0.2
) +
geom_segment(
mapping = aes(
x = x, xend = x,
y = p_x, yend = 0
),
color = 'lightgray',
lwd = 0.2
) +
# draw CDF from generated data
geom_line(
mapping = aes(
x = x,
y = p_x
),
color = 'darkred'
) +
# draw dotplots from generateed data
geom_dotplot(
mapping = aes(
x = x
),
dotsize = 0.8,
fill = 'lightblue',
stroke = 0.2
)
p3
Users can just count the dots above or below a threshold:
p4 <-
ggplot(q) +
geom_dotplot(
mapping = aes(
x = x,
fill = x < 8
),
dotsize = 0.8,
stroke = 0.2
) +
scale_x_continuous(limits = c(0, 18)) +
scale_fill_manual(
breaks = c(FALSE, TRUE),
values = c('lightblue', 'darkred'),
guide = "none")
p4
Here’s a function to do this for you:
library(ggdist)
ggplot(q) +
stat_dots(
mapping = aes(
x = x
),
dotsize = 0.8,
quantiles = 50,
fill = 'lightblue',
color = 'black',
lwd = 0.2
) +
scale_x_continuous(limits = c(0, 18))
The exact shape of a quantile dotplot will depend — like histograms do with bin width / size — on the parameters you specify.
If you see mistakes or want to suggest changes, please create an issue on the source repository.