Skip to content

Latest commit

 

History

History
executable file
·
177 lines (145 loc) · 6.75 KB

likert-sankey-dotplot.md

File metadata and controls

executable file
·
177 lines (145 loc) · 6.75 KB

Likert Sankey Dotplot

Matthew Kay 2024-10-14

A quick riff on Matt Cooper and Zac Dempsey’s post on using Sankey diagrams to display Likert data.

Setup

Data

Data from the original post:

set.seed(123) # For reproducibility

# dat_i is the intervention group
n <- 183 # Set the number of individuals
def <- defData(varname = "pre", formula = "1;5", dist = "uniformInt") # Pre values: uniformly distributed between 1 and 5
dat_i <- genData(n, def)
group_probs <- c(0.45, 0.45, 0.10)
dat_i$grp <- sample(1:3, n, replace = TRUE, prob = group_probs)
dat_i$post <- dat_i$pre

dat_i$post[dat_i$grp == 2] <- pmin(dat_i$pre[dat_i$grp == 2] + (rbinom(sum(dat_i$grp == 2), 2, 0.2) + 1), 5) # Increase by 1, max 5
dat_i$post[dat_i$grp == 3] <- pmax(dat_i$pre[dat_i$grp == 3] - (rbinom(sum(dat_i$grp == 3), 2, 0.2) + 1), 1) # Decrease by 1, min 1
# dat_c is the control group
n <- 154 # Set the number of individuals
def <- defData(varname = "pre", formula = "1;5", dist = "uniformInt") # Pre values: uniformly distributed between 1 and 5
dat_c <- genData(n, def)
group_probs <- c(0.55, 0.25, 0.20)
set.seed(123) # For reproducibility
dat_c$grp <- sample(1:3, n, replace = TRUE, prob = group_probs)
dat_c$post <- dat_c$pre

dat_c$post[dat_c$grp == 2] <- pmin(dat_c$pre[dat_c$grp == 2] + (rbinom(sum(dat_c$grp == 2), 2, 0.2) + 1), 5) # Increase by 1, max 5
dat_c$post[dat_c$grp == 3] <- pmax(dat_c$pre[dat_c$grp == 3] - (rbinom(sum(dat_c$grp == 3), 2, 0.2) + 1), 1) # Decrease by 1, min 1

dat <- rbind(cbind(dat_i, group = "Intervention"), 
             cbind(dat_c, group = "Control")) %>% 
  mutate(post = as.integer(post)) %>% 
  select(-grp)

response_levels = c("Less than one cup/day", "About 1-2 cups/day", "About 3-4 cups/day", "About 5-6 cups/day","More than 6 cups/day")
dat <- dat %>% 
  mutate(
    pre_l = factor(pre, 1:5, labels = response_levels),
    post_l = factor(post, 1:5, labels = response_levels)
  )

head(dat, n = 8)
##       id   pre  post        group                 pre_l                post_l
##    <int> <int> <int>       <char>                <fctr>                <fctr>
## 1:     1     2     3 Intervention    About 1-2 cups/day    About 3-4 cups/day
## 2:     2     4     4 Intervention    About 5-6 cups/day    About 5-6 cups/day
## 3:     3     3     3 Intervention    About 3-4 cups/day    About 3-4 cups/day
## 4:     4     5     5 Intervention  More than 6 cups/day  More than 6 cups/day
## 5:     5     5     5 Intervention  More than 6 cups/day  More than 6 cups/day
## 6:     6     1     1 Intervention Less than one cup/day Less than one cup/day
## 7:     7     3     2 Intervention    About 3-4 cups/day    About 1-2 cups/day
## 8:     8     5     5 Intervention  More than 6 cups/day  More than 6 cups/day
# head(dat, 5) %>%
  # thekids_table(colour = "Saffron", padding = 3)

Sankey plot from the original post:

max_prop <- dat %>%
  select(id, group, pre, post) %>% 
  pivot_longer(cols = c(pre, post)) %>% 
  group_by(group, name) %>%
  count(value) %>%
  mutate(freq = n / sum(n)) %>% 
  .$freq %>%
  max

max_prop <- plyr::round_any(max_prop, 0.05, f = ceiling)

p_sankey <- dat %>% 
  filter(group == "Intervention") %>% 
  rename(Pre = pre,
         Post = post) %>% 
  make_long(Pre, Post) %>% 
  mutate(node = factor(node, levels = c(7,6,5,4,3,2,1)),
         next_node = factor(next_node, levels = c(7,6,5,4,3,2,1))) %>% 
  ggplot(aes(x = x, 
             next_x = next_x, 
             node = node, 
             next_node = next_node,
             fill = factor(node))) +
  geom_sankey(alpha = 0.7,
              node.color = 'black') +
  geom_sankey_label(aes(label = node), alpha = 0.75,
                    size = 3, color = "black", fill = "gray80") +
  scale_fill_viridis_d(option = "plasma", end = 0.85, guide = "none") +
  scale_x_discrete(expand = c(0.05,0.05)) +
  scale_y_continuous(breaks = NULL) +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.title.y=element_blank(),
        axis.text.y=element_blank(),
        axis.ticks=element_blank(),
        legend.position = "bottom",
        plot.title = element_text(hjust = 0.5)) +
  labs(title = "Pre-post",
       fill = "Pre",
       x = "")

p_sankey

Dotplots + Sankey

Two thoughts occured to me when looking at the original post:

  1. Because the histograms are displayed side-by-side, having the main axis be the x axis makes it hard to compare the pre and post distributions. This could be fixed either by stacking the histograms vertically or by rotating the histograms. If you do the latter, you can also mirror one of them to do a more pyramid-style approach.

  2. Dotplots / waffle plots would allow you to show individual datapoints, seeing proportions and raw counts simultaneously, and also have a consistent coloring by the pre value.

Something like this:

p_pre = dat |>
  filter(group == "Intervention") |>
  ggplot() +
  # ordinarily I would not set binwidth manually here (and let geom_dots find the optimal binwidth),
  # but because we have to use geom_dots in two different charts we need to pick a shared binwidth
  # to keep things consistent. Can use geom_dots(verbose = TRUE) without a binwidth to find one.
  geom_dots(aes(group = NA, fill = pre_l, y = forcats::fct_rev(pre_l)), layout = "bar", side = "left", color = NA, binwidth = 0.16, scale = 1) +
  scale_fill_viridis_d(option = "plasma", end = 0.85, direction = -1, guide = "none") +
  scale_x_continuous(breaks = NULL) +
  labs(y = NULL, x = NULL)

p_post = dat |>
  filter(group == "Intervention") |>
  ggplot() +
  geom_dots(aes(group = NA, fill = pre_l, y = forcats::fct_rev(post_l), order = forcats::fct_rev(pre_l)), layout = "bar", side = "right", color = NA, binwidth = 0.16, scale = 1) +
  scale_fill_viridis_d(option = "plasma", end = 0.85, direction = -1, guide = "none") +
  scale_y_discrete(position = "right") +
  scale_x_continuous(breaks = NULL) +
  labs(y = NULL, x = NULL)

p_pre + p_sankey + p_post

Although…

Although if in the end what we care about is post-pre differences, perhaps dotplots of that difference would be helpful:

dat |> 
  ggplot(aes(x = post - pre, y = group)) +
  geom_dots(layout = "bar", subguide = subguide_integer(breaks = scales::breaks_width(2))) +
  scale_y_discrete(expand = expansion(add = 0)) +
  scale_x_continuous(expand = expansion(add = 0.5)) +
  geom_hline(yintercept = 2, color = "gray75", linewidth = 0.25)