02_covid19_published_articles.Rmd

---
title: "COVID-19 Preprints - published articles"
output: github_document
---
# Collect metadata of published articles linked to COVID-19 preprints

# Load required packages 

```{r message = FALSE, warning = FALSE}

library(lubridate)
library(rcrossref)
library(tidyverse)
library(ggalluvial)
```


# Get published article metadata via Crossref (publication date, journal, publisher)

```{r message = FALSE, warning = FALSE, cache = TRUE}

# Read the table of COVID-19 preprints; `preprint_of_doi` is the DOI of the
# published article linked to a preprint
covid_preprints_published <- 
  read_csv("data/covid19_preprints_published.csv")

# Collect the published-article DOIs to look up. Rows without a linked DOI
# are dropped, and duplicates are removed so that each DOI is requested from
# Crossref only once (the parsed results are de-duplicated later anyway, so
# the resulting table is unchanged).
published_dois <- covid_preprints_published %>%
  filter(!is.na(preprint_of_doi)) %>%
  distinct(preprint_of_doi) %>%
  pull(preprint_of_doi)

# Query Crossref for every DOI; parse = TRUE returns the responses as R
# lists, .progress = "time" shows a progress bar with a time estimate
published_articles_data <- cr_works_(published_dois, 
                                     parse = TRUE,
                                     .progress = "time")

# Turn one parsed Crossref response into a one-row tibble.
#
# item: a list whose `message` element holds the work record. Fields read:
#       DOI, created$`date-time`, `container-title` and publisher.
#
# Returns a tibble with the columns published_doi, published_date,
# published_journal and published_publisher. Fields that are absent from the
# record become NA rather than raising an error or silently dropping the
# column. An item without a record or without a DOI yields a zero-row tibble,
# so it adds no row with an NA `published_doi` to the combined table (dplyr
# joins match NA keys to each other by default).
parsePublishedArticleData <- function(item) {
  record <- item$message

  if (is.null(record) || is.null(record$DOI)) {
    return(tibble(
      published_doi = character(),
      published_date = as.Date(character()),
      published_journal = character(),
      published_publisher = character()
    ))
  }

  created <- record$created$`date-time`
  journal <- record$`container-title`
  publisher <- record$publisher

  tibble(
    published_doi = record$DOI,
    # date part of the `created` timestamp of the Crossref record
    published_date = if (is.null(created)) {
      as.Date(NA)
    } else {
      lubridate::date(created)
    },
    # only the first container title is kept
    published_journal = if (length(journal) > 0) {
      journal[[1]]
    } else {
      NA_character_
    },
    published_publisher = if (is.null(publisher)) {
      NA_character_
    } else {
      publisher
    }
  )
}

# Parse every Crossref response into a row, stack the rows, normalise the
# DOI (lower case, surrounding whitespace removed) and drop duplicate rows
published_articles <- published_articles_data %>%
  map_df(parsePublishedArticleData) %>%
  mutate(published_doi = published_doi %>%
           str_to_lower() %>%
           str_trim()) %>%
  distinct()

# The raw responses and the DOI vector are no longer needed
rm(published_articles_data, published_dois)

```

# Merge preprints and published articles data


```{r message = FALSE, warning = FALSE}


# Lower-case and trim the linked DOI in the preprint table, then attach the
# published-article metadata by matching it against `published_doi`
covid_preprints_time_to_publish <- left_join(
  mutate(covid_preprints_published,
         preprint_of_doi = str_trim(str_to_lower(preprint_of_doi))),
  published_articles,
  by = c("preprint_of_doi" = "published_doi")
)

# Number of days between posting of the preprint and the published date
covid_preprints_time_to_publish <- mutate(
  covid_preprints_time_to_publish,
  delay_in_days = as.numeric(ymd(published_date) - ymd(posted_date))
)

# Store the merged table on disk ...
write_csv(covid_preprints_time_to_publish,
          "data/covid19_preprints_time_to_publish.csv")

# ... and read it back in for the rest of the notebook
covid_preprints_time_to_publish <-
  read_csv("data/covid19_preprints_time_to_publish.csv")

```

# Visualizations

```{r message = FALSE, warning = FALSE}
# Default theme for all figures: theme_minimal() with small text, a thin
# light-grey panel border and compact legend keys
theme_set(
  theme_minimal() +
    theme(
      text = element_text(size = 10),
      plot.title = element_text(size = 10),
      plot.margin = margin(t = 5, r = 5, b = 5, l = 5),
      panel.border = element_rect(fill = NA,
                                  color = "#E0E0E0",
                                  size = 0.5),
      axis.title.x = element_text(size = 10,
                                  margin = margin(t = 5, r = 0, b = 5, l = 0)),
      axis.title.y = element_text(size = 10,
                                  margin = margin(t = 0, r = 5, b = 0, l = 5)),
      axis.text.x = element_text(size = 8),
      axis.text.y = element_text(size = 8),
      legend.key.size = unit(0.5, "cm"),
      legend.text = element_text(size = 8)
    )
)

# Palette for the alluvial plot: 10 colours from pals::tol()
pal_3 <- pals::tol(n = 10)

# General palette: the same 10 colours lightened by 0.2, followed by the
# same 10 colours lightened by 0.4 (20 colours in total)
pal_1 <- colorspace::lighten(pal_3, amount = 0.2)
pal_2 <- colorspace::lighten(pal_3, amount = 0.4)
palette <- c(pal_1, pal_2)


```

```{r message = FALSE, warning = FALSE}
# Parameters used by the visualizations

# Smallest and largest publication delay; inspect these manually when
# choosing the chart ranges
min_days <- min(covid_preprints_time_to_publish[["delay_in_days"]],
                na.rm = TRUE)
max_days <- max(covid_preprints_time_to_publish[["delay_in_days"]],
                na.rm = TRUE)

# Names of all sources containing "OSF", most frequent first; these are
# grouped together as "OSF" below
OSF_names <- covid_preprints_time_to_publish %>%
  filter(str_detect(source, "OSF")) %>%
  count(source, sort = TRUE) %>%
  pull(source)

# Servers with at least one non-missing `is_preprint_of` value, ordered by
# their number of non-missing `identifier` values (descending); only the
# first seven are kept
servers_selected <- covid_preprints_time_to_publish %>%
  mutate(source = if_else(source %in% OSF_names, "OSF", source)) %>%
  group_by(source) %>%
  # per server: number of non-missing values in every column
  summarise_all(~ sum(!is.na(.))) %>%
  ungroup() %>%
  filter(is_preprint_of > 0) %>%
  arrange(desc(identifier)) %>%
  head(7) %>%
  pull(source)

```

```{r message = FALSE, warning = FALSE, include = FALSE}
# Publishing timeline: percentage of published preprints per 30-day bracket
# of time between preprint posting and publication

p1 <- covid_preprints_time_to_publish %>%
  # constant column so the single series can be mapped to fill and colour
  mutate(covid_preprint = "covid_preprint") %>%
  filter(!is.na(delay_in_days)) %>%
  # 30-day brackets between -180 and 390 days, labelled by their lower bound
  mutate(pub_bracket = cut(as.numeric(delay_in_days), 
                           seq(-180, 390, by = 30), 
                           labels = seq(-180, 360, by = 30))) %>%
  group_by(covid_preprint) %>% 
  count(pub_bracket) %>%
  # percentages are computed over all rows with a delay; rows falling outside
  # the brackets (pub_bracket is NA) are only removed afterwards
  mutate(prop = n*100 / sum(n)) %>%
  filter(!is.na(pub_bracket)) %>%
  ggplot(aes(x = pub_bracket, 
             y = prop, 
             fill = covid_preprint, 
             color = covid_preprint
             )) +
  geom_bar(alpha = 0.25, width = 1, size = 0.25, 
           stat = "identity", position = "identity") +
  labs(x = "Time from preprint posting to publication (days)",
       y = "% of published preprints") +
  scale_color_manual(values = palette[2]) +
  scale_fill_manual(values = palette[2]) +
  theme(legend.position = "none")

# Save with an explicit call: adding ggsave() to a plot with `+` is not
# supported by current ggplot2 releases
ggsave("outputs/figures/days_to_publish/days_to_publish_all.png",
       plot = p1, width = 12, height = 6)

```

```{r message = FALSE, warning = FALSE, include = FALSE}
# Time to publication for different preprint servers: one box plot per
# selected server with the individual preprints overlaid as jittered points

p2 <- covid_preprints_time_to_publish %>%
  # constant column so the single series can be mapped to fill and colour
  mutate(covid_preprint = "covid_preprint") %>%
  # group all OSF-hosted servers under one label
  mutate(source = case_when(
    source %in% OSF_names ~ "OSF",
    TRUE ~ source)) %>%
  #filter(!is.na(delay_in_days)) %>%
  filter(source %in% servers_selected) %>%
  # order servers by frequency; this is done before rows without a delay are
  # removed, so the order reflects all rows of the selected servers
  mutate(source = factor(source),
         source = forcats::fct_infreq(source)) %>%
  filter(!is.na(delay_in_days)) %>%
  #filter(delay_in_days > 0) %>%
  ggplot(aes(x = source, y = delay_in_days, color = covid_preprint)) +
  # outliers are hidden in the box plot because every point is drawn below
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(aes(fill = factor(covid_preprint)),
              shape = 21, size = 0.7, alpha = 0.7,
              position = position_jitterdodge(jitter.width = 0.5)) + 
  labs(x = "", 
       y = str_wrap("Time to publication (days)", 30), 
       fill = "", color = "") +
  scale_color_manual(values = palette[2]) +
  scale_fill_manual(values = palette[2]) +
  scale_y_continuous(breaks = seq(-180, 390, 30)) +
  theme(axis.text.x = element_text(size = 16, 
                                   angle = 0, 
                                   hjust = 0.5),
        axis.text.y = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        #axis.title.y = element_blank(),
        legend.position = "top") +
  # "none" replaces the deprecated FALSE for switching a guide off
  guides(fill = "none", colour = "none")

# Save with an explicit call: adding ggsave() to a plot with `+` is not
# supported by current ggplot2 releases
ggsave("outputs/figures/days_to_publish/days_to_publish_sources.png",
       plot = p2, width = 12, height = 6)
```

```{r message = FALSE, warning = FALSE, include = FALSE}

# Data for the alluvial diagram: one row per preprint with a non-missing
# `is_preprint_of`, holding its server and the publisher of the article

covid_preprints_destination <- covid_preprints_time_to_publish %>%
  filter(!is.na(is_preprint_of)) %>%
  select(source, published_publisher) %>%
  # first group all OSF-hosted servers under one label ...
  mutate(source = if_else(source %in% OSF_names, "OSF", source)) %>%
  # ... then lump every server that was not selected into "other"
  mutate(source = if_else(source %in% servers_selected, source, "other"))

# Lookup table from the publisher name in the data to the short label shown
# in the plot, one row per distinct publisher (most frequent first).
# "JMIR " and "other " carry a trailing space on purpose to distinguish them
# from the source names "JMIR" and "other"; this is a lazy shortcut that
# enables ordering of the levels in the plot.
publisher <- covid_preprints_destination %>%
  count(published_publisher, sort = TRUE) %>%
  transmute(
    published_publisher,
    publisher = case_when(
      is.na(published_publisher) ~ "not retrieved",
      published_publisher == "JMIR Publications Inc." ~ "JMIR ",
      published_publisher == "Springer Science and Business Media LLC" ~
        "Springer Nature",
      published_publisher == "MDPI AG" ~ "MDPI",
      published_publisher == "American Chemical Society (ACS)" ~ "ACS",
      # mappings currently not shown separately:
      #published_publisher == "Elsevier BV" ~ "Elsevier",
      #published_publisher == "Wiley" ~ "Wiley",
      #published_publisher == "Frontiers Media SA" ~ "Frontiers",
      #published_publisher == "Oxford University Press (OUP)" ~ "OUP",
      #published_publisher == "American Society for Microbiology" ~ "ASM",
      #published_publisher == "Informa UK Limited" ~ "T&F",
      TRUE ~ "other "
    )
  )

# Add the short publisher label to every row. The join key is spelled out so
# the join does not depend on whichever column names the two tables happen
# to share.
covid_preprints_destination <- covid_preprints_destination %>%
  left_join(publisher, by = "published_publisher")

# Server and publisher names, manually ordered for use in the plot.
# NB Take care to use "JMIR " and "other " (with trailing space) in
# destination_levels.

# for origin: the selected servers plus "other"; values of `source` missing
# from this vector become NA when converted to a factor below
origin_levels <- c("JMIR",
                   "Research Square",
                   "Preprints.org",
                   "ChemRxiv",
                   "medRxiv",
                   "bioRxiv",
                   "OSF",
                   "other")

destination_levels <- c("JMIR ",
                        "Springer Nature",
                        "MDPI",
                        "ACS",
                        "other ",
                        "not retrieved") 

# Alluvial plot of preprint server (left axis) against publisher (right
# axis); the width of each flow is the number of preprints
p3 <- covid_preprints_destination %>%
  mutate(origin = factor(source, levels = origin_levels),
         destination = factor(publisher, levels = destination_levels)) %>%
  # number of preprints per server/publisher combination
  group_by(origin, destination) %>%
  summarize(freq = n()) %>%
  ungroup() %>%
  #filter(freq >= 5) %>%
  ggplot(aes(axis1 = origin, 
             axis2 = destination, 
             y = freq)) +
  scale_x_discrete(limits = c("Preprint server", "Publisher"), 
                   expand = c(0, 0),
                   position = "top") +
  scale_y_continuous(expand = c(0, 0)) +
  #labs(title = paste0("Destination of preprints with links to published papers (n=",nrow(covid_preprints_destination),")")) +
  geom_alluvium(aes(fill = origin), reverse = TRUE) +
  geom_stratum(reverse = TRUE, color = "grey50") + 
  geom_text(aes(label = after_stat(stratum)), size = 8, 
            stat = "stratum", 
            reverse = TRUE, color = "grey25") +
  # "none" replaces the deprecated FALSE for switching a guide off
  guides(fill = "none") +
  theme(plot.title = element_text(size = 20),
        axis.text.x = element_text(size = 24, face = "bold"),
        axis.title.y = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) +
  scale_fill_manual(values = pal_3)

# Save with an explicit call: adding ggsave() to a plot with `+` is not
# supported by current ggplot2 releases
ggsave("outputs/figures/destination/covid19_preprints_destination.png",
       plot = p3, width = 12, height = 18)

#NB Alternative (non-lazy) approach to accommodate identical levels in origin and destination: transform data into long format and set one level vector
#see https://stackoverflow.com/questions/48346546/ggalluvial-ordering-stratum (this also uses geom_flow instead of geom_alluvium)
```

```{r message = FALSE, warning = FALSE, include = FALSE}

# Create empty figure for table layout in Readme

# The blank plot is passed to ggsave() explicitly: adding ggsave() to a plot
# with `+` is not supported by current ggplot2 releases
ggsave("outputs/figures/destination/empty.png",
       plot = ggplot() + theme_void(),
       width = 6,
       height = 18)

```


# Calculate means for Readme file

```{r}

# Mean publication delay (days) over all preprints from the selected servers
# that have a delay; OSF-hosted servers are grouped as "OSF" first.
# Re-enable the group_by() line to get one mean per server.
count <- covid_preprints_time_to_publish %>%
  mutate(source = if_else(source %in% OSF_names, "OSF", source)) %>%
  filter(source %in% servers_selected,
         !is.na(delay_in_days)) %>%
  #group_by(source) %>%
  summarize(mean = mean(delay_in_days))


```