03_covid19_preprints_published_CSHL.Rmd

---
title: "COVID-19 Preprints - CSHL"
output: github_document
---

Sourcing information on preprints that have been published as journal articles directly from bioRxiv/medRxiv

Script for querying bioRxiv/medRxiv re-used from [Preprinting a pandemic](https://github.com/preprinting-a-pandemic/pandemic_preprints) (see [preprint](https://doi.org/10.1101/2020.05.22.111294) on bioRxiv)

# Load required packages 

```{r message = FALSE, warning = FALSE}

library(lubridate)
library(tidyverse)
library(rvest)
library(httr)
library(rcrossref)
library(colorspace)

```


# Set sample date and retrieve the latest sample date for preprints

```{r message = FALSE, warning = FALSE}
# Date on which this analysis is run; used as the end date of the bioRxiv API
# query and in plot subtitles.
sample_date <- Sys.Date()

# Read the metadata file of the covid19_preprints dataset.
# `fromJSON()` is namespace-qualified because jsonlite is not attached by any
# of the library() calls in this document.
sample_date_preprints <- jsonlite::fromJSON(
  "https://raw.githubusercontent.com/nicholasmfraser/covid19_preprints/master/data/metadata.json"
)

# Keep only the `posted_date` field, converted to a Date
sample_date_preprints <- sample_date_preprints %>%
  .$posted_date %>%
  as.Date()

```


# Retrieve preprint metadata via bioRxiv API
Script adapted from [Preprinting a pandemic - preprint_details.Rmd](https://github.com/preprinting-a-pandemic/pandemic_preprints/blob/master/preprint_details.Rmd).

NB: Alternatively, one could query only the DOIs in `covid_preprints`.

```{r message = FALSE, warning = FALSE, cache = TRUE}
# See https://api.biorxiv.org for details
# Note that the API allows querying of both bioRxiv and medRxiv via the 
# 'server' parameter (although this is not documented)
max_results_per_page <- 100 # max allowable number of results per page
base_url <- "https://api.biorxiv.org/details/"
start <- "2020-01-01"
end <- sample_date

# Download all preprint records posted between `start` and `end` for one
# server.
#
# server: server name used in the API path (called below with "biorxiv" and
#         "medrxiv").
# Returns a list with one element per record returned by the API, i.e. the
# concatenated `collection` fields of all result pages.
getPreprintData <- function(server) {

  # Request the page of results starting at `cursor` and return the parsed
  # response body. Failed requests are retried up to 5 times.
  fetch_page <- function(cursor) {
    url <- paste0(base_url, server, "/", start, "/", end, "/", cursor)
    request <- httr::RETRY("GET", url, times = 5, pause_base = 1, pause_cap = 60)
    httr::content(request, as = "parsed")
  }

  # The first page also reports the total number of results
  first_page <- fetch_page(0)

  # Number of additional pages needed after the first one
  total_results <- first_page$messages[[1]]$total
  pages <- ceiling(total_results / max_results_per_page) - 1
  extra_pages <- max(pages, 0)

  # Preallocate one slot per page. Slots are filled with `[<-` and `list()` so
  # that a NULL collection cannot delete a slot.
  collections <- vector("list", extra_pages + 1)
  collections[1] <- list(first_page$collection)

  # seq_len() gives an empty loop when everything fits on the first page;
  # `1:pages` would iterate over c(1, 0) and fetch the first page a second time
  for (i in seq_len(extra_pages)) {
    # otherwise page 100000 becomes 1e05, which the api does not recognise
    cursor <- format(i * max_results_per_page, scientific = FALSE)
    collections[i + 1] <- list(fetch_page(cursor)$collection)

    Sys.sleep(1) # don't hit the API too hard
  }

  # Concatenate the per-page lists into one flat list of records
  do.call(c, collections)
}

# Query both servers; the result is a list with one list of records per server
servers <- c("biorxiv", "medrxiv")
preprint_data <- purrr::map(servers, getPreprintData)


```


```{r message = FALSE, warning = FALSE, cache = TRUE}

# Convert a single API record (a list) into a one-row tibble.
#
# item: list with fields `server`, `doi`, `date`, `version` and `published`.
# The string "NA" in `published` is treated as "no published version"; any
# other value is kept as the DOI of the published article.
parsePreprintData <- function(item) {
  published <- item$published
  has_published_version <- published != "NA"

  published_doi <- if (published == "NA") {
    NA_character_
  } else {
    published
  }

  tibble(
    source = item$server,
    doi = item$doi,
    posted_date = item$date,
    version = item$version,
    is_published = has_published_version,
    published_doi = published_doi
  )
}


# Parse the records of every server and stack them into a single data frame
parse_server_records <- function(records) {
  map_df(records, parsePreprintData)
}

preprints_all <- preprint_data %>%
  map_dfr(parse_server_records) %>%
  # Date and version info are not needed for this purpose, because the
  # information on published articles is added to all versions
  select(source, doi, is_published, published_doi) %>%
  distinct() %>%
  # Clean up DOIs (lower case, trimmed) for later matching
  mutate(
    doi = doi %>% str_to_lower() %>% str_trim(),
    published_doi = published_doi %>% str_to_lower() %>% str_trim()
  )
# Row counts noted from an earlier run:
# 75411 -> 59416 of which 14674 with published version
# NB confirmed 59416 unique dois


```

# Match to list of COVID19-preprints collected from Crossref

```{r message = FALSE, warning = FALSE}
# COVID-19 preprints collected from Crossref
preprints_crossref <- read_csv("data/covid19_preprints_published.csv")

# Add the bioRxiv/medRxiv publication info to the Crossref list, matching the
# Crossref `identifier` against the preprint DOI
preprints_matched <- left_join(
  preprints_crossref,
  preprints_all,
  by = c("identifier" = "doi")
) %>%
  # Both tables have a `source` column; keep the one from the Crossref list
  select(-`source.y`) %>%
  rename(
    source = `source.x`,
    # The CSHL suffix marks the columns taken from the bioRxiv/medRxiv API
    is_published_CSHL = is_published,
    published_doi_CSHL = published_doi
  ) %>%
  filter(source %in% c("bioRxiv", "medRxiv")) %>%
  distinct()

# Write the result to disk and read it back in
write_csv(preprints_matched, "data/covid19_preprints_published_CSHL.csv")

preprints_matched <- read_csv(
  "data/covid19_preprints_published_CSHL.csv"
)

```

# Check matches from Crossref with matches from bioRxiv/medRxiv directly

```{r message = FALSE, warning = FALSE}

# Count records per combination of source, Crossref link (`is_preprint_of`)
# and bioRxiv/medRxiv publication flag (`is_published_CSHL`)
match_check <- count(
  preprints_matched,
  source, is_preprint_of, is_published_CSHL
)

# Confirm that all Crossref matches are matched in CSHL as well

```


# Get published article metadata via Crossref (publication date, journal, publisher)

```{r message = FALSE, warning = FALSE, cache = TRUE}
 
# DOIs of published articles as reported by bioRxiv/medRxiv.
# Deduplicated so that each DOI is requested from Crossref only once; the
# parsed results are passed through distinct() afterwards, so duplicates
# would not add any rows.
published_dois <- preprints_matched %>%
  filter(!is.na(published_doi_CSHL)) %>%
  pull(published_doi_CSHL) %>%
  unique()

# Retrieve the Crossref work record for every DOI, as parsed lists
published_articles_data <- cr_works_(published_dois,
                                     parse = TRUE,
                                     .progress = "time")

# Convert one parsed Crossref work record into a one-row tibble.
#
# item: list whose `message` element holds the work metadata.
# `published_date` is taken from the record's `created` timestamp, and the
# journal is the first `container-title`, or NA when there is none.
parsePublishedArticleData <- function(item) {
  work <- item$message
  titles <- work$`container-title`

  journal <- if (length(titles) > 0) {
    titles[[1]]
  } else {
    NA_character_
  }

  tibble(
    published_doi = work$DOI,
    published_date = lubridate::date(work$created$`date-time`),
    published_journal = journal,
    published_publisher = work$publisher
  )
}

# Parse all Crossref records into one data frame, normalise the DOIs
# (lower case, trimmed) and drop duplicate rows
published_articles <- published_articles_data %>%
  map_df(parsePublishedArticleData) %>%
  mutate(published_doi = published_doi %>% str_to_lower() %>% str_trim()) %>%
  distinct()

# The raw responses and the DOI vector are no longer needed
rm(list = c("published_articles_data", "published_dois"))

```

# Merge preprints and published articles data


```{r message = FALSE, warning = FALSE}


# Attach the published-article metadata to each matched preprint and compute
# the number of days between preprint posting and article date
preprints_matched_time_to_publish <- preprints_matched %>%
  # Normalise the DOI in the same way as in `published_articles`
  mutate(
    published_doi_CSHL = published_doi_CSHL %>% str_to_lower() %>% str_trim()
  ) %>%
  left_join(
    published_articles,
    by = c("published_doi_CSHL" = "published_doi")
  ) %>%
  distinct() %>%
  mutate(
    delay_in_days = as.numeric(ymd(published_date) - ymd(posted_date))
  )

# Write the result to disk and read it back in
write_csv(
  preprints_matched_time_to_publish,
  "data/covid19_preprints_published_CSHL_time_to_publish.csv"
)

preprints_matched_time_to_publish <- read_csv(
  "data/covid19_preprints_published_CSHL_time_to_publish.csv"
)

```


# Visualizations

```{r message = FALSE, warning = FALSE}
# Default theme options: minimal theme with project-specific overrides
theme_overrides <- theme(
  text = element_text(size = 12),
  axis.text.x = element_text(angle = 90, vjust = 0.5),
  axis.title.x = element_text(margin = margin(20, 0, 0, 0)),
  axis.title.y = element_text(margin = margin(0, 20, 0, 0)),
  legend.key.size = unit(0.5, "cm"),
  legend.text = element_text(size = 8),
  plot.caption = element_text(
    size = 10, hjust = 0, color = "grey25",
    margin = margin(20, 0, 0, 0)
  )
)
theme_set(theme_minimal() + theme_overrides)

# Colour palette: the 10 `pals::tol` colours at two levels of lightening
tol_colors <- pals::tol(n = 10)
pal_1 <- colorspace::lighten(tol_colors, amount = 0.2)
pal_2 <- colorspace::lighten(tol_colors, amount = 0.4)
palette <- c(pal_1, pal_2)
```


```{r message = FALSE, warning = FALSE}
# For graphs per preprint server

# Vector of sources, ordered by descending number of preprints
source_count <- preprints_matched %>%
  # optional: filter(!is.na(is_preprint_of)) %>%
  count(source, sort = TRUE) %>%
  pull(source)

```

```{r message = FALSE, warning = FALSE}
# Set parameters for use in visualizations

# Smallest and largest delay (in days), for manually checking and setting
# chart parameters
delay_range <- range(
  preprints_matched_time_to_publish$delay_in_days,
  na.rm = TRUE
)
min_days <- delay_range[1]
max_days <- delay_range[2]

```


```{r message = FALSE, warning = FALSE}

# Weekly preprint counts - per preprint server

# Select source to display in graph (second entry of `source_count`)
var <- source_count[2]

# Status labels, used both for classifying records and for the factor levels
status_both <- paste0("linked to published journal article in Crossref and ", var)
status_server_only <- paste0("linked to published journal article in ", var, " only")
status_unlinked <- "not linked to published journal article"

# Create graph: stacked weekly counts of preprints by link status
p1 <- preprints_matched %>%
  filter(source == var) %>%
  mutate(
    status = case_when(
      !is.na(is_preprint_of) ~ status_both,
      is.na(is_preprint_of) & is_published_CSHL == TRUE ~ status_server_only,
      is.na(is_preprint_of) & is_published_CSHL == FALSE ~ status_unlinked,
      # label records getting error from biorxiv API as not linked
      is.na(is_preprint_of) & is.na(is_published_CSHL) ~ status_unlinked
    ),
    status = factor(
      status,
      levels = c(status_both, status_server_only, status_unlinked)
    ),
    # First day (Monday) of the week in which the preprint was posted
    posted_week = ymd(cut(posted_date,
                          breaks = "week",
                          start.on.monday = TRUE))
  ) %>%
  count(status, posted_week) %>%
  ggplot(aes(x = posted_week, y = n,
             fill = forcats::fct_rev(status))) +
  geom_col() +
  labs(
    x = "Posted Date (year-month)",
    y = "Preprints",
    fill = paste0("status (from Crossref and ", var, ")"),
    title = paste0("COVID-19 preprints per week on ", var),
    subtitle = paste0("(preprints up until ", sample_date_preprints,
                      ", sample date ", sample_date, ")")
  ) +
  scale_x_date(date_breaks = "1 month",
               date_labels = "%Y-%m",
               expand = c(0, 0),
               limits = c(ymd("2020-01-13"), ymd(sample_date_preprints))) +
  scale_fill_manual(values = c(palette[1], palette[3], palette[2]))

# Save as a separate step with an explicit `plot` argument: adding ggsave() to
# a plot with `+` fails in ggplot2 >= 3.3.4, where ggsave() returns the file
# name.
ggsave(
  filename = paste0(
    "outputs/figures/CSHL_comparison/covid19_preprints_published_compare_Crossref_",
    var,
    "_week.png"
  ),
  plot = p1,
  width = 12, height = 6
)
```


```{r message = FALSE, warning = FALSE}
# Time to publication for different preprint servers

# Boxplot with jittered points of the delay (in days) between preprint posting
# and article date, per server, split by where the link to the published
# article was found
p2 <- preprints_matched_time_to_publish %>%
  filter(!is.na(delay_in_days)) %>%
  mutate(covid_preprint = case_when(
    !is.na(is_preprint_of) ~ "linked in Crossref and on preprint server",
    is.na(is_preprint_of) ~ "linked on preprint server only"
  )) %>%
  filter(source %in% c("medRxiv", "bioRxiv")) %>%
  mutate(source = factor(source,
                         levels = c("medRxiv", "bioRxiv"))) %>%
  # optional: filter(delay_in_days > 0) %>%
  ggplot(aes(x = source, y = delay_in_days, color = covid_preprint)) +
  # Outliers are hidden in the boxplot because all points are drawn by
  # geom_jitter() below
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(aes(fill = factor(covid_preprint)),
              shape = 21, size = 0.7, alpha = 0.7,
              position = position_jitterdodge(jitter.width = 0.5)) +
  labs(x = "", y = str_wrap("Time to publication (days)", 30),
       fill = "", color = "") +
  scale_color_manual(values = palette[2:3]) +
  scale_fill_manual(values = palette[2:3]) +
  scale_y_continuous(breaks = seq(-30, 420, 30)) +
  theme(axis.text.x = element_text(size = 16,
                                   angle = 0,
                                   hjust = 0.5),
        axis.text.y = element_text(size = 14),
        axis.title.y = element_text(size = 14),
        legend.text = element_text(size = 10),
        legend.position = "right") +
  # "none" replaces the deprecated `fill = FALSE` for hiding a legend
  guides(fill = "none", colour = guide_legend(reverse = TRUE))

# Save as a separate step with an explicit `plot` argument: adding ggsave() to
# a plot with `+` fails in ggplot2 >= 3.3.4, where ggsave() returns the file
# name.
ggsave(
  filename = "outputs/figures/CSHL_comparison/days_to_publish_CSHL_comparison.png",
  plot = p2,
  width = 12, height = 6
)
```

``` {r}

# Percentage of preprints published

# Per source, the share of preprints linked to a published article, computed
# separately for links found in Crossref and links found on the preprint server
preprints_matched_percentage <- preprints_matched %>%
  mutate(
    `linked in crossref` = if_else(
      !is.na(is_preprint_of), "is_published", NA_character_
    ),
    # A missing flag gives NA, the same as an unpublished preprint
    `linked on preprint server` = if_else(
      is_published_CSHL == TRUE, "is_published", NA_character_
    )
  ) %>%
  pivot_longer(
    cols = c("linked in crossref", "linked on preprint server"),
    names_to = "status",
    values_to = "is_published"
  ) %>%
  count(source, status, is_published) %>%
  group_by(source, status) %>%
  mutate(
    prop = n * 100 / sum(n),
    total = sum(n)
  ) %>%
  ungroup() %>%
  # Keep only the "is_published" rows; the percentages above use all rows
  filter(!is.na(is_published)) %>%
  arrange(desc(total)) %>%
  mutate(
    # alternative label with counts: str_c(source, "\n(", total, ")")
    # Levels follow the row order, i.e. descending `total`
    source_label = fct_inorder(str_c(source)),
    data_label = str_c(round(prop, 1), "%")
  )

# Dodged bar chart of the percentage of preprints linked to a published
# article, per source and link origin, with the percentage printed above
# each bar
p3 <- preprints_matched_percentage %>%
  ggplot(aes(x = source_label, y = prop,
             width = .75)) +
  geom_col(aes(color = status, fill = status),
           size = 0.25,
           position = position_dodge(0.8)) +
  labs(x = "", y = "% linked to published journal article") +
  scale_color_manual(values = palette[2:3]) +
  scale_fill_manual(values = palette[2:3]) +
  geom_text(position = position_dodge2(width = 0.8),
            aes(label = data_label),
            size = 5.5,
            vjust = -1) +
  ylim(0, 40) +
  theme(axis.title.x = element_blank()) +
  # Axis titles and y-axis text are hidden; values are shown as bar labels
  theme(axis.text.x = element_text(size = 16, angle = 0, vjust = 1),
        axis.text.y = element_blank(),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        legend.text = element_text(size = 14),
        legend.position = "right")

# Pass the plot explicitly so the saved figure does not depend on which plot
# happens to be the most recently created one
ggsave(
  filename = "outputs/figures/CSHL_comparison/covid19_preprints_published_percentage_compare_Crossref_CSHL.png",
  plot = p3,
  width = 12, height = 6
)

```