fluency22-analyses.Rmd

---
title: "fluencycogsci22_project"
output:
  pdf_document: default
  html_document: default
---

# import libraries

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
#options(warn=-1)
library(tidyverse)
library(tidyboot)
library(ggplot2)
library(ggthemes)
library(broom)
library(lme4)
```

This Rmd file contains the script for analyzing the fluency data. We first analyze the behavioral patterns and then compare the different foraging models. To reproduce the analyses, you can use the .csv files provided in the git repository, or obtain the csvs from the Jupyter notebook and then proceed to analyze the data via the scripts in this Rmd notebook.

# load data

First we load the precomputed behavioral data from the local `data/` directory.

```{r}
# Raw responses; used below to look up which dataset each subject belongs to.
raw_data <- read.csv("data/raw_data.csv")

# Precomputed per-response metrics, tagged with the subject's dataset and a
# running response index within each subject.
# Note: the result stays grouped by `subject`.
fluency_data <- read.csv("data/cogsci2022-metrics.csv") %>%
  left_join(distinct(raw_data, dataset, subject)) %>%
  group_by(subject) %>%
  mutate(response_number = seq_len(n())) %>%
  rename(response = item)
```

# figure 1

```{r}
# Example excerpt for figure 1: five hard-coded rows of the metrics table,
# turned into four successive transitions labelled "previous-current".
t1plot <- fluency_data[1649:1653, ] %>%
  select(response_number, response, phonology, semantic) %>%
  mutate(
    r2 = lag(response),
    response = paste(r2, response, sep = "-")
  ) %>%
  # the first row has no predecessor, so it forms no transition
  filter(!is.na(r2)) %>%
  mutate(response_number = factor(1:4)) %>%
  pivot_longer(cols = phonology:semantic, names_to = "similarity") %>%
  mutate(similarity = fct_relevel(similarity, "semantic", "phonology")) %>%
  rename(`retrieval order` = response_number)

# One line per similarity type, each point labelled with its word pair.
ggplot(
  t1plot,
  aes(
    x = `retrieval order`,
    y = value,
    group = similarity,
    color = similarity
  )
) +
  geom_line(aes(linetype = similarity), size = 1) +
  geom_point(size = 10) +
  geom_label(
    aes(label = response),
    label.size = NA,
    nudge_x = -.33,
    nudge_y = .03,
    size = 8
  ) +
  labs(y = "similarity") +
  coord_cartesian(ylim = c(-0.01, 0.7)) +
  scale_color_calc() +
  theme_classic() +
  theme(
    axis.title = element_text(size = rel(2)),
    legend.position = "none",
    plot.title = element_text(hjust = .5),
    axis.line = element_line(size = 0.2),
    strip.text.x = element_text(size = rel(2.5)),
    axis.text.x = element_text(size = rel(1))
  )


```

# behavioral results

## response statistics

```{r}
## number of items produced: count per subject, then mean/sd per dataset
fluency_data %>%
  group_by(dataset, subject) %>%
  tally(name = "items") %>%
  group_by(dataset) %>%
  summarise(
    mean_items = mean(items),
    sd_items = sd(items)
  )

## mean similarities per dataset
## rows with phonology equal to -999 or 1, and each subject's first response,
## are left out
fluency_data %>%
  filter(
    phonology != -999,
    phonology != 1,
    response_number > 1
  ) %>%
  group_by(dataset) %>%
  summarise(
    mean_sem = mean(semantic, na.rm = TRUE),
    sd_sem = sd(semantic, na.rm = TRUE),
    mean_phon = mean(phonology, na.rm = TRUE),
    sd_phon = sd(phonology, na.rm = TRUE)
  )
```

## cue usage over time

```{r}
## mean semantic and phonological similarity at each retrieval position,
## in long format so both can be drawn in one plot
order_data <- fluency_data %>%
  filter(phonology != -999, phonology != 1) %>%
  group_by(response_number) %>%
  summarise(across(c(phonology, semantic), mean)) %>%
  rename(phonological = phonology) %>%
  pivot_longer(cols = phonological:semantic, names_to = "similarity") %>%
  mutate(similarity = fct_relevel(similarity, "semantic", "phonological")) %>%
  rename(`retrieval order` = response_number)
```
### figure 2
```{r}
# Figure 2: mean similarity by retrieval position with a linear fit per
# similarity type.
fig2 <- ggplot(
  order_data,
  aes(
    x = `retrieval order`,
    y = value,
    group = similarity,
    color = similarity
  )
) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm") +
  labs(y = "similarity") +
  scale_color_calc() +
  theme_few() +
  theme(
    axis.title = element_text(size = rel(2)),
    legend.title = element_text(face = "bold", size = rel(2)),
    legend.text = element_text(size = rel(2)),
    plot.title = element_text(hjust = .5),
    strip.text.x = element_text(size = rel(2.5)),
    axis.text.x = element_text(size = rel(1))
  )

ggsave(filename = "plots/fig2.pdf", plot = fig2, width = 8, height = 6)
```

### model
```{r}
# Response-level data in long format: one row per response and similarity type.
order_model_data <- fluency_data %>%
  filter(phonology != -999, phonology != 1) %>%
  select(subject, response_number, phonology, semantic) %>%
  rename(phonological = phonology) %>%
  pivot_longer(cols = phonological:semantic, names_to = "similarity") %>%
  mutate(similarity = fct_relevel(similarity, "semantic", "phonological"))

# Similarity as a function of retrieval position, similarity type and their
# interaction, with the same terms varying by subject.
# Singular-fit checks are set to "ignore".
ordermodel <- lmerTest::lmer(
  value ~ response_number * similarity +
    (response_number * similarity | subject),
  data = order_model_data,
  control = lmerControl(
    optimizer = "bobyqa",
    check.conv.singular = .makeCC(action = "ignore", tol = 1e-4)
  )
)
summary(ordermodel)
```

## cues and retrieval performance

```{r}
## data: items produced per subject joined with that subject's mean
## similarities (first response excluded), in long format
## NOTE(review): unlike the other summaries, rows with phonology equal to
## -999 or 1 are not filtered out here -- confirm this is intended.
items_produced <- fluency_data %>%
  group_by(dataset, subject) %>%
  summarise(items = n()) %>%
  left_join(
    fluency_data %>%
      filter(response_number > 1) %>%
      group_by(dataset, subject) %>%
      summarise(across(c(phonology, semantic), ~ mean(.x, na.rm = TRUE)))
  ) %>%
  # shift phonological similarity by 0.3 for plotting; the model chunk
  # subtracts it again
  mutate(phonology = phonology + 0.3) %>%
  rename(phonological = phonology) %>%
  pivot_longer(cols = phonological:semantic, names_to = "similarity") %>%
  mutate(similarity = fct_relevel(similarity, "semantic", "phonological"))
```

### figure 3

```{r}
# Figure 3: items produced against mean similarity, one linear fit per
# similarity type.
fig3 <- ggplot(
  items_produced,
  aes(x = value, y = items, color = similarity, group = similarity)
) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm") +
  labs(x = "similarity") +
  scale_color_calc() +
  theme_few() +
  theme(
    axis.title = element_text(size = rel(2)),
    legend.title = element_text(face = "bold", size = rel(2)),
    legend.text = element_text(size = rel(2)),
    plot.title = element_text(hjust = .5),
    strip.text.x = element_text(size = rel(2.5)),
    axis.text.x = element_text(size = rel(1))
  )

ggsave(filename = "plots/fig3.pdf", plot = fig3, width = 8, height = 6)
```
### model
```{r}
## undo the +0.3 plotting shift on phonological similarity before modelling
## (this overwrites items_produced, so run this chunk only once per session)
items_produced <- items_produced %>%
  mutate(value = if_else(similarity == "phonological", value - 0.3, value))

# Mean similarity as a function of items produced, similarity type and their
# interaction, with a random intercept per subject.
itemmodel <- lmerTest::lmer(
  value ~ items * similarity + (1 | subject),
  data = items_produced,
  control = lmerControl(
    optimizer = "bobyqa",
    check.conv.singular = .makeCC(action = "ignore", tol = 1e-4)
  )
)
summary(itemmodel)
```

## clusters and switches

### figure 4
```{r}

# Per-subject mean similarity of transitions, split by switch-designation
# method and by whether the method coded the transition 0 ("cluster") or not
# ("switch"). LEA dataset only.
switchplotdata <- fluency_data %>%
  filter(phonology != -999 & phonology != 1) %>%
  filter(dataset == "LEA") %>%
  select(dataset, subject, phonology, semantic,
         participant, simdrop, troyer) %>%
  pivot_longer(names_to = "switch_method",
               cols = c("simdrop", "participant", "troyer")) %>%
  filter(!is.na(value)) %>%
  group_by(dataset, subject, switch_method, value) %>%
  summarise_at(vars(phonology, semantic), mean, na.rm = TRUE) %>%
  # Drop the grouping before recoding: under a grouped mutate each group holds
  # a single switch method, so fct_recode()/fct_relevel() warn about unknown
  # levels and the requested level order is lost when groups are recombined.
  ungroup() %>%
  rename(switch = value) %>%
  pivot_longer(names_to = "similarity", cols = phonology:semantic) %>%
  mutate(designation = ifelse(switch == 0, "cluster", "switch"),
         `switch method` = fct_recode(switch_method,
                                      `participant\ndesignated` = "participant",
                                      `similarity\ndrop` = "simdrop",
                                      `Troyer\nnorms` = "troyer"),
         similarity = fct_recode(similarity,
                                 `phonological similarity` = "phonology",
                                 `semantic similarity` = "semantic"),
         similarity = fct_relevel(similarity, "semantic similarity",
                                  "phonological similarity"),
         `switch method` = fct_relevel(`switch method`, "Troyer\nnorms",
                                       "similarity\ndrop",
                                       "participant\ndesignated"))

# Axis label formatter: two decimal places.
scaleFUN <- function(x) sprintf("%.2f", x)

# Dodged bar plot of the bootstrapped mean (with CI error bars) of `value` for
# each switch method x designation cell.
#   data        rows of switchplotdata for a single similarity type
#   panel_title plot title
#   x_label     x-axis label
#   y_scale     the scale_y_continuous() layer to use for this panel
plot_switch_bars <- function(data, panel_title, x_label, y_scale) {
  data %>%
    group_by(`switch method`, designation, similarity) %>%
    summarise(ci = list(mean_cl_boot(value) %>%
                          rename(mean = y, lwr = ymin, upr = ymax))) %>%
    # name the list-column explicitly; calling unnest() without `cols` is
    # deprecated
    unnest(cols = ci) %>%
    ggplot(aes(x = `switch method`, y = mean,
               color = designation, fill = designation)) +
    geom_bar(stat = "identity", position = "dodge", width = 0.7,
             color = "black") +
    geom_errorbar(aes(ymin = lwr, ymax = upr), size = 0.5, width = .1,
                  color = "black", position = position_dodge(0.75)) +
    theme_few() +
    scale_fill_few() +
    y_scale +
    labs(x = x_label, y = "mean\nsimilarity", title = panel_title) +
    theme(axis.title = element_text(size = rel(2.5)),
          legend.title = element_text(face = "bold", size = rel(2.5)),
          legend.text = element_text(size = rel(2.5)),
          plot.title = element_text(hjust = .5, size = rel(3)),
          strip.text.x = element_text(size = rel(2.5)),
          axis.text.x = element_text(size = rel(2.5)))
}

## barplots (switchplotdata already contains LEA only)
semplot <- switchplotdata %>%
  filter(similarity == "semantic similarity") %>%
  plot_switch_bars(panel_title = "semantic similarity",
                   x_label = "",
                   y_scale = scale_y_continuous(labels = scaleFUN))

phonplot <- switchplotdata %>%
  filter(similarity != "semantic similarity") %>%
  plot_switch_bars(panel_title = "phonological similarity",
                   x_label = "\nswitch method",
                   y_scale = scale_y_continuous(breaks = seq(0, 0.12, 0.04),
                                                limits = c(0, 0.12)))

# Stack the two panels and save them as one figure.
fig4 <- gridExtra::grid.arrange(semplot, phonplot, nrow = 2)
ggsave("plots/fig4.pdf", fig4, height = 10, width = 10)

```
### model
```{r}
# lmerControl(optimizer = "optimx") takes its optimizer from the optimx
# package. Attach it here so the fit does not depend on what happens to be
# loaded in the session (some lme4 releases require it attached, not merely
# installed).
library(optimx)

# Response-level data for the LEA dataset in long format: one row per
# response x switch method x similarity type.
switchmodel_data <- fluency_data %>%
  filter(phonology != -999 & phonology != 1) %>%
  filter(dataset == "LEA") %>%
  select(dataset, subject, response, phonology, semantic,
         participant, simdrop, troyer) %>%
  pivot_longer(names_to = "switch_method",
               cols = c("simdrop", "participant", "troyer")) %>%
  filter(!is.na(value)) %>%
  rename(switch = value) %>%
  pivot_longer(names_to = "similarity", cols = phonology:semantic) %>%
  # fluency_data is grouped by subject; recode factors on the whole column
  # rather than once per subject
  ungroup() %>%
  mutate(designation = ifelse(switch == 0, "cluster", "switch"),
         `switch method` = fct_recode(switch_method,
                                      `participant designated` = "participant",
                                      `similarity drop` = "simdrop",
                                      `Troyer norms` = "troyer"),
         similarity = fct_recode(similarity, phonological = "phonology",
                                 semantic = "semantic"),
         similarity = fct_relevel(similarity, "semantic", "phonological"))

# Similarity as a function of similarity type, cluster/switch designation and
# switch method (all interactions), with designation x switch method varying
# by subject. Fit by maximum likelihood (REML = FALSE); singular-fit checks
# are set to "ignore".
switchmodel <- lme4::lmer(
  data = switchmodel_data,
  value ~ similarity * designation * switch_method +
    (designation * switch_method | subject),
  REML = FALSE,
  control = lmerControl(
    optimizer = "optimx",
    check.conv.singular = .makeCC(action = "ignore", tol = 1e-4),
    optCtrl = list(method = "nlminb")
  )
)


summary(switchmodel)
```

### figure 5

We obtained the plot from the raw data and then edited its labels and colors in PowerPoint. The final plot is uploaded in the git repository.

```{r}
# Axis label formatter: two decimal places.
scaleFUN <- function(x) sprintf("%.2f", x)

# Two hard-coded six-row excerpts of the metrics table, reshaped into five
# "previous-current" transitions per excerpt.
fig <- fluency_data[c(792:797, 209:214), ] %>%
  # fluency_data is grouped by subject, so row_number() == 1 is the first row
  # of each subject; those rows are dropped by the lag filter below
  mutate(troyer = ifelse(row_number() == 1, 0, troyer),
         participant = ifelse(row_number() == 1, 0, participant),
         simdrop = ifelse(row_number() == 1, 0, simdrop)) %>%
  select(subject, response_number, response, phonology, semantic,
         participant, troyer, simdrop) %>%
  group_by(subject) %>%
  mutate(r2 = lag(response),
         # keep the unpaired word: `response` is overwritten with the pair
         # label next, so the highlight filters below must match on `item`
         item = response,
         response = paste(r2, response, sep = "-")) %>%
  filter(!is.na(r2)) %>%
  rename(phonological = phonology,
         Troyer = troyer, `similarity-drop` = simdrop) %>%
  ungroup() %>%
  # relabel the two excerpts as subjects 1 and 2 with positions 1..5
  mutate(response_number = rep(factor(1:5), 2),
         subject = c(rep(1, 5), rep(2, 5))) %>%
  pivot_longer(names_to = "similarity", cols = phonological:semantic) %>%
  mutate(similarity = fct_relevel(similarity, "semantic", "phonological"),
         Troyer = fct_recode(factor(Troyer), cluster = "0", switch = "1"),
         `similarity-drop` = fct_recode(factor(`similarity-drop`),
                                        cluster = "0", switch = "1"),
         participant = fct_recode(factor(participant),
                                  cluster = "0", switch = "1")) %>%
  rename(`retrieval order` = response_number)

# One panel of the figure-5 skeleton for a single excerpt.
#   data          the `fig` table built above
#   subj          excerpt id (1 or 2) to draw
#   square_item   word whose transition is drawn as a goldenrod square
#   diamond_item  word whose transition is drawn as a green diamond
#   x_label       x-axis label
# All other transitions are drawn as blue circles.
fig5_panel <- function(data, subj, square_item, diamond_item, x_label) {
  panel <- data %>% filter(subject == subj)

  ggplot(panel, aes(x = `retrieval order`, y = value, group = similarity)) +
    geom_line(aes(linetype = similarity), size = 1) +
    labs(y = "similarity", x = x_label) +
    geom_vline(xintercept = 4, color = "goldenrod", linetype = "dashed") +
    geom_vline(xintercept = 5, color = "springgreen3", linetype = "dashed") +
    geom_point(data = panel %>%
                 filter(!item %in% c(square_item, diamond_item)),
               size = 10, color = "skyblue3") +
    geom_point(data = panel %>% filter(item %in% square_item),
               size = 10, color = "goldenrod", shape = 15) +
    geom_point(data = panel %>% filter(item %in% diamond_item),
               size = 10, fill = "springgreen3", shape = 23,
               color = "springgreen3") +
    theme_classic() +
    scale_color_colorblind() +
    scale_shape_manual(values = c(19, 15)) +
    scale_y_continuous(breaks = seq(0, 0.8, 0.2), limits = c(0, 0.6)) +
    theme(axis.title = element_text(size = rel(2)),
          legend.title = element_text(face = "bold", hjust = .5,
                                      size = rel(2.5)),
          legend.text = element_text(size = rel(2.5)),
          plot.title = element_text(face = "bold", hjust = .5,
                                    size = rel(2.5)),
          axis.line = element_line(size = 0.2),
          strip.text.x = element_text(size = rel(2.5)),
          axis.text.x = element_text(size = rel(1)))
}

fig1 <- fig5_panel(fig, subj = 2, square_item = "raccoon",
                   diamond_item = "pig", x_label = "")

fig2 <- fig5_panel(fig, subj = 1, square_item = "dolphin",
                   diamond_item = "lizard", x_label = "retrieval order")

fig5skeleton <- gridExtra::grid.arrange(fig1, fig2, nrow = 2)
ggsave("plots/fig5skeleton.pdf", fig5skeleton, width = 12, height = 10)
# added labels and key in powerpoint after saving this preliminary plot
```
### switch correlations
```{r}
# LEA responses with the -999 / 1 phonology rows removed.
x <- fluency_data %>%
  filter(
    phonology != -999,
    phonology != 1,
    dataset == "LEA"
  )

# Correlation of participant-designated switches with each of the other two
# switch codings.
Hmisc::rcorr(x[["participant"]], x[["troyer"]])
Hmisc::rcorr(x[["participant"]], x[["simdrop"]])
```

# computational foraging models

Here, we first load the foraging-model fits computed at the participant level; please refer to the Jupyter notebook for model specifications.

```{r}

# Model fits, tagged with each subject's dataset.
fluency_data_fits <- read_csv("data/cogsci2022-fits.csv") %>%
  left_join(distinct(raw_data, dataset, subject)) %>%
  # remove models that are not relevant to analyses:
  # participant-designated models are kept for LEA only ...
  filter(
    dataset == "LEA" |
      (dataset == "HJT" & !str_detect(model_names, "participant"))
  ) %>%
  # ... and the static participant / simdrop variants are dropped entirely
  filter(
    !(model_names %in% c(
      "psyrev static participant", "static plocal participant",
      "psyrev static simdrop", "static plocal simdrop"
    ))
  )
```
## median BIC

In this method, a BIC is calculated separately for each subject based on the number of items they produced. For each model, we then take the median across subjects of the difference between the random-model BIC and the optimal-model BIC (random − optimal).

```{r}
# Per-subject BIC of the fitted ("optimal") and random versions of each model.
bic <- fluency_data_fits %>%
  filter(
    !(model_names %in% c(
      "psyrev static participant", "static plocal participant",
      "psyrev static simdrop", "static plocal simdrop"
    ))
  ) %>%
  # drop the first two columns (by position), then de-duplicate rows
  select(-(1:2)) %>%
  distinct() %>%
  mutate(
    # k multiplies log(N) in the BIC penalty: 2 for the "psyrev static" and
    # "psyrev dynamic <method>" models, 3 for everything else
    k = ifelse(
      str_detect(
        model_names,
        "psyrev static|psyrev dynamic (simdrop|troyer|participant)"
      ),
      2,
      3
    ),
    # the fits are stored as negative log-likelihoods, hence "+ 2 * nLL"
    optimalBIC = k * log(N) + 2 * optimal_nLLs,
    randomBIC = k * log(N) + 2 * random_nLLs
  )


# Median over subjects of (random BIC - optimal BIC) per model, largest first.
median_bic <- bic %>%
  mutate(deltaBIC = randomBIC - optimalBIC) %>%
  group_by(model_names) %>%
  summarise(medianBIC = median(deltaBIC)) %>%
  arrange(desc(medianBIC))
```

## sign test

```{r}
# Spread the per-subject optimal BICs to one column per model and flag, for
# each subject, whether at least one of the three "dynamic p*" models for the
# given switch method has a lower BIC than the "psyrev dynamic" model for
# that method.
#   models  rows of `bic` restricted to the models of interest
#   method  switch method suffix used in the model names
flag_dynamic_wins <- function(models, method) {
  baseline_col <- paste("psyrev dynamic", method)
  plocal_col <- paste("dynamic plocal", method)
  pglobal_col <- paste("dynamic pglobal", method)
  pswitch_col <- paste("dynamic pswitchonly", method)

  models %>%
    select(subject, model_names, optimalBIC) %>%
    pivot_wider(names_from = model_names, values_from = optimalBIC) %>%
    mutate(
      pmodels_dynamic = as.numeric(
        .data[[plocal_col]] < .data[[baseline_col]] |
          .data[[pglobal_col]] < .data[[baseline_col]] |
          .data[[pswitch_col]] < .data[[baseline_col]]
      )
    )
}

# Troyer-based models (everything not named simdrop or participant)
m_troyer <- bic %>%
  filter(
    !str_detect(model_names, "simdrop"),
    !str_detect(model_names, "participant")
  ) %>%
  flag_dynamic_wins("troyer")

binom.test(sum(m_troyer$pmodels_dynamic), nrow(m_troyer),
           alternative = "greater")

# similarity-drop models (everything not named troyer or participant)
m_simdrop <- bic %>%
  filter(
    !str_detect(model_names, "troyer"),
    !str_detect(model_names, "participant")
  ) %>%
  flag_dynamic_wins("simdrop")

binom.test(sum(m_simdrop$pmodels_dynamic), nrow(m_simdrop),
           alternative = "greater")

# participant-designated models
m_participant <- bic %>%
  filter(str_detect(model_names, "participant")) %>%
  flag_dynamic_wins("participant")

binom.test(sum(m_participant$pmodels_dynamic), nrow(m_participant),
           alternative = "greater")
```