elpaco-escience · bvreede · Oct 6, 2023 · Oct 6, 2023 · Oct 6, 2023 · Oct 6, 2023
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -14,6 +14,7 @@ License: Apache License (>= 2)
 Encoding: UTF-8
 LazyData: true
 Suggests: 
+    devtools,
     rmarkdown,
     testthat (>= 3.0.0)
 VignetteBuilder: knitr
@@ -25,7 +26,6 @@ Imports:
     ggrepel,
     ggthemes,
     knitr,
-    magrittr,
     stats,
     stringx,
     tidyr,

diff --git a/NAMESPACE b/NAMESPACE
@@ -8,12 +8,10 @@ export(inspect_corpus)
 export(inspect_language)
 export(report_summaries)
 export(theme_turnPlot)
-import(cowplot)
 import(dplyr)
 import(ggplot2)
 import(ggrepel)
 import(ggthemes)
 import(knitr)
-import(magrittr)
 import(tidyr)
 import(viridis)
diff --git a/R/geom_turn.R b/R/geom_turn.R
@@ -9,6 +9,7 @@
 #' @param inherit.aes If FALSE, overrides the default aesthetics, rather than combining with them. This is most useful for helper functions that define both data and aesthetics and shouldn't inherit behaviour from the default plot specification, e.g. borders().
 #' @param ... Other arguments passed on to layer(). These are often aesthetics, used to set an aesthetic to a fixed value, like colour = "red" or size = 3. They may also be parameters to the paired geom/stat.
 #' @export
+#' @rdname geom_turn
 geom_turn <- function(mapping = NULL, data = NULL,
                       stat = "identity", position = "identity",
                       ..., na.rm = FALSE, show.legend = NA, inherit.aes = TRUE) {
@@ -25,6 +26,8 @@ geom_turn <- function(mapping = NULL, data = NULL,
   )
 }
 
+#' GeomTurn
+#'
 #' @rdname ggplot2-ggproto
 #' @format NULL
 #' @usage NULL

diff --git a/R/helper-functions.R b/R/helper-functions.R
@@ -17,8 +17,6 @@
 #' @import viridis
 #' @import tidyr
 #' @import ggrepel
-#' @import magrittr
-#' @import cowplot
 #' @import knitr
 inspect_corpus <- function(d, d.tokens, lang=NULL,saveplot=F,allsources=F) {
 
@@ -72,7 +70,7 @@ inspect_corpus <- function(d, d.tokens, lang=NULL,saveplot=F,allsources=F) {
                     direction="y",nudge_y = -0.2,size=3,
                     max.overlaps=Inf)
 
-  panel <- plot_grid(pA,pB,pC,labels=c("A","B","C"),rel_widths = c(1,1,2),nrow=1)
+  panel <- cowplot::plot_grid(pA,pB,pC,labels=c("A","B","C"),rel_widths = c(1,1,2),nrow=1)
   print(panel)
   cat("\n")
 

diff --git a/R/inspect_language.R b/R/inspect_language.R
@@ -3,18 +3,20 @@
 #' @param data_conv conversation dataset
 #' @param data_tokens tokens dataset
 #' @param lang language
+#' @param returnplots logical; if TRUE, the overview panel and the sample conversation plot are generated and printed
 #' @param saveplot should the plot be saved
 #' @param allsources all sources
 #'
 #' @export
 inspect_language <- function(data_conv,
-                             data_tokens,
-                             lang,
+                             data_tokens = NULL,
+                             lang = NULL,
+                             returnplots = FALSE,
                              saveplot=FALSE,
                              allsources=FALSE) {
   # conversation data
   dp <- data_conv |>
-    dplyr::filter(language == lang)
+    dplyr::filter(.data$language == lang)
 
   nturns <- sum(!is.na(dp$FTO)) # QUESTION: does this make sense?
 
@@ -23,34 +25,64 @@ inspect_language <- function(data_conv,
   pC <- plot_turn_duration(data=dp)
   pD <- plot_top_turn_types(data=dp)
 
-  # token data
-  dt <- data_tokens |>
-    dplyr::filter(language==lang)
-  nwords <- dt$total[1]
 
-  pE <- plot_token_rank(data=dt, nwords)
 
-  # combine the plots
-  top_row <- cowplot::plot_grid(pA,pB,pC,labels=c("A","B","C"),rel_widths = c(1,1,1),nrow=1)
-  bottom_row <- cowplot::plot_grid(pD,pE,labels=c("D","E"),rel_widths = c(1,1),nrow=1)
-  panel <- cowplot::plot_grid(top_row,bottom_row,ncol=1)
-  print(panel)
+  # token data is optional; pE stays NULL when no token dataset is supplied
+  pE <- NULL
+  if(!is.null(data_tokens)){
+    dt <- data_tokens |>
+      dplyr::filter(.data$language==lang)
+    nwords <- dt$total[1]
+    pE <- plot_token_rank(data=dt, nwords)
+  }
 
-  if(saveplot) {
-    filename <- paste0('qc-panel-',lang,'.png')
-    ggplot2::ggsave(filename,bg="white",width=2400,height=1200,units="px")
+
+  # generate plots
+  if(returnplots){
+    generate_plots(saveplot, pA, pB, pC, pD, pE, lang = lang)
   }
 
   # sample conversation
   data_convplot <- prepare_convplot(data_conv, lang)
-  pconv <- plot_conversation(data_convplot)
-  print(pconv)
+
+  if(returnplots){
+    pconv <- plot_conversation(data_convplot)
+    print(pconv)
+
+  }
 
   # print summary stats
   report_summaries(data_conv, lang, allsources)
 }
 
 
+# Combine the individual plots into one panel, print it and optionally save it.
+#
+# saveplot: if TRUE, the panel is saved as 'qc-panel-<lang>.png' via ggsave().
+# pA, pB, pC, pD: plots that are always part of the panel.
+# pE: optional fifth plot. If NULL the panel is A, B over C, D; otherwise it
+#     is A, B, C over D, E.
+# lang: language label used in the file name of the saved panel.
+generate_plots <- function(saveplot = FALSE, pA, pB, pC, pD, pE = NULL, lang = NULL){
+  if(is.null(pE)){
+    top_row <- cowplot::plot_grid(pA,pB,labels=c("A","B"),rel_widths = c(1,1),nrow=1)
+    bottom_row <- cowplot::plot_grid(pC,pD,labels=c("C", "D"),rel_widths = c(1,1),nrow=1)
+  } else{
+    top_row <- cowplot::plot_grid(pA,pB,pC,labels=c("A","B","C"),rel_widths = c(1,1,1),nrow=1)
+    bottom_row <- cowplot::plot_grid(pD,pE,labels=c("D","E"),rel_widths = c(1,1),nrow=1)
+  }
+
+  panel <- cowplot::plot_grid(top_row,bottom_row,ncol=1)
+  print(panel)
+
+  if(saveplot) {
+    # `lang` has to be an argument: the caller's variable is not visible here
+    filename <- paste0('qc-panel-',lang,'.png')
+    # save the panel explicitly instead of relying on ggplot2::last_plot()
+    ggplot2::ggsave(filename,plot=panel,bg="white",width=2400,height=1200,units="px")
+  }
+}
+
 
 
 
diff --git a/R/summaries.R b/R/summaries.R
@@ -32,48 +32,66 @@ report_summaries <- function(data, lang, allsources){
 
   if(allsources) {
     print(knitr::kable(bysource |>
-                         dplyr::select(-start,-finish,-talktime,-totaltime)))
+                         dplyr::select(-"start",
+                                       -"finish",
+                                       -"talktime",
+                                       -"totaltime")))
   } else {
     if(nsources > 10) {
       cat("\n")
       cat("Showing only the first 10 sources; use `allsources=T` to show all")
     }
     print(knitr::kable(bysource |>
-                         dplyr::select(-start,-finish,-talktime,-totaltime) |>
+                         dplyr::select(-"start",
+                                       -"finish",
+                                       -"talktime",
+                                       -"totaltime") |>
                          dplyr::slice(1:10)))
   }
 }
 
 
-
+# Per-source summary of the rows of `data` whose language equals `lang`.
+# If `data` has no `translation` column it is added as all-NA, so every turn
+# counts as untranslated. Returns one row per `source` with start, finish,
+# turns, translated, words, people, talktime, totaltime, talkprop, minutes, hours.
+# NOTE(review): the /1000 suggests begin/end/duration are in ms -- confirm.
 summarize_language_data <- function(data, lang){
+  if(!"translation" %in% colnames(data)){
+    data$translation <- NA
+  }
+
   data |>
-    dplyr::filter(language == lang) |>
-    dplyr::group_by(source) |>
-    dplyr::mutate(translation = ifelse(is.na(translation),0,1)) |>
-    dplyr::summarize(start=min.na(begin),finish=max.na(end),
-                     turns=dplyr::n_distinct(uid),
-                     translated=round(sum(translation)/turns,2),
-                     words=sum(nwords,na.rm=T),
-                     people=dplyr::n_distinct(participant),
-                     talktime = sum(duration),
-                     totaltime = finish - start,
-                     talkprop = round(talktime / totaltime,1),
-                     minutes = round((totaltime/1000 / 60),1),
-                     hours = round((totaltime/1000) / 3600,2))
+    dplyr::filter(.data$language == lang) |>
+    dplyr::group_by(.data$source) |>
+    dplyr::mutate(translation = ifelse(is.na(.data$translation),0,1)) |>
+    dplyr::summarize(start=min.na(.data$begin),finish=max.na(.data$end),
+                     turns=dplyr::n_distinct(.data$uid),
+                     translated=round(sum(.data$translation)/.data$turns,2),
+                     words=sum(.data$nwords,na.rm=TRUE),
+                     people=dplyr::n_distinct(.data$participant),
+                     talktime = sum(.data$duration),
+                     totaltime = .data$finish - .data$start,
+                     talkprop = round(.data$talktime / .data$totaltime,1),
+                     minutes = round((.data$totaltime/1000 / 60),1),
+                     hours = round((.data$totaltime/1000) / 3600,2))
 }
 
 
+# Language-level totals for `lang`, aggregated over the per-source rows
+# returned by summarize_language_data().
+# `people` is counted on the raw `data`, restricted to the rows of `lang`, so
+# participants of other languages in the same dataset are not included.
 summarize_source_data <- function(data, lang){
   data |>
     summarize_language_data(lang=lang) |> #TODO this uses another function?
-    dplyr::summarize(turns = sum(turns),
-                     translated=round(mean.na(translated),2),
-                     words = sum(words),
-                     turnduration=round(mean.na(sum(talktime)/turns)),
-                     talkprop = round(mean.na(talkprop),2),
-                     people = dplyr::n_distinct(data$participant),
-                     hours = round(sum(hours),2),
-                     turns_per_h = round(turns/hours)) |>
-    dplyr::arrange(desc(hours))
+    dplyr::summarize(turns = sum(.data$turns),
+                     translated=round(mean.na(.data$translated),2),
+                     words = sum(.data$words),
+                     turnduration=round(mean.na(sum(.data$talktime)/.data$turns)),
+                     talkprop = round(mean.na(.data$talkprop),2),
+                     people = dplyr::n_distinct(data$participant[data$language %in% lang]),
+                     hours = round(sum(.data$hours),2),
+                     turns_per_h = round(.data$turns/.data$hours)) |>
+    dplyr::arrange(dplyr::desc(.data$hours))
 }
diff --git a/man/ggplot2-ggproto.Rd b/man/ggplot2-ggproto.Rd
diff --git a/man/inspect_language.Rd b/man/inspect_language.Rd
diff --git a/tests/testthat/_snaps/inspect_language.md b/tests/testthat/_snaps/inspect_language.md
@@ -0,0 +1,38 @@
+# language inspection yields stats
+
+    Code
+      cat(inspect_language(data, lang = "dutch"))
+    Output
+
+
+      ###  5 hours
+
+      | turns| translated| words| turnduration| talkprop| people| hours| turns_per_h|
+      |-----:|----------:|-----:|------------:|--------:|------:|-----:|-----------:|
+      | 14022|          0| 69169|         1257|     0.98|      3|     5|        2804|
+
+
+      ### nature
+
+      |nature |     n|
+      |:------|-----:|
+      |laugh  |   599|
+      |talk   | 13366|
+      |NA     |    57|
+
+      ###  20 sources
+      Showing only the first 10 sources; use `allsources=T` to show all
+
+      |source          | turns| translated| words| people| talkprop| minutes| hours|
+      |:---------------|-----:|----------:|-----:|------:|--------:|-------:|-----:|
+      |/dutch2/DVA10O  |   501|          0|  3498|      2|      0.9|      15|  0.25|
+      |/dutch2/DVA11Q  |   792|          0|  3318|      2|      1.0|      15|  0.25|
+      |/dutch2/DVA12S  |   640|          0|  3112|      2|      0.9|      15|  0.25|
+      |/dutch2/DVA13U  |   717|          0|  3548|      2|      1.0|      15|  0.25|
+      |/dutch2/DVA14W  |   721|          0|  3099|      2|      0.9|      15|  0.25|
+      |/dutch2/DVA15Y  |   770|          0|  3387|      2|      1.1|      15|  0.25|
+      |/dutch2/DVA16AA |   604|          0|  3889|      2|      1.1|      15|  0.25|
+      |/dutch2/DVA17AC |   782|          0|  3888|      2|      1.0|      15|  0.25|
+      |/dutch2/DVA19AG |   648|          0|  2957|      2|      0.9|      15|  0.25|
+      |/dutch2/DVA1A   |   681|          0|  3432|      2|      1.0|      15|  0.25|
+
diff --git a/tests/testthat/test-inspect_language.R b/tests/testthat/test-inspect_language.R
@@ -0,0 +1,15 @@
+## set up the test environment
+# The snapshot relies on the `ifadv` dataset package. A test run should not
+# install packages (that needs network access and modifies the user's
+# library), so the test is skipped when `ifadv` is not available.
+test_that("language inspection yields stats", {
+  skip_if_not_installed("ifadv")
+  data <- ifadv::ifadv
+
+  expect_snapshot(cat(
+    inspect_language(
+      data, lang="dutch"
+      ))
+  )
+
+})