Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor to fix CI issues #28

Merged
merged 6 commits into from
Oct 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ License: Apache License (>= 2)
Encoding: UTF-8
LazyData: true
Suggests:
devtools,
rmarkdown,
testthat (>= 3.0.0)
VignetteBuilder: knitr
Expand All @@ -25,7 +26,6 @@ Imports:
ggrepel,
ggthemes,
knitr,
magrittr,
stats,
stringx,
tidyr,
Expand Down
2 changes: 0 additions & 2 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,10 @@ export(inspect_corpus)
export(inspect_language)
export(report_summaries)
export(theme_turnPlot)
import(cowplot)
import(dplyr)
import(ggplot2)
import(ggrepel)
import(ggthemes)
import(knitr)
import(magrittr)
import(tidyr)
import(viridis)
3 changes: 3 additions & 0 deletions R/geom_turn.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#' @param inherit.aes If FALSE, overrides the default aesthetics, rather than combining with them. This is most useful for helper functions that define both data and aesthetics and shouldn't inherit behaviour from the default plot specification, e.g. borders().
#' @param ... Other arguments passed on to layer(). These are often aesthetics, used to set an aesthetic to a fixed value, like colour = "red" or size = 3. They may also be parameters to the paired geom/stat.
#' @export
#' @rdname geom_turn
geom_turn <- function(mapping = NULL, data = NULL,
stat = "identity", position = "identity",
..., na.rm = FALSE, show.legend = NA, inherit.aes = TRUE) {
Expand All @@ -25,6 +26,8 @@ geom_turn <- function(mapping = NULL, data = NULL,
)
}

#' GeomTurn
#'
#' @rdname ggplot2-ggproto
#' @format NULL
#' @usage NULL
Expand Down
4 changes: 1 addition & 3 deletions R/helper-functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@
#' @import viridis
#' @import tidyr
#' @import ggrepel
#' @import magrittr
#' @import cowplot
#' @import knitr
inspect_corpus <- function(d, d.tokens, lang=NULL,saveplot=F,allsources=F) {

Expand Down Expand Up @@ -72,7 +70,7 @@ inspect_corpus <- function(d, d.tokens, lang=NULL,saveplot=F,allsources=F) {
direction="y",nudge_y = -0.2,size=3,
max.overlaps=Inf)

panel <- plot_grid(pA,pB,pC,labels=c("A","B","C"),rel_widths = c(1,1,2),nrow=1)
panel <- cowplot::plot_grid(pA,pB,pC,labels=c("A","B","C"),rel_widths = c(1,1,2),nrow=1)
print(panel)
cat("\n")

Expand Down
64 changes: 46 additions & 18 deletions R/inspect_language.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,20 @@
#' @param data_conv conversation dataset
#' @param data_tokens tokens dataset
#' @param lang language
#' @param returnplots boolean indicating whether plots should be made
#' @param saveplot should the plot be saved
#' @param allsources all sources
#'
#' @export
inspect_language <- function(data_conv,
data_tokens,
lang,
data_tokens = NULL,
lang = NULL,
returnplots = FALSE,
saveplot=FALSE,
allsources=FALSE) {
# conversation data
dp <- data_conv |>
dplyr::filter(language == lang)
dplyr::filter(.data$language == lang)

nturns <- sum(!is.na(dp$FTO)) # QUESTION: does this make sense?

Expand All @@ -23,34 +25,60 @@ inspect_language <- function(data_conv,
pC <- plot_turn_duration(data=dp)
pD <- plot_top_turn_types(data=dp)

# token data
dt <- data_tokens |>
dplyr::filter(language==lang)
nwords <- dt$total[1]

pE <- plot_token_rank(data=dt, nwords)

# combine the plots
top_row <- cowplot::plot_grid(pA,pB,pC,labels=c("A","B","C"),rel_widths = c(1,1,1),nrow=1)
bottom_row <- cowplot::plot_grid(pD,pE,labels=c("D","E"),rel_widths = c(1,1),nrow=1)
panel <- cowplot::plot_grid(top_row,bottom_row,ncol=1)
print(panel)
if(!is.null(data_tokens)){
# token data
dt <- data_tokens |>
dplyr::filter(.data$language==lang)
nwords <- dt$total[1]
pE <- plot_token_rank(data=dt, nwords)
}

if(saveplot) {
filename <- paste0('qc-panel-',lang,'.png')
ggplot2::ggsave(filename,bg="white",width=2400,height=1200,units="px")

# generate plots
if(returnplots){
if(!is.null(data_tokens)){
generate_plots(saveplot, pA, pB, pC, pD, pE)
} else {
generate_plots(saveplot, pA, pB, pC, pD, pE = NULL)
}
}

# sample conversation
data_convplot <- prepare_convplot(data_conv, lang)
pconv <- plot_conversation(data_convplot)
print(pconv)

if(returnplots){
pconv <- plot_conversation(data_convplot)
print(pconv)

}

# print summary stats
report_summaries(data_conv, lang, allsources)
}


#' Combine the individual QC plots into one panel, print it, optionally save it
#'
#' @param saveplot if `TRUE`, the panel is also written to a PNG file
#'   (relative path, so it ends up in the current working directory)
#' @param pA,pB,pC,pD plots that are always part of the panel
#' @param pE optional fifth plot; when `NULL` the panel is a 2 x 2 grid of
#'   A-D, otherwise A-C form the top row and D-E the bottom row
#' @param lang language label used in the file name (`qc-panel-<lang>.png`).
#'   When `NULL` the file is named `qc-panel.png`. Callers that save the
#'   panel should pass the language explicitly.
#' @noRd
generate_plots <- function(saveplot = FALSE, pA, pB, pC, pD, pE = NULL,
                           lang = NULL) {
  # lay out the two rows; the layout depends on whether a fifth plot exists
  if (is.null(pE)) {
    top_row <- cowplot::plot_grid(pA, pB,
                                  labels = c("A", "B"),
                                  rel_widths = c(1, 1), nrow = 1)
    bottom_row <- cowplot::plot_grid(pC, pD,
                                     labels = c("C", "D"),
                                     rel_widths = c(1, 1), nrow = 1)
  } else {
    top_row <- cowplot::plot_grid(pA, pB, pC,
                                  labels = c("A", "B", "C"),
                                  rel_widths = c(1, 1, 1), nrow = 1)
    bottom_row <- cowplot::plot_grid(pD, pE,
                                     labels = c("D", "E"),
                                     rel_widths = c(1, 1), nrow = 1)
  }

  panel <- cowplot::plot_grid(top_row, bottom_row, ncol = 1)
  print(panel)

  if (saveplot) {
    # `lang` used to be read as a free variable here; it is not visible from
    # this function's scope, so saving failed (or used an unrelated global).
    filename <- if (is.null(lang)) {
      "qc-panel.png"
    } else {
      paste0("qc-panel-", lang, ".png")
    }
    # pass the panel explicitly rather than relying on ggplot2::last_plot()
    ggplot2::ggsave(filename, plot = panel, bg = "white",
                    width = 2400, height = 1200, units = "px")
  }
}




57 changes: 33 additions & 24 deletions R/summaries.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,48 +32,57 @@ report_summaries <- function(data, lang, allsources){

if(allsources) {
print(knitr::kable(bysource |>
dplyr::select(-start,-finish,-talktime,-totaltime)))
dplyr::select(-"start",
-"finish",
-"talktime",
-"totaltime")))
} else {
if(nsources > 10) {
cat("\n")
cat("Showing only the first 10 sources; use `allsources=T` to show all")
}
print(knitr::kable(bysource |>
dplyr::select(-start,-finish,-talktime,-totaltime) |>
dplyr::select(-"start",
-"finish",
-"talktime",
-"totaltime") |>
dplyr::slice(1:10)))
}
}



summarize_language_data <- function(data, lang){
if(!"translation" %in% colnames(data)){
data$translation <- NA
}

data |>
dplyr::filter(language == lang) |>
dplyr::group_by(source) |>
dplyr::mutate(translation = ifelse(is.na(translation),0,1)) |>
dplyr::summarize(start=min.na(begin),finish=max.na(end),
turns=dplyr::n_distinct(uid),
translated=round(sum(translation)/turns,2),
words=sum(nwords,na.rm=T),
people=dplyr::n_distinct(participant),
talktime = sum(duration),
totaltime = finish - start,
talkprop = round(talktime / totaltime,1),
minutes = round((totaltime/1000 / 60),1),
hours = round((totaltime/1000) / 3600,2))
dplyr::filter(.data$language == lang) |>
dplyr::group_by(.data$source) |>
dplyr::mutate(translation = ifelse(is.na(.data$translation),0,1)) |>
dplyr::summarize(start=min.na(.data$begin),finish=max.na(.data$end),
turns=dplyr::n_distinct(.data$uid),
translated=round(sum(.data$translation)/.data$turns,2),
words=sum(.data$nwords,na.rm=T),
people=dplyr::n_distinct(.data$participant),
talktime = sum(.data$duration),
totaltime = .data$finish - .data$start,
talkprop = round(.data$talktime / .data$totaltime,1),
minutes = round((.data$totaltime/1000 / 60),1),
hours = round((.data$totaltime/1000) / 3600,2))
}


summarize_source_data <- function(data, lang){
data |>
summarize_language_data(lang=lang) |> #TODO this uses another function?
dplyr::summarize(turns = sum(turns),
translated=round(mean.na(translated),2),
words = sum(words),
turnduration=round(mean.na(sum(talktime)/turns)),
talkprop = round(mean.na(talkprop),2),
dplyr::summarize(turns = sum(.data$turns),
translated=round(mean.na(.data$translated),2),
words = sum(.data$words),
turnduration=round(mean.na(sum(.data$talktime)/.data$turns)),
talkprop = round(mean.na(.data$talkprop),2),
people = dplyr::n_distinct(data$participant),
hours = round(sum(hours),2),
turns_per_h = round(turns/hours)) |>
dplyr::arrange(desc(hours))
hours = round(sum(.data$hours),2),
turns_per_h = round(.data$turns/.data$hours)) |>
dplyr::arrange(desc(.data$hours))
}
10 changes: 10 additions & 0 deletions man/ggplot2-ggproto.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 5 additions & 2 deletions man/inspect_language.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

38 changes: 38 additions & 0 deletions tests/testthat/_snaps/inspect_language.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# language inspection yields stats

Code
cat(inspect_language(data, lang = "dutch"))
Output


### 5 hours

| turns| translated| words| turnduration| talkprop| people| hours| turns_per_h|
|-----:|----------:|-----:|------------:|--------:|------:|-----:|-----------:|
| 14022| 0| 69169| 1257| 0.98| 3| 5| 2804|


### nature

|nature | n|
|:------|-----:|
|laugh | 599|
|talk | 13366|
|NA | 57|

### 20 sources
Showing only the first 10 sources; use `allsources=T` to show all

|source | turns| translated| words| people| talkprop| minutes| hours|
|:---------------|-----:|----------:|-----:|------:|--------:|-------:|-----:|
|/dutch2/DVA10O | 501| 0| 3498| 2| 0.9| 15| 0.25|
|/dutch2/DVA11Q | 792| 0| 3318| 2| 1.0| 15| 0.25|
|/dutch2/DVA12S | 640| 0| 3112| 2| 0.9| 15| 0.25|
|/dutch2/DVA13U | 717| 0| 3548| 2| 1.0| 15| 0.25|
|/dutch2/DVA14W | 721| 0| 3099| 2| 0.9| 15| 0.25|
|/dutch2/DVA15Y | 770| 0| 3387| 2| 1.1| 15| 0.25|
|/dutch2/DVA16AA | 604| 0| 3889| 2| 1.1| 15| 0.25|
|/dutch2/DVA17AC | 782| 0| 3888| 2| 1.0| 15| 0.25|
|/dutch2/DVA19AG | 648| 0| 2957| 2| 0.9| 15| 0.25|
|/dutch2/DVA1A | 681| 0| 3432| 2| 1.0| 15| 0.25|

19 changes: 19 additions & 0 deletions tests/testthat/test-inspect_language.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
## Set up the test environment
# The test data comes from the `ifadv` package. It is installed from GitHub
# (elpaco-escience/ifadv) only when it is not already available; devtools is
# installed first if it is missing too.
# NOTE(review): this installs packages as a side effect of running the test
# suite and presumably needs network access -- consider gating the test with
# testthat::skip_if_not_installed("ifadv") instead; confirm against CI setup.
if (!requireNamespace("ifadv")){
if (!requireNamespace("devtools")){
install.packages("devtools")
}
devtools::install_github("elpaco-escience/ifadv")
}

# Fixture: the `ifadv` object exported by the ifadv package
data <- ifadv::ifadv

# Snapshot test: the text printed by inspect_language() for lang = "dutch" is
# compared against the stored snapshot (tests/testthat/_snaps/inspect_language.md).
# Only `data` and `lang` are supplied; all other arguments keep their defaults.
# The call is written exactly as recorded in the snapshot's "Code" section, so
# do not reformat the expression without updating the snapshot.
test_that("language inspection yields stats", {
expect_snapshot(cat(
inspect_language(
data, lang="dutch"
))
)

})