diff --git a/DESCRIPTION b/DESCRIPTION index da271204..d17ee07a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -31,3 +31,4 @@ Imports: Config/testthat/edition: 3 URL: https://github.com/CSAFE-ISU/handwriterRF BugReports: https://github.com/CSAFE-ISU/handwriterRF/issues +VignetteBuilder: knitr diff --git a/R/compare.R b/R/compare.R index 15e12685..fd07db3f 100644 --- a/R/compare.R +++ b/R/compare.R @@ -52,20 +52,20 @@ #' @examples #' \donttest{ #' # Compare two documents from the same writer with a similarity score -#' s1 <- system.file(file.path("extdata", "docs", "w0030_s01_pWOZ_r01.png"), +#' s1 <- system.file(file.path("extdata", "docs", "w0005_s01_pLND_r03.png"), #' package = "handwriterRF" #' ) -#' s2 <- system.file(file.path("extdata", "docs", "w0030_s01_pWOZ_r02.png"), +#' s2 <- system.file(file.path("extdata", "docs", "w0005_s02_pWOZ_r02.png"), #' package = "handwriterRF" #' ) #' compare_documents(s1, s2, score_only = TRUE) #' #' # Compare two documents from the same writer with a score-based #' # likelihood ratio -#' s1 <- system.file(file.path("extdata", "docs", "w0030_s01_pWOZ_r01.png"), +#' s1 <- system.file(file.path("extdata", "docs", "w0005_s01_pLND_r03.png"), #' package = "handwriterRF" #' ) -#' s2 <- system.file(file.path("extdata", "docs", "w0030_s01_pWOZ_r02.png"), +#' s2 <- system.file(file.path("extdata", "docs", "w0005_s02_pWOZ_r02.png"), #' package = "handwriterRF" #' ) #' compare_documents(s1, s2, score_only = FALSE) diff --git a/R/plots.R b/R/plots.R index 6fae58f0..5e67cece 100644 --- a/R/plots.R +++ b/R/plots.R @@ -118,8 +118,13 @@ plot_scores <- function(scores, obs_score = NULL, n_bins = 50) { ) + # add text ggplot2::labs(title = "The observed similarity score compared to reference similarity scores", x = "Score", y = "Rate") } else { - p <- p + ggplot2::labs(title = "Reference similarity scores", x = "Score", y = "Rate") + p <- p + + ggplot2::labs(title = "Reference similarity scores", x = "Score", y = "Rate") } + p <- p + + ggplot2::theme(legend.position = "bottom", + legend.text = ggplot2::element_text(size = 6), + legend.title = ggplot2::element_text(size = 8)) return(p) } diff --git a/R/slrs.R b/R/slrs.R index df248919..eafac5dc 100644 --- a/R/slrs.R +++ b/R/slrs.R @@ -56,16 +56,16 @@ #' @examples #' \donttest{ #' # Compare two samples from the same writer -#' s1 <- system.file(file.path("extdata", "docs", "w0030_s01_pWOZ_r01.png"), +#' s1 <- system.file(file.path("extdata", "docs", "w0005_s01_pLND_r03.png"), #' package = "handwriterRF" #' ) -#' s2 <- system.file(file.path("extdata", "docs", "w0030_s01_pWOZ_r02.png"), +#' s2 <- system.file(file.path("extdata", "docs", "w0005_s02_pWOZ_r02.png"), #' package = "handwriterRF" #' ) #' calculate_slr(s1, s2) #' #' # Compare samples from two writers -#' s1 <- system.file(file.path("extdata", "docs", "w0030_s01_pWOZ_r01.png"), +#' s1 <- system.file(file.path("extdata", "docs", "w0005_s02_pWOZ_r02.png"), #' package = "handwriterRF" #' ) #' s2 <- system.file(file.path("extdata", "docs", "w0238_s01_pWOZ_r02.png"), diff --git a/R/train.R b/R/train.R index 2b88afcc..f57a4199 100644 --- a/R/train.R +++ b/R/train.R @@ -20,9 +20,34 @@ #' Train a Random Forest #' -#' Train a random forest with \pkg{ranger} from a data frame of cluster fill rates. +#' Train a random forest with \pkg{ranger} from a data frame of writer profiles +#' estimated with \code{\link{get_cluster_fill_rates}}. `train_rf` calculates +#' the distance between all pairs of writer profiles using one or more distance +#' measures. Currently, the available distance measures are absolute, Manhattan, +#' Euclidean, maximum, and cosine. #' -#' @param df A data frame of cluster fill rates created with +#' The absolute distance between two n-length vectors of cluster fill rates, a +#' and b, is a vector of the same length as a and b. It can be calculated as +#' abs(a-b) where subtraction is performed element-wise, then the absolute +#' value of each element is returned. More specifically, element i of the vector is \eqn{|a_i +#' - b_i|} for \eqn{i=1,2,...,n}. +#' +#' The Manhattan distance between two n-length vectors of cluster fill rates, a and b, is +#' \eqn{\sum_{i=1}^n |a_i - b_i|}. In other words, it is the sum of the absolute +#' distance vector. +#' +#' The Euclidean distance between two n-length vectors of cluster fill rates, a and b, is +#' \eqn{\sqrt{\sum_{i=1}^n (a_i - b_i)^2}}. In other words, it is the sum of the elements of the +#' absolute distance vector. +#' +#' The maximum distance between two n-length vectors of cluster fill rates, a and b, is +#' \eqn{\max_{1 \leq i \leq n}{\{|a_i - b_i|\}}}. In other words, it is the sum of the elements of the +#' absolute distance vector. +#' +#' The cosine distance between two n-length vectors of cluster fill rates, a and b, is +#' \eqn{\sum_{i=1}^n (a_i - b_i)^2 / (\sqrt{\sum_{i=1}^n a_i^2}\sqrt{\sum_{i=1}^n b_i^2})}. +#' +#' @param df A data frame of writer profiles created with #' \code{\link{get_cluster_fill_rates}} #' @param ntrees An integer number of decision trees to use #' @param distance_measures A vector of distance measures. Any combination of diff --git a/README.md b/README.md index 37a5f497..faa1c440 100644 --- a/README.md +++ b/README.md @@ -53,8 +53,8 @@ Compare 2 of these samples. In this case, both samples are from writer 30. ``` r -sample1 <- system.file(file.path("extdata", "docs", "w0030_s01_pWOZ_r01.png"), package = "handwriterRF") -sample2 <- system.file(file.path("extdata", "docs", "w0030_s01_pWOZ_r02.png"), package = "handwriterRF") +sample1 <- system.file(file.path("extdata", "docs", "w0005_s01_pLND_r03.png"), package = "handwriterRF") +sample2 <- system.file(file.path("extdata", "docs", "w0005_s02_pWOZ_r02.png"), package = "handwriterRF") slr <- calculate_slr(sample1, sample2) ``` @@ -84,8 +84,8 @@ the data frame fits on this page. slr ``` - docname1 writer1 docname2 writer2 score slr - 1 w0030_s01_pWOZ_r01 unknown1 w0030_s01_pWOZ_r02 unknown2 0.955 135.159 + docname1 writer1 docname2 writer2 score slr + 1 w0005_s01_pLND_r03 unknown1 w0005_s02_pWOZ_r02 unknown2 0.635 1.482318 ### Interpret the Score-base Likelihood Ratio @@ -95,4 +95,4 @@ View a verbal interpretation of the score-based likelihood ratio. interpret_slr(slr) ``` - [1] "A score-based likelihood ratio of 135.2 means the likelihood of observing a similarity score of 0.955 if the documents were written by the same person is 135.2 times greater than the likelihood of observing this score if the documents were written by different writers." + [1] "A score-based likelihood ratio of 1.5 means the likelihood of observing a similarity score of 0.635 if the documents were written by the same person is 1.5 times greater than the likelihood of observing this score if the documents were written by different writers." diff --git a/README.qmd b/README.qmd index 8ac86dcd..fece50ad 100644 --- a/README.qmd +++ b/README.qmd @@ -41,8 +41,8 @@ library(handwriterRF) The package includes 4 example handwriting samples from the [CSAFE Handwriting Database](https://forensicstats.org/handwritingdatabase/). Compare 2 of these samples. In this case, both samples are from writer 30. ```{r calculate1, message=FALSE} -sample1 <- system.file(file.path("extdata", "docs", "w0030_s01_pWOZ_r01.png"), package = "handwriterRF") -sample2 <- system.file(file.path("extdata", "docs", "w0030_s01_pWOZ_r02.png"), package = "handwriterRF") +sample1 <- system.file(file.path("extdata", "docs", "w0005_s01_pLND_r03.png"), package = "handwriterRF") +sample2 <- system.file(file.path("extdata", "docs", "w0005_s02_pWOZ_r02.png"), package = "handwriterRF") slr <- calculate_slr(sample1, sample2) ``` diff --git a/data-raw/make_clusters.R b/data-raw/make_clusters.R deleted file mode 100644 index 8592af47..00000000 --- a/data-raw/make_clusters.R +++ /dev/null @@ -1,31 +0,0 @@ -# create temp folder - -graphs_dir <- file.path(tempdir(), "make_clusters", "graphs") -clusters_dir <- file.path(tempdir(), "make_clusters", "clusters") -create_dir(graphs_dir) -create_dir(clusters_dir) - -# get graphs -handwriter::process_batch_dir( - system.file("extdata", package = "handwriterRF"), - graphs_dir -) - -# get clusters -handwriter::get_clusters_batch( - template = templateK40, - input_dir = graphs_dir, - output_dir = clusters_dir, - writer_indices = c(2, 5), - doc_indices = c(7, 18) -) - -# save -w0030_s01_pWOZ_r01_clusters <- readRDS(file.path(clusters_dir, "w0030_s01_pWOZ_r01.rds")) -w0030_s01_pWOZ_r02_clusters <- readRDS(file.path(clusters_dir, "w0030_s01_pWOZ_r02.rds")) -w0238_s01_pWOZ_r02_clusters <- readRDS(file.path(clusters_dir, "w0238_s01_pWOZ_r02.rds")) -w0238_s01_pWOZ_r03_clusters <- readRDS(file.path(clusters_dir, "w0238_s01_pWOZ_r03.rds")) -usethis::use_data(w0030_s01_pWOZ_r01_clusters) -usethis::use_data(w0030_s01_pWOZ_r02_clusters) -usethis::use_data(w0238_s01_pWOZ_r02_clusters) -usethis::use_data(w0238_s01_pWOZ_r03_clusters) diff --git a/data-raw/train_valid_test.R b/data-raw/train_valid_test.R index c6cbdcb5..fcb3ea66 100644 --- a/data-raw/train_valid_test.R +++ b/data-raw/train_valid_test.R @@ -1,5 +1,8 @@ -devtools::load_all() +install.packages("handwriter") +devtools::install_github("CSAFE-ISU/handwriterRF") +library(handwriter) +library(handwriterRF) # Helper Functions -------------------------------------------------------- @@ -41,11 +44,15 @@ make_csafe_sets <- function(rates, prompts = c("pWOZ", "pLND"), num_per_prompt = } drop_columns <- function(df) { + df$doc <- paste(df$session, df$prompt, df$rep, sep = "_") df <- df %>% dplyr::ungroup() %>% - dplyr::select(-session, -prompt, -rep) + dplyr::select(-tidyselect::any_of(c("session", "prompt", "rep"))) } + # drop writer and doc column to prevent error with expand_docnames + rates <- rates %>% dplyr::select(-tidyselect::any_of(c("writer", "doc"))) + # split writers train, validation, and test sets all_writers <- find_writers_with_27_docs(df = rates) writers <- split_writers( @@ -71,7 +78,6 @@ make_cvl_sets <- function(rates, num_per_writer = 4, use_German_prompt = FALSE, find_writers_with_5plus_docs <- function(df) { # Filter cvl data frame for writers with 5 or more docs - df <- expand_cvl_docnames(df) writers <- df %>% dplyr::group_by(writer) %>% dplyr::summarize(n = dplyr::n()) %>% @@ -80,28 +86,29 @@ make_cvl_sets <- function(rates, num_per_writer = 4, use_German_prompt = FALSE, return(writers) } - sample_prompts <- function(df, writers, num_per_writer, use_German_prompt) { - df <- expand_cvl_docnames(df = df) - + sample_cvl_prompts <- function(df, set_writers, num_per_writer, use_German_prompt) { if (!use_German_prompt) { df <- df %>% dplyr::filter(prompt != "6-cropped") } df <- df %>% - dplyr::filter(writer %in% writers) %>% + dplyr::filter(writer %in% set_writers) %>% dplyr::group_by(writer) %>% dplyr::slice_sample(n = num_per_writer) return(df) } - drop_prompt_column <- function(df) { + drop_cvl_prompt_column <- function(df) { df <- df %>% dplyr::ungroup() %>% dplyr::select(-prompt) return(df) } + rates <- expand_cvl_docnames(df = rates) + rates$writer <- paste0("c", rates$writer) + all_writers <- find_writers_with_5plus_docs(df = rates) writers <- split_writers( all_writers = all_writers, @@ -109,14 +116,14 @@ make_cvl_sets <- function(rates, num_per_writer = 4, use_German_prompt = FALSE, num_validation_writers = num_validation_writers ) docs <- lapply(writers, function(w) { - sample_prompts( + sample_cvl_prompts( df = rates, - writers = w, + set_writers = w, num_per_writer = num_per_writer, use_German_prompt = use_German_prompt ) }) - docs <- lapply(docs, drop_prompt_column) + docs <- lapply(docs, drop_cvl_prompt_column) return(docs) } @@ -145,54 +152,53 @@ split_writers <- function(all_writers, num_train_writers, num_validation_writers set.seed(100) -# Create data frames of csafe and cvl cluster fill rates -# csafe <- load_cluster_fill_rates(clusters_dir = "/Users/stephanie/Documents/handwriting_datasets/CSAFE_Handwriting_Database/clusters") -# saveRDS(csafe, "data-raw/csafe_cfr.rds") -# -# cvl <- load_cluster_fill_rates("/Users/stephanie/Documents/handwriting_datasets/CVL/clusters") -# saveRDS(cvl, "data-raw/cvl_cfr.rds") +# If you need cluster assignments CVL data +handwriter::get_clusters_batch(input_dir = "path/to/cvl/graphs/dir", + output_dir = "path/to/cvl/clusters/dir", + template = "path/to/template.rds", + writer_indices = c(1,4), + doc_indices = c(6,6), + num_cores = 4) -# Load cluster fill rates -csafe <- readRDS("data-raw/csafe_cfr.rds") -cvl <- readRDS("data-raw/cvl_cfr.rds") +# Create data frames of csafe and cvl cluster fill rates +csafe <- load_cluster_fill_rates(clusters_dir = "/Users/stephanie/Documents/handwriting_datasets/CSAFE_Handwriting_Database/300dpi/clusters") +cvl <- load_cluster_fill_rates("/Users/stephanie/Documents/handwriting_datasets/CVL/300dpi/clusters") -# Make sets +# Make sets. Feel free to change num_train_writers and num_validation_writers. +# Writers not assigned to either of these sets will be placed in the test set. csafe <- make_csafe_sets( - rates = csafe, prompts = c("pWOZ", "pLND"), num_per_prompt = 2, - num_train_writers = 100, num_validation_writers = 150 + rates = csafe, + prompts = c("pWOZ", "pLND"), + num_per_prompt = 2, + num_train_writers = 100, + num_validation_writers = 150 ) -saveRDS(csafe, "data-raw/csafe_sets.rds") - cvl <- make_cvl_sets( - rates = cvl, num_per_writer = 4, use_German_prompt = FALSE, - num_train_writers = 100, num_validation_writers = 150 + rates = cvl, + num_per_writer = 4, + use_German_prompt = FALSE, + num_train_writers = 100, + num_validation_writers = 150 ) -saveRDS(cvl, "data-raw/cvl_sets.rds") - -# csafe <- readRDS("data-raw/csafe_sets.rds") -# cvl <- readRDS("data-raw/cvl_sets.rds") # Train random forest train <- rbind(csafe$train, cvl$train) -saveRDS(train, "data-raw/train.rds") -usethis::use_data(train, overwrite = TRUE) -random_forest <- train_rf(train, ntrees = 200, distance_measures = c("abs", "euc")) -saveRDS(random_forest, "data-raw/random_forest.rds") -usethis::use_data(random_forest, overwrite = TRUE) +# Choose distance measures +rf <- train_rf(train, + ntrees = 200, + distance_measures = c("abs", "euc")) -# Get similarity scores on validation set -validation <- rbind(csafe$validation, cvl$validation) -saveRDS(validation, "data-raw/validation.rds") -usethis::use_data(validation, overwrite = TRUE) -ref_scores <- get_ref_scores(rforest = random_forest, df = validation) -saveRDS(ref_scores, "data-raw/ref_scores.rds") -usethis::use_data(ref_scores, overwrite = TRUE) +# Get similarity scores on validation set. Note: there will be many times more +# 'different writer' scores compared to 'same writer' scores. +validation <- rbind(csafe$validation, cvl$validation) +rscores <- get_ref_scores(rforest = rf, df = validation) # Test set test <- rbind(csafe$test, cvl$test) -saveRDS(test, "data-raw/test.rds") -usethis::use_data(test, overwrite = TRUE) -plot_scores(scores = ref_scores) +results <- compare_writer_profiles(writer_profiles = test, + score_only = FALSE, + rforest = rf, + reference_scores = rscores) diff --git a/inst/extdata/clusters/w0030_s01_pWOZ_r01.rds b/inst/extdata/clusters/w0030_s01_pWOZ_r01.rds deleted file mode 100644 index 38ddd74b..00000000 Binary files a/inst/extdata/clusters/w0030_s01_pWOZ_r01.rds and /dev/null differ diff --git a/inst/extdata/clusters/w0030_s01_pWOZ_r02.rds b/inst/extdata/clusters/w0030_s01_pWOZ_r02.rds deleted file mode 100644 index 3849ea13..00000000 Binary files a/inst/extdata/clusters/w0030_s01_pWOZ_r02.rds and /dev/null differ diff --git a/inst/extdata/docs/w0005_s01_pLND_r03.png b/inst/extdata/docs/w0005_s01_pLND_r03.png new file mode 100644 index 00000000..635efa5b Binary files /dev/null and b/inst/extdata/docs/w0005_s01_pLND_r03.png differ diff --git a/inst/extdata/docs/w0005_s02_pWOZ_r02.png b/inst/extdata/docs/w0005_s02_pWOZ_r02.png new file mode 100644 index 00000000..18463707 Binary files /dev/null and b/inst/extdata/docs/w0005_s02_pWOZ_r02.png differ diff --git a/inst/extdata/docs/w0030_s01_pWOZ_r01.png b/inst/extdata/docs/w0030_s01_pWOZ_r01.png deleted file mode 100644 index a1a07ce4..00000000 Binary files a/inst/extdata/docs/w0030_s01_pWOZ_r01.png and /dev/null differ diff --git a/inst/extdata/docs/w0030_s01_pWOZ_r02.png b/inst/extdata/docs/w0030_s01_pWOZ_r02.png deleted file mode 100644 index 862bdccb..00000000 Binary files a/inst/extdata/docs/w0030_s01_pWOZ_r02.png and /dev/null differ diff --git a/man/calculate_slr.Rd b/man/calculate_slr.Rd index 3cd13061..a333b3d9 100644 --- a/man/calculate_slr.Rd +++ b/man/calculate_slr.Rd @@ -54,16 +54,16 @@ Johnson and Danica Ommen (2021) \url{doi:10.1002/sam.11566}. \examples{ \donttest{ # Compare two samples from the same writer -s1 <- system.file(file.path("extdata", "docs", "w0030_s01_pWOZ_r01.png"), +s1 <- system.file(file.path("extdata", "docs", "w0005_s01_pLND_r03.png"), package = "handwriterRF" ) -s2 <- system.file(file.path("extdata", "docs", "w0030_s01_pWOZ_r02.png"), +s2 <- system.file(file.path("extdata", "docs", "w0005_s02_pWOZ_r02.png"), package = "handwriterRF" ) calculate_slr(s1, s2) # Compare samples from two writers -s1 <- system.file(file.path("extdata", "docs", "w0030_s01_pWOZ_r01.png"), +s1 <- system.file(file.path("extdata", "docs", "w0005_s02_pWOZ_r02.png"), package = "handwriterRF" ) s2 <- system.file(file.path("extdata", "docs", "w0238_s01_pWOZ_r02.png"), diff --git a/man/compare_documents.Rd b/man/compare_documents.Rd index 21cfb18e..adb5f6e0 100644 --- a/man/compare_documents.Rd +++ b/man/compare_documents.Rd @@ -52,20 +52,20 @@ as a comparison method. \examples{ \donttest{ # Compare two documents from the same writer with a similarity score -s1 <- system.file(file.path("extdata", "docs", "w0030_s01_pWOZ_r01.png"), +s1 <- system.file(file.path("extdata", "docs", "w0005_s01_pLND_r03.png"), package = "handwriterRF" ) -s2 <- system.file(file.path("extdata", "docs", "w0030_s01_pWOZ_r02.png"), +s2 <- system.file(file.path("extdata", "docs", "w0005_s02_pWOZ_r02.png"), package = "handwriterRF" ) compare_documents(s1, s2, score_only = TRUE) # Compare two documents from the same writer with a score-based # likelihood ratio -s1 <- system.file(file.path("extdata", "docs", "w0030_s01_pWOZ_r01.png"), +s1 <- system.file(file.path("extdata", "docs", "w0005_s01_pLND_r03.png"), package = "handwriterRF" ) -s2 <- system.file(file.path("extdata", "docs", "w0030_s01_pWOZ_r02.png"), +s2 <- system.file(file.path("extdata", "docs", "w0005_s02_pWOZ_r02.png"), package = "handwriterRF" ) compare_documents(s1, s2, score_only = FALSE) diff --git a/man/train_rf.Rd b/man/train_rf.Rd index 235e9701..07d4b37d 100644 --- a/man/train_rf.Rd +++ b/man/train_rf.Rd @@ -14,7 +14,7 @@ train_rf( ) } \arguments{ -\item{df}{A data frame of cluster fill rates created with +\item{df}{A data frame of writer profiles created with \code{\link{get_cluster_fill_rates}}} \item{ntrees}{An integer number of decision trees to use} @@ -37,7 +37,33 @@ different writer and same writer pairs.} A random forest } \description{ -Train a random forest with \pkg{ranger} from a data frame of cluster fill rates. +Train a random forest with \pkg{ranger} from a data frame of writer profiles +estimated with \code{\link{get_cluster_fill_rates}}. `train_rf` calculates +the distance between all pairs of writer profiles using one or more distance +measures. Currently, the available distance measures are absolute, Manhattan, +Euclidean, maximum, and cosine. +} +\details{ +The absolute distance between two n-length vectors of cluster fill rates, a +and b, is a vector of the same length as a and b. It can be calculated as +abs(a-b) where subtraction is performed element-wise, then the absolute +value of each element is returned. More specifically, element i of the vector is \eqn{|a_i +- b_i|} for \eqn{i=1,2,...,n}. + +The Manhattan distance between two n-length vectors of cluster fill rates, a and b, is +\eqn{\sum_{i=1}^n |a_i - b_i|}. In other words, it is the sum of the absolute +distance vector. + +The Euclidean distance between two n-length vectors of cluster fill rates, a and b, is +\eqn{\sqrt{\sum_{i=1}^n (a_i - b_i)^2}}. In other words, it is the sum of the elements of the +absolute distance vector. + +The maximum distance between two n-length vectors of cluster fill rates, a and b, is +\eqn{\max_{1 \leq i \leq n}{\{|a_i - b_i|\}}}. In other words, it is the sum of the elements of the +absolute distance vector. + +The cosine distance between two n-length vectors of cluster fill rates, a and b, is +\eqn{\sum_{i=1}^n (a_i - b_i)^2 / (\sqrt{\sum_{i=1}^n a_i^2}\sqrt{\sum_{i=1}^n b_i^2})}. } \examples{ rforest <- train_rf( diff --git a/vignettes/.gitignore b/vignettes/.gitignore new file mode 100644 index 00000000..097b2416 --- /dev/null +++ b/vignettes/.gitignore @@ -0,0 +1,2 @@ +*.html +*.R diff --git a/vignettes/make-new-random-forests-and-refence-scores.Rmd b/vignettes/make-new-random-forests-and-refence-scores.Rmd new file mode 100644 index 00000000..630355f9 --- /dev/null +++ b/vignettes/make-new-random-forests-and-refence-scores.Rmd @@ -0,0 +1,171 @@ +--- +title: "Make New Random Forests and Reference Scores" +output: rmarkdown::html_vignette +vignette: > + %\VignetteIndexEntry{train-model} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} +--- + +```{r, include = FALSE} + knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>" + ) +``` + +HandwriterRF has a pre-trained random forest and set of reference similarity scores that you may use with the functions `compare_documents()` and `compare_writer_profiles()`. This tutorial shows you how to train your own random forest and create your own set of reference scores. + +## Training Data + +You need scanned handwriting samples saved as PNG images for training the random forest and making reference scores. The training set must include at least two samples from each writer so that the random forest can see examples of two documents written by the *same writer* and examples of two documents written by *different writers*. + +The [CSAFE Handwriting Database](https://data.csafe.iastate.edu/HandwritingDatabase/) contains suitable handwriting samples that you can download for free if you don't have your own samples. + +## Train a Random Forest + +### Estimate Writer Profiles + +Place handwriting samples that you will use to train a random forest in a folder. The first step is to estimate a writer profile from each handwriting sample. We do this with `get_writer_profiles()`. Behind the scenes, `get_writer_profiles()` performs the following steps for each sample: + +1. Splits the handwriting into component shapes, called *graphs*, with `handwriter::processDocument()`. +2. The graphs are sorted into clusters of similar shapes using a cluster template created with `handwriter::make_clustering_template()`. By default, `get_writer_profiles()` uses the cluster template `templateK40` included with handwriterRF. You may create your own cluster template if you prefer. +3. The number of graphs assigned to each cluster in the template are counted with `handwriter::get_cluster_fill_counts()`. +4. The proportion of graphs assigned to each cluster is calculated with `get_cluster_fill_rates()`. The cluster fill rates serve as an estimate of a writer profile for the writer of the document. + +Load handwriterRF. + +```{r setup} +library(handwriterRF) +``` + +Calculate writer profiles for the training samples with `templateK40`. The output is a data frame. + +```{r profiles, eval=FALSE} +profiles <- get_writer_profiles( + input_dir = "path/to/training/samples/folder", + num_cores = 1, + template = templateK40, + output_dir = "path/to/output/folder" +) +``` + +### Train a Random Forest +Now that we have writer profiles, we can train a random forest. `train_rf()` performs the following steps: + +1. Calculates the distance between each pair of writer profiles. The user chooses which distance measure(s) to use. The available distance measures are absolute, Manhattan, Euclidean, maximum, and cosine. Type `?train_rf` for more information about these measures. +2. Groups the distances into two classes - *same writer* and *different writers* - depending upon whether the two samples were from the same writer or two different writers. +3. Uses the 'ranger' package to train a random forest on the distances. + +When running `train_rf()` you have a several choices to make: + +- Choose the number of decision trees to use. In our experiments with samples from the CSAFE Handwriting Database and the CVL Handwriting Database, we found that `ntrees = 200` produced good results. +- If you want the random forest to be saved in an RDS file, specify an output directory. If you don't use the `output_dir` argument, the random forest will be returned but not saved to your computer. +- There will be more *different writer* distances compared to *same writer*. If you want to train the random forest on *balanced classes*, where there are the same number of distances for both classes, set `downsample = TRUE`. This randomly samples the *different writer* distances to equal the number of *same writer* distances. + +```{r single-rf, eval=FALSE} +rf <- train_rf( + df = profiles, + ntrees = 200, + distance_measures = c("abs", "man", "euc", "max", "cos"), + output_dir = "path/to/output/folder", + downsample = TRUE +) +``` + +If you would like to train a series of random forests with `lapply` or a for loop, use the run number and output directory arguments. The run number is added to the file name when the random forest is saved, so that subsequent random forests are not saved over the previous ones. + +```{r multiple-rfs, eval=FALSE} +for (i in 1:10) { + rf <- train_rf( + df = profiles, + ntrees = 200, + distance_measures = c("abs", "man"), + output_dir = "path/to/output/folder", + run_number = i, + downsample = TRUE + ) +} + +``` + +## Create a Reference Set of Similarity Scores + +The functions `compare_documents()` and `compare_writer_profiles()` either return a similarity score or a score-based likelihood. Both express how similar or not two handwriting samples are to each other. + +The score-based likelihood ratio (SLR) builds upon the observed similarity score by comparing it to reference *same writer* and *different writer* similarity scores. The SLR is the ratio of the likelihood of observing the similarity score if the samples where written by the same writer to the likelihood of observing the similarity score if the samples where written by the different writers. + +If `compare_documents()` and `compare_writer_profiles()` only return the similarity score, reference scores are not used. But if these functions calculate an SLR they need reference scores. HandwriterRF includes a set of reference score as `ref_scores` for use with these functions, but you can also create your own set of reference scores. + +Refer to the sections above to obtain suitable training samples and estimate writer profiles. + +```{r ref-profiles, eval=FALSE} +ref_profiles <- get_writer_profiles( + input_dir = "path/to/ref/samples/folder", + num_cores = 1, + template = templateK40, + output_dir = "path/to/output/folder" +) + +rscores <- get_ref_scores(rforest = rf, + df = ref_profiles) +``` + +We can plot the built-in reference scores. The plot is similar to a histogram. All similarity scores are greater than or equal to zero and less than or equal to one. `plot_scores` divides the interval from 0 to 1 into `n_bins`, and calculates the proportion of scores in bin. A histogram would show the number of scores in each bin, but because there are many times more *different writers* scores compared to *same writer* the histogram for *different writers* scores dwarfs the *same writer* histogram. Plot the rate instead of the frequency, fixes this problem. + +```{r plot, out.width="75%", dpi=300} +plot_scores(scores = ref_scores) +``` + +If we want to see how an observed score compares to the *same writer* and *different writers* scores, we use `obs_score`. + +```{r plot-obs, out.width="75%", dpi=300} +plot_scores(scores = ref_scores, + obs_score = 0.2) +``` + +Plot your own reference scores. + +```{r plot-own, eval=FALSE} +plot_scores(scores = rscores, + obs_score = 0.2) +``` + +## Compare Documents with New Random Forest and Reference Scores + +In this section, we will use the new random forest and set of reference scores to compare two handwritten documents. As before, the handwriting samples need to be scanned and saved as PNG files. Do not use samples or writers that were used to create the random forest or the reference scores. + +Before using the new random forest and reference scores, use +the random forest and reference scores included with handwriterRF. Compare two handwriting samples from the same writer and included with handwriterRF and return a similarity score and score-based likelihood ratio. +```{r compare, message=FALSE} +sample1 <- system.file("extdata", "docs", "w0238_s01_pWOZ_r02.png", package = "handwriterRF") +sample2 <- system.file("extdata", "docs", "w0238_s01_pWOZ_r03.png", package = "handwriterRF") + +df <- compare_documents( + sample1, + sample2, + score_only = FALSE, + project_dir = "~/Desktop/test" +) +df +``` + +The SLR is greater than one, which means the similarity score is more like the reference *same writer* scores than the *different writers* scores. We can also plot the observed score with the reference scores. + +```{r} +plot_scores(scores = ref_scores, obs_score = df$score) +``` +Next, compare the same documents with the new random forest and reference scores and plot the obeserved score. + +```{r new-compare, eval=FALSE} +df_new <- compare_documents( + sample1, + sample2, + score_only = FALSE, + rforest = rf, + reference_scores = rscores +) +df_new + +plot_scores(scores = rscores, obs_score = df_new$score) +```