CSAFE-ISU · stephaniereinders · Nov 6, 2024 · Nov 5, 2024 · Nov 5, 2024 · Nov 5, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -11,6 +11,7 @@ Encoding: UTF-8
 LazyData: true
 RoxygenNote: 7.3.2
 Suggests: 
+    ggplot2,
     knitr,
     rmarkdown,
     testthat (>= 3.0.0),

diff --git a/NAMESPACE b/NAMESPACE
@@ -6,5 +6,6 @@ export(get_cluster_fill_rates)
 export(get_csafe_train_set)
 export(get_distances)
 export(interpret_slr)
+export(plot_histograms)
 export(train_rf)
 importFrom(magrittr,"%>%")
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,9 @@
 # handwriterRF (development version)
 
+## Minor improvements and bug fixes
+
+* Created `plot_histograms()` to plot histograms of the reference same writer and different writer similarity scores in `random_forest$scores`.
+
 # handwriterRF 1.0.2
 
 * Removed quotes around "same writer" and "different writer" in documentation.

diff --git a/R/data.R b/R/data.R
@@ -185,11 +185,11 @@
 "templateK40"
 
 
-#' A \pkg{ranger} Random Forest, Distances, and Densities
+#' A \pkg{ranger} Random Forest, Distances, and Similarity Scores
 #'
-#' A list that contains a trained random forest created with \pkg{ranger}, the data
-#' frame of distances used to train the random forest, and two densities
-#' obtained from the random forest.
+#' A list that contains a trained random forest created with \pkg{ranger}, the
+#' data frame of distances used to train the random forest, and similarity
+#' scores calculated from the training data.
 #'
 #' @format A list with the following components:
 #' \describe{
@@ -201,12 +201,9 @@
 #' 300 'different' distances in the data frame.}
 #' \item{rf}{A random forest created with \pkg{ranger} with settings:
 #' importance = 'permutation', scale.permutation.importance = TRUE, and num.trees = 200.}
-#' \item{densities}{A similarity score was obtained for each pair of handwriting samples in the
+#' \item{scores}{A similarity score was obtained for each pair of handwriting samples in the
 #' training data frame, dists, by calculating the proportion of decision trees that voted 'same'
-#' class for the pair. The 'same_writer' density was created by applying \code{\link[stats]{density}}
-#' to the similarity scores for the 300 same writer pairs in dists. Similarly, the 'diff_writer'
-#' density was created by applying the \code{\link[stats]{density}} function to the similarity scores for the 300
-#' different writer pairs in dists. The default settings were used with \code{\link[stats]{density}}.}
+#' class for the pair.}
 #' }
 #'
 #' @examples
@@ -216,11 +213,9 @@
 #' # view the distances data frame
 #' random_forest$dists
 #'
-#' # plot the same writer density
-#' plot(random_forest$densities$same_writer)
-#'
-#' # plot the different writer density
-#' plot(random_forest$densities$diff_writer)
+#' # plot histograms of the similarity scores and place a vertical
+#' # line at similarity score 0.9.
+#' plot_histograms(random_forest, 0.9)
 #'
 #' @md
 "random_forest"
diff --git a/R/percent-rank.R b/R/percent-rank.R
@@ -0,0 +1,185 @@
+# The handwriterRF R package performs writership analysis of handwritten
+# documents. Copyright (C) 2024 Iowa State University of Science and Technology
+# on behalf of its Center for Statistics and Applications in Forensic Evidence
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation, either version 3 of the License, or (at your option) any later
+# version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+# details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program.  If not, see <https://www.gnu.org/licenses/>.
+
+
+# External Functions ------------------------------------------------------
+
+#' Calculate Percent Ranks
+#'
+#' Compares two handwriting samples scanned and saved a PNG images with the
+#' following steps:
+#' \enumerate{
+#'     \item \code{\link[handwriter]{processDocument}} splits the writing in both samples into component shapes, or graphs.
+#'     \item \code{\link[handwriter]{get_clusters_batch}} groups the graphs into clusters of similar shapes.
+#'     \item \code{\link[handwriter]{get_cluster_fill_counts}} counts the number of graphs assigned to each cluster.
+#'     \item \code{\link{get_cluster_fill_rates}} calculates the proportion of graphs assigned to each cluster. The cluster fill rates serve as a writer profile.
+#'     \item A similarity score is calculated between the cluster fill rates of the two documents using a random forest trained with \pkg{ranger}.
+#'     \item The similarity score is compared to reference samples of same writer and different
+#'     writer similarity scores. The percent rank of the observed similarity score is returned for each sample. The percent rank for score x is
+#'     calculated as the number of scores in the sample less than or equal to x divided by the total number of scores.
+#' }
+#'
+#' @param sample1_path A file path to a handwriting sample saved in PNG file
+#'   format.
+#' @param sample2_path A file path to a second handwriting sample saved in PNG
+#'   file format.
+#' @param rforest Optional. A random forest trained with \pkg{ranger}. If
+#'   rforest is not given, the data object random_forest is used.
+#' @param project_dir Optional. A path to a directory where helper files will be
+#'   saved. If no project directory is specified, the helper files will be saved
+#'   to tempdir() and deleted before the function terminates.
+#'
+#' @return A list of two numbers
+#'
+#' @export
+#'
+#' @examples
+#' \donttest{
+#' # Compare two samples from the same writer
+#' sample1 <- system.file(file.path("extdata", "w0030_s01_pWOZ_r01.png"), package = "handwriterRF")
+#' sample2 <- system.file(file.path("extdata", "w0030_s01_pWOZ_r02.png"), package = "handwriterRF")
+#' calculate_slr(sample1, sample2)
+#'
+#' # Compare samples from two writers
+#' sample1 <- system.file(file.path("extdata", "w0030_s01_pWOZ_r01.png"), package = "handwriterRF")
+#' sample2 <- system.file(file.path("extdata", "w0238_s01_pWOZ_r02.png"), package = "handwriterRF")
+#' calculate_slr(sample1, sample2)
+#' }
+#'
+calculate_percent_rank <- function(sample1_path, sample2_path, rforest = random_forest, project_dir = NULL) {
+  copy_samples_to_project_dir <- function(sample1_path, sample2_path, project_dir) {
+    # Copy samples to project_dir > docs
+    message("Copying samples to output directory > docs...\n")
+    create_dir(file.path(project_dir, "docs"))
+
+    # rename samples if file paths are different but file names are the same
+    if (identical(basename(sample1_path), basename(sample2_path))){
+      file.copy(sample1_path, file.path(project_dir, "docs", "sample1.png"))
+      file.copy(sample2_path, file.path(project_dir, "docs", "sample2.png"))
+    } else {
+      file.copy(sample1_path, file.path(project_dir, "docs", basename(sample1_path)))
+      file.copy(sample2_path, file.path(project_dir, "docs", basename(sample2_path)))
+    }
+
+    # get the sample paths in the project directory
+    sample_paths <- list.files(file.path(project_dir, "docs"), full.names = TRUE)
+    return(sample_paths)
+  }
+
+  skip_if_processed <- function(sample_path, project_dir) {
+    # process file if it hasn't already been processed and saved in project_dir
+    # > graph
+    outfile <- gsub(".png", "_proclist.rds", basename(sample_path))
+    outfile_path <- file.path(project_dir, "graphs", outfile)
+    if (!file.exists(outfile_path)) {
+      doc <- handwriter::processDocument(sample_path)
+      saveRDS(doc, outfile_path)
+    }
+    return()
+  }
+
+  process_and_save_samples <- function(sample1_path, sample2_path, project_dir) {
+    # Process samples and save in project_dir > graphs
+    message("Processing samples...")
+
+    create_dir(file.path(project_dir, "graphs"))
+
+    skip_if_processed(sample_path = sample1_path, project_dir = project_dir)
+    skip_if_processed(sample_path = sample2_path, project_dir = project_dir)
+
+    return()
+  }
+
+  get_percent_rank <- function(score, ref_scores) {
+    # Calculate the percent rank of score for a sample of reference scores
+    pr <- 100 * sum(ref_scores <= score) / length(ref_scores)
+    return(pr)
+  }
+
+  # error if sample1_path == sample2_path
+  if (identical(sample1_path, sample2_path)) {
+    stop("sample1_path and sample2_path cannot be identical.")
+  }
+
+  # set output directory as temp directory if NULL
+  if (is.null(project_dir)) {
+    project_dir <- file.path(tempdir(), "comparison")
+  }
+
+  # keep original sample paths so they can be recorded in the data frame at the
+  # end
+  sample1_path_org <- sample1_path
+  sample2_path_org <- sample2_path
+
+  # copy samples
+  sample_paths <- copy_samples_to_project_dir(
+    sample1_path = sample1_path,
+    sample2_path = sample2_path,
+    project_dir = project_dir
+  )
+  sample1_path <- sample_paths[1]
+  sample2_path <- sample_paths[2]
+
+  # process
+  process_and_save_samples(
+    sample1_path = sample1_path,
+    sample2_path = sample2_path,
+    project_dir = project_dir
+  )
+
+  # cluster
+  clusters <- handwriter::get_clusters_batch(
+    template = templateK40,
+    input_dir = file.path(project_dir, "graphs"),
+    output_dir = file.path(project_dir, "clusters"),
+    writer_indices = c(2, 5),
+    doc_indices = c(7, 18),
+    save_master_file = TRUE
+  )
+  counts <- handwriter::get_cluster_fill_counts(clusters)
+  rates <- get_cluster_fill_rates(counts)
+
+  # distance
+  message("Calculating distance between samples...\n")
+  dist_measures <- which_dists(rforest = rforest)
+  d <- get_distances(df = rates, distance_measures = dist_measures)
+
+  # score
+  message("Calculating similarity score between samples...\n")
+  score <- get_score(rforest = rforest, d = d)
+
+  # percent rank
+  message("Calculating percent ranks for samples...\n")
+  browser()
+  percent_rank_same_writer <- get_percent_rank(score = score, ref_scores = random_forest$scores$same_writer)
+  percent_rank_diff_writer <- get_percent_rank(score = score, ref_scores = random_forest$scores$diff_writer)
+
+  # make data frame of results
+  df <- data.frame("sample1_path" = sample1_path_org, "sample2_path" = sample2_path_org,
+                   "docname1" = basename(sample1_path_org), "docname2" = basename(sample2_path_org),
+                   "score" = score, "percent_rank_same_writer" = percent_rank_same_writer,
+                   "percent_rank_diff_writer" = percent_rank_diff_writer)
+
+  # delete project folder from temp directory or save results to project folder
+  if (project_dir == file.path(tempdir(), "comparison")) {
+    unlink(project_dir, recursive = TRUE)
+  } else {
+    saveRDS(df, file.path(project_dir, "percent_rank.rds"))
+  }
+
+  return(df)
+}
diff --git a/R/plots.R b/R/plots.R
@@ -0,0 +1,87 @@
+# The handwriterRF R package performs writership analysis of handwritten
+# documents. Copyright (C) 2024 Iowa State University of Science and Technology
+# on behalf of its Center for Statistics and Applications in Forensic Evidence
+#
+# This program is free software: you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation, either version 3 of the License, or (at your option) any later
+# version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+# details.
+#
+# You should have received a copy of the GNU General Public License along with
+# this program.  If not, see <https://www.gnu.org/licenses/>.
+
+
+# External Functions ------------------------------------------------------
+
+
+#' Plot Histograms
+#'
+#' Plot histograms of same writer and different writers reference similarity
+#' scores from a random forest created with [train_rf()]. Plot a vertical,
+#' dashed line at a similarity score calculated with [calculate_slr()] to see
+#' whether the score is more typical of the same writer or different writers
+#' reference scores.
+#'
+#' @param rforest A random forest created with [train_rf()]
+#' @param score A similarity score calculated with [calculate_slr()]
+#'
+#' @return A ggplot2 plot of histograms
+#' @export
+#'
+#' @examples
+#' plot_histograms(rforest = random_forest)
+#'
+#' # Add a vertical line 0.1 on the horizontal axis.
+#' plot_histograms(rforest = random_forest, score = 0.1)
+#'
+plot_histograms <- function(rforest, score = NULL) {
+  # Prevent note "no visible binding for global variable"
+  Score <- Group <- NULL
+
+  if (!requireNamespace("ggplot2", quietly = TRUE)) {
+    stop(
+      "Package \"ggplot2\" must be installed to use this function.",
+      call. = FALSE
+    )
+  }
+
+  scores <- rforest$scores
+
+  df <- data.frame(
+    Score = c(scores$same_writer, scores$diff_writer),
+    Group = rep(c("same writer", "different writers"), each = 300)
+  )
+
+  p <- df %>% ggplot2::ggplot(ggplot2::aes(x = Score)) +
+    ggplot2::geom_histogram(position = "identity",
+                            ggplot2::aes(fill = Group),
+                            alpha = 0.4,
+                            bins = 30) +  # Histograms with transparency
+    ggplot2::scale_fill_manual(values = c("same writer" = "#6BA4B8", "different writers" = "#F68D2E")) +  # Customize colors
+    ggplot2::labs(title = "Reference Similarity Scores", x = "Score", y = "Frequency") +
+    ggplot2::theme_bw()
+
+  # Optional - add vertical line at score
+  if (!is.null(score)) {
+    p <- p +
+      ggplot2::geom_vline(xintercept = score,
+                          color = "black",
+                          linetype = "dashed") +  # Add vertical line
+      ggplot2::annotate("text",
+                        x = score,
+                        y = 75,  # Dynamically position the label
+                        label = paste("similarity score", score),
+                        color = "black",
+                        size = 3,
+                        angle = 90,
+                        vjust = -1,
+                        hjust = 0.5)  # Add text annotation
+  }
+
+  return(p)
+}
diff --git a/R/slrs.R b/R/slrs.R
@@ -159,6 +159,7 @@ calculate_slr <- function(sample1_path, sample2_path, rforest = random_forest, p
 
   # SLR
   message("Calculating SLR for samples...\n")
+  rforest$densities <- make_densities_from_rf(scores = rforest$scores)
   numerator <- eval_density_at_point(den = rforest$densities$same_writer, x = score, type = "numerator")
   denominator <- eval_density_at_point(den = rforest$densities$diff_writer, x = score, type = "denominator")
   slr <- numerator / denominator
@@ -255,3 +256,22 @@ eval_density_at_point <- function(den, x, type, zero_correction = 1e-10) {
 
   return(y)
 }
+
+
+#' Make Densities from a Trained Random Forest
+#'
+#' Create densities of same writer and different writer scores produced by a
+#' trained random forest.
+#'
+#' @param scores A list of reference scores created with \code{\link{make_scores_from_rf()}}.
+#'
+#' @return A list of densities
+#'
+#' @noRd
+make_densities_from_rf <- function(scores) {
+  pdfs <- list()
+  pdfs$same_writer <- stats::density(scores$same_writer, kernel = 'gaussian', n = 10000)
+  pdfs$diff_writer <- stats::density(scores$diff_writer, kernel = 'gaussian', n = 10000)
+
+  return(pdfs)
+}