diff --git a/.Rbuildignore b/.Rbuildignore index 189c0ec..5610b24 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -13,3 +13,4 @@ ^altdoc$ ^_quarto$ ^cran-comments\.md$ +^CRAN-SUBMISSION$ diff --git a/DESCRIPTION b/DESCRIPTION index 518f2db..92bba8a 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,11 +1,11 @@ Package: handwriterRF Type: Package Title: Handwriting Analysis with Random Forests -Version: 1.0.1 +Version: 1.0.2 Authors@R: c(person("Iowa State University of Science and Technology on behalf of its Center for Statistics and Applications in Forensic Evidence", role = c("aut", "cph", "fnd")), person("Stephanie", "Reinders", role = c("aut", "cre"), email = "reinders.stephanie@gmail.com")) Maintainer: Stephanie Reinders <reinders.stephanie@gmail.com> -Description: Perform forensic handwriting analysis of two scanned handwritten documents. This package implements the statistical method described by Madeline Johnson and Danica Ommen (2021) <doi:10.1002/sam.11566>. Similarity measures and a random forest produce a score-based likelihood ratio that quantifies the strength of the evidence in favor of the documents being written by the 'same writer' or 'different writers.' +Description: Perform forensic handwriting analysis of two scanned handwritten documents. This package implements the statistical method described by Madeline Johnson and Danica Ommen (2021) <doi:10.1002/sam.11566>. Similarity measures and a random forest produce a score-based likelihood ratio that quantifies the strength of the evidence in favor of the documents being written by the same writer or different writers. License: GPL (>= 3) Encoding: UTF-8 LazyData: true diff --git a/NEWS.md b/NEWS.md index e8c23fe..29fea90 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,9 @@ +# handwriterRF 1.0.2 + +* Removed quotes around "same writer" and "different writer" in documentation. + +* Removed dontrun{} from the examples for random_forest. Changed example for get_distances() to something that runs in less than 5 seconds and removed dontrun{} from this example. 
The examples for calculate_slr() take longer than 5 seconds to run so dontrun{} was changed to donttest{} for these examples. + # handwriterRF 1.0.1 # handwriterRF 1.0.0 diff --git a/R/data.R b/R/data.R index 1861fcf..c8b949a 100644 --- a/R/data.R +++ b/R/data.R @@ -216,13 +216,11 @@ #' # view the distances data frame #' random_forest$dists #' -#' \dontrun{ #' # plot the same writer density #' plot(random_forest$densities$same_writer) #' #' # plot the different writer density #' plot(random_forest$densities$diff_writer) -#' } #' #' @md "random_forest" diff --git a/R/distances.R b/R/distances.R index 3bc9ad6..662998a 100644 --- a/R/distances.R +++ b/R/distances.R @@ -62,10 +62,8 @@ #' # calculate maximum and Euclidean distances between the first 3 documents in cfr. #' distances <- get_distances(df = cfr[1:3, ], distance_measures = c('max', 'euc')) #' -#' \dontrun{ -#' # calculate absolute and Euclidean distances between all documents in cfr. -#' distances <- get_distances(df = cfr, distance_measures = c('abs', 'euc')) -#' } +#' distances <- get_distances(df = cfr, distance_measures = c('man')) +#' get_distances <- function(df, distance_measures) { dists <- list() diff --git a/R/scores.R b/R/scores.R index 27a6c88..1f0bfe2 100644 --- a/R/scores.R +++ b/R/scores.R @@ -35,7 +35,7 @@ get_score <- function(d, rforest) { get_prop_same_votes <- function(preds) { # Get the proportion of decision trees in the trained random forest that - # predict, or vote, 'same writer'. + # predict (vote) same writer. preds <- as.data.frame(preds) ntrees <- ncol(preds) prop <- rowSums(preds == 2) / ntrees diff --git a/R/slrs.R b/R/slrs.R index a905510..ba4556b 100644 --- a/R/slrs.R +++ b/R/slrs.R @@ -28,9 +28,9 @@ #' \item \code{\link[handwriter]{get_cluster_fill_counts}} counts the number of graphs assigned to each cluster. #' \item \code{\link{get_cluster_fill_rates}} calculates the proportion of graphs assigned to each cluster. The cluster fill rates serve as a writer profile. 
#' \item A similarity score is calculated between the cluster fill rates of the two documents using a random forest trained with \pkg{ranger}. -#' \item The similarity score is compared to reference distributions of 'same writer' and 'different -#' writer' similarity scores. The result is a score-based likelihood ratio that conveys the strength -#' of the evidence in favor of 'same writer' or 'different writer'. For more details, see Madeline +#' \item The similarity score is compared to reference distributions of same writer and different +#' writer similarity scores. The result is a score-based likelihood ratio that conveys the strength +#' of the evidence in favor of same writer or different writer. For more details, see Madeline #' Johnson and Danica Ommen (2021) <doi:10.1002/sam.11566>. #' } #' @@ -49,7 +49,7 @@ #' @export #' #' @examples -#' \dontrun{ +#' \donttest{ #' # Compare two samples from the same writer #' sample1 <- system.file(file.path("extdata", "w0030_s01_pWOZ_r01.png"), package = "handwriterRF") #' sample2 <- system.file(file.path("extdata", "w0030_s01_pWOZ_r02.png"), package = "handwriterRF") diff --git a/R/train.R b/R/train.R index 24cae92..c915f86 100644 --- a/R/train.R +++ b/R/train.R @@ -31,7 +31,7 @@ #' saved. #' @param run_number An integer used for both the set.seed function and to #' distinguish between different runs on the same input data frame. -#' @param downsample Whether to downsample the number of 'different writer' +#' @param downsample Whether to downsample the number of different writer #' distances before training the random forest. If TRUE, the different writer #' distances will be randomly sampled, resulting in the same number of #' different writer and same writer pairs. 
@@ -137,7 +137,7 @@ get_csafe_train_set <- function(df, train_prompt_codes) { #' Make Densities from a Trained Random Forest #' -#' Create densities of 'same writer' and 'different writer' scores produced by a +#' Create densities of same writer and different writer scores produced by a #' trained random forest. #' #' @param rforest A \pkg{ranger} random forest created with \code{\link{train_rf}}. diff --git a/README.qmd b/README.qmd index 74f0a83..b1d97f8 100644 --- a/README.qmd +++ b/README.qmd @@ -58,9 +58,9 @@ The result is a data frame: - *docname1* is the file name of the first sample. - *docname2* is the file name of the second sample. - *score* is the similarity score between the two samples. -- *numerator* is the numerator value of the score-based likelihood ratio. Intuitively, the larger the value the more the similarity score looks like the reference 'same writer' similarity scores. -- *denominator* is the denominator value of the score-based likelihood ratio. Intuitively, the larger the value the more the similarity score looks like the reference 'different writers' similarity scores. -- *slr* is a score-based likelihood ratio that quantifies the strength of evidence in favor of 'same writer' or 'different writer.' +- *numerator* is the numerator value of the score-based likelihood ratio. Intuitively, the larger the value the more the similarity score looks like the reference same writer similarity scores. +- *denominator* is the denominator value of the score-based likelihood ratio. Intuitively, the larger the value the more the similarity score looks like the reference different writers similarity scores. +- *slr* is a score-based likelihood ratio that quantifies the strength of evidence in favor of same writer or different writer. Display the slr data frame. We hide the file path columns here so that the data frame fits on this page. 
diff --git a/cran-comments.md b/cran-comments.md index c5e174e..fcbd4f2 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,10 +1,9 @@ ## Resubmission This is a resubmission. In this version I have: -* Fixed error in test "Train random forest works with ranger package" that -occurred on Debian. Despite setting the random number generator seed, -the random forest created on Debian has reasonable values but is not equal to the random forest created on a Mac and used in the test as the expected output. Now the -test instead checks that the function runs without error. +* Removed quotes around "same writer" and "different writer" in documentation. + +* Removed dontrun{} from the examples for random_forest. Changed example for get_distances() to something that runs in less than 5 seconds and removed dontrun{} from this example. The examples for calculate_slr() take longer than 5 seconds to run so dontrun{} was changed to donttest{} for these examples. ## R CMD check results diff --git a/man/calculate_slr.Rd b/man/calculate_slr.Rd index 4ab76a4..19dee7d 100644 --- a/man/calculate_slr.Rd +++ b/man/calculate_slr.Rd @@ -37,14 +37,14 @@ following steps: \item \code{\link[handwriter]{get_cluster_fill_counts}} counts the number of graphs assigned to each cluster. \item \code{\link{get_cluster_fill_rates}} calculates the proportion of graphs assigned to each cluster. The cluster fill rates serve as a writer profile. \item A similarity score is calculated between the cluster fill rates of the two documents using a random forest trained with \pkg{ranger}. - \item The similarity score is compared to reference distributions of 'same writer' and 'different - writer' similarity scores. The result is a score-based likelihood ratio that conveys the strength - of the evidence in favor of 'same writer' or 'different writer'. For more details, see Madeline + \item The similarity score is compared to reference distributions of same writer and different + writer similarity scores. 
The result is a score-based likelihood ratio that conveys the strength + of the evidence in favor of same writer or different writer. For more details, see Madeline Johnson and Danica Ommen (2021) <doi:10.1002/sam.11566>. } } \examples{ -\dontrun{ +\donttest{ # Compare two samples from the same writer sample1 <- system.file(file.path("extdata", "w0030_s01_pWOZ_r01.png"), package = "handwriterRF") sample2 <- system.file(file.path("extdata", "w0030_s01_pWOZ_r02.png"), package = "handwriterRF") diff --git a/man/get_distances.Rd b/man/get_distances.Rd index a55f163..b76d883 100644 --- a/man/get_distances.Rd +++ b/man/get_distances.Rd @@ -51,8 +51,6 @@ The cosine distance between two n-length vectors of cluster fill rates, a and b, # calculate maximum and Euclidean distances between the first 3 documents in cfr. distances <- get_distances(df = cfr[1:3, ], distance_measures = c('max', 'euc')) -\dontrun{ -# calculate absolute and Euclidean distances between all documents in cfr. -distances <- get_distances(df = cfr, distance_measures = c('abs', 'euc')) -} +distances <- get_distances(df = cfr, distance_measures = c('man')) + } diff --git a/man/random_forest.Rd b/man/random_forest.Rd index e818c9d..6dbc14a 100644 --- a/man/random_forest.Rd +++ b/man/random_forest.Rd @@ -38,13 +38,11 @@ random_forest$rf # view the distances data frame random_forest$dists -\dontrun{ # plot the same writer density plot(random_forest$densities$same_writer) # plot the different writer density plot(random_forest$densities$diff_writer) -} } \keyword{datasets} diff --git a/man/train_rf.Rd b/man/train_rf.Rd index 71f2f46..13d6a2d 100644 --- a/man/train_rf.Rd +++ b/man/train_rf.Rd @@ -28,7 +28,7 @@ saved.} \item{run_number}{An integer used for both the set.seed function and to distinguish between different runs on the same input data frame.} -\item{downsample}{Whether to downsample the number of 'different writer' +\item{downsample}{Whether to downsample the number of different writer distances before training the random 
forest. If TRUE, the different writer distances will be randomly sampled, resulting in the same number of different writer and same writer pairs.}