From f164c6e9ea53a9f8e48f291095173f70f0b17960 Mon Sep 17 00:00:00 2001 From: gabrielodom Date: Tue, 13 Jul 2021 14:30:29 -0400 Subject: [PATCH] Resolved Issue #46; updated and organized package dependencies (and removed tibble::, added tidyr:: dependency); fixed various formatting/style/workflow errors in lookup*() functions; added warning to lookup_syn() for more than one substance; added tests. --- DESCRIPTION | 10 ++--- NAMESPACE | 5 ++- R/lookup.R | 46 +++++++++++++++----- R/lookup_syn.R | 73 +++++++++++++++++++++++--------- R/parse.R | 4 +- man/lookup.Rd | 6 ++- tests/testthat/test-lookup.R | 16 +++++++ tests/testthat/test-lookup_syn.R | 53 ++++++++++++++++------- 8 files changed, 158 insertions(+), 55 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d9de4c1..73594df 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -38,21 +38,21 @@ Depends: Imports: dplyr, magrittr, + purrr, stats, stringr, - tibble, + tidyr, tidytext, utils Suggests: - purrr, - knitr, - rmarkdown, conflicted, + knitr, readr, + rmarkdown, rvest, sqldf, - tidyr, testthat, + tibble, usethis, xml2 VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index 0bbb2a6..45b9d17 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -8,10 +8,13 @@ importFrom(dplyr,anti_join) importFrom(dplyr,case_when) importFrom(dplyr,filter) importFrom(dplyr,mutate) +importFrom(dplyr,rename) +importFrom(dplyr,select) importFrom(magrittr,"%>%") +importFrom(purrr,map_df) importFrom(stats,na.omit) importFrom(stringr,str_detect) importFrom(stringr,str_replace_all) -importFrom(tibble,tibble) +importFrom(tidyr,drop_na) importFrom(tidytext,unnest_tokens) importFrom(utils,data) diff --git a/R/lookup.R b/R/lookup.R index 8281de3..10b35d8 100644 --- a/R/lookup.R +++ b/R/lookup.R @@ -12,9 +12,15 @@ #' for in column \code{category}? Defaults to TRUE. #' @param searchSynonym Should the substances listed in \code{...} be searched #' for in column \code{synonym}? Defaults to TRUE. +#' @param dropUnmatched Should words in \code{drug_vec} which do not match +#' substances in our database be dropped from the table? Defaults to FALSE. #' #' @return A lookup table with category \code{data.frame} having four columns: #' original search term, drug class, drug category, and drug street name. +#' +#' @importFrom purrr map_df +#' @importFrom tidyr drop_na +#' #' @export #' #' @examples @@ -23,23 +29,34 @@ lookup <- function(drug_vec = NULL, ..., searchClass = TRUE, searchCategory = TRUE, - searchSynonym = TRUE) { + searchSynonym = TRUE, + dropUnmatched = FALSE) { + # browser() thingy <- c(drug_vec, as.character(list(...))) thingy_char <- vapply( X = as.character(thingy), FUN = tolower, FUN.VALUE = character(1), - USE.NAMES = FALSE) + USE.NAMES = FALSE + ) # lookup individual words - answer <- purrr::map_df(thingy_char, .lookup, - searchClass, - searchCategory, - searchSynonym) + answer <- map_df( + .x = thingy_char, + .f = .lookup, + searchClass, + searchCategory, + searchSynonym + ) row.names(answer) <- NULL + + if (dropUnmatched) { + answer <- drop_na(answer, "class":"synonym") + } + answer } @@ -68,13 +85,22 @@ lookup <- function(drug_vec = NULL, ..., # Combine row match logic (use OR for base FALSE layer) matches_lgl <- classRowMatches | categoryRowMatches | synonymRowMatches - answer <- DOPE::lookup_df[matches_lgl,, drop = FALSE] + answer <- DOPE::lookup_df[matches_lgl, , drop = FALSE] if (nrow(answer) == 0){ - answer <- data.frame(original_word = x, class = NA_character_, category = NA_character_, - synonym = NA_character_) + answer <- data.frame( + original_word = x, + class = NA_character_, + category = NA_character_, + synonym = NA_character_ + ) } else{ - answer <- data.frame(original_word = x, answer) + answer <- data.frame( + original_word = x, + answer + ) } + answer + } diff --git a/R/lookup_syn.R b/R/lookup_syn.R index 02a4ac4..ca88796 100644 --- a/R/lookup_syn.R +++ b/R/lookup_syn.R @@ -7,58 +7,89 @@ #' #' @return A lookup table with category \code{data.frame} having three columns: #' drug class, drug category match, and synonym name. +#' +#' @importFrom magrittr %>% +#' @importFrom dplyr rename select filter +#' #' @export #' #' @examples #' lookup_syn("zip") lookup_syn <- function(drug_name) { + # browser() #binding vars to function category <- original_word <- synonym <- NULL + # Make sure only a single word is supplied (this function is not designed to + # be vectorised) + if (length(drug_name) > 1L) { + drug_name <- drug_name[1] + warning("Only the first element of drug_name will be used. + If you have multiple substances to query, please use lookup() instead.") + } + # Make sure drug_name is a string if (is.character(drug_name)){ + # make lowercase and remove leading and trailing whitespace - drug_name <- tolower(drug_name) %>% trimws() + drug_name <- + tolower(drug_name) %>% + trimws() + } else { stop("drug_name should be a string of one drug name") } # lookup individual words - matches <- DOPE::lookup(drug_name, searchClass=TRUE, - searchCategory=TRUE, - searchSynonym=TRUE) + matches_df <- DOPE::lookup( + drug_name, + searchClass = TRUE, + searchCategory = TRUE, + searchSynonym = TRUE + ) - cat_matches <- unique(matches[,c("category")]) + drugCateg_char <- matches_df[ , "category", drop = TRUE] + cat_matches <- unique(drugCateg_char) + + if (length(cat_matches) == 1) { + + answer <- lookup( + matches_df$category, + searchClass = TRUE, + searchCategory = TRUE, + searchSynonym = FALSE + ) - if (length(cat_matches) == 1){ - answer <- lookup(matches$category, searchClass=TRUE, - searchCategory=TRUE, - searchSynonym = FALSE) - answer <- answer %>% - dplyr::rename("category_match" = category) %>% - dplyr::select(-original_word) - answer <- subset(answer, synonym != drug_name) - } else if(drug_name %in% matches[,c("category")] ){ - answer <- matches[matches$category == drug_name, ] answer <- answer %>% - dplyr::rename("category_match" = category) %>% - dplyr::select(-original_word) - answer <- subset(answer, synonym != drug_name) + rename("category_match" = category) %>% + select(-original_word) %>% + filter(synonym != drug_name) + + } else if (drug_name %in% drugCateg_char) { + + answer <- matches_df %>% + filter(category == drug_name) %>% + rename("category_match" = category) %>% + select(-original_word) %>% + filter(synonym != drug_name) + } else { + message( "Your search matched multiple categories. Please choose one \n", "from the following list and refine your search. Example: \n", -"lookup_syn('", unique(matches[c("category")])[1,1], "') or ", -"lookup_syn('", unique(matches[c("category")])[2,1], "')" +"lookup_syn('", cat_matches[1], "') or ", +"lookup_syn('", cat_matches[2], "')" ) - answer <- unique(matches[c("category")]) + answer <- unique(matches_df["category"]) } answer + } diff --git a/R/parse.R b/R/parse.R index bf03580..168791c 100644 --- a/R/parse.R +++ b/R/parse.R @@ -13,7 +13,7 @@ #' @importFrom stringr str_detect str_replace_all #' @importFrom tidytext unnest_tokens #' @importFrom dplyr mutate filter anti_join case_when -#' @importFrom tibble tibble +#' #' @export #' #' @examples @@ -37,7 +37,7 @@ parse <- function(drug_vec){ #ex. and combination of "speedball" also preserved unnestedDrugs_df <- - tibble(drug_vec) %>% + data.frame(drug_vec) %>% unnest_tokens( word, drug_vec, token = "regex", pattern = "[,|-|+|&| ]", to_lower = TRUE ) %>% diff --git a/man/lookup.Rd b/man/lookup.Rd index 1be69cc..6c35036 100644 --- a/man/lookup.Rd +++ b/man/lookup.Rd @@ -9,7 +9,8 @@ lookup( ..., searchClass = TRUE, searchCategory = TRUE, - searchSynonym = TRUE + searchSynonym = TRUE, + dropUnmatched = FALSE ) } \arguments{ @@ -25,6 +26,9 @@ for in column \code{category}? Defaults to TRUE.} \item{searchSynonym}{Should the substances listed in \code{...} be searched for in column \code{synonym}? Defaults to TRUE.} + +\item{dropUnmatched}{Should words in \code{drug_vec} which do not match +substances in our database be dropped from the table? Defaults to FALSE.} } \value{ A lookup table with category \code{data.frame} having four columns: diff --git a/tests/testthat/test-lookup.R b/tests/testthat/test-lookup.R index 2cdc121..672d502 100644 --- a/tests/testthat/test-lookup.R +++ b/tests/testthat/test-lookup.R @@ -34,6 +34,7 @@ test_that("lookup with dots works", { expect_equal(testOut2_df, lookup("zip", "shrooms")) }) + ### Test Vector input ### test_vector <- c("zip", "shrooms") @@ -41,6 +42,7 @@ test_that("lookup with vector as first argument works", { expect_equal(testOut2_df, lookup(test_vector)) }) + ### Test return of original word ### testOut_df3 <- data.frame( original_word = c("zip"), @@ -53,3 +55,17 @@ testOut_df3 <- data.frame( test_that("the original lookup word is returned in the dataframe", { expect_equal(testOut_df3, lookup("zip")) }) + + +### Test Dropping Rows with No Classification ### +# See https://github.com/CTN-0094/DOPE/issues/46 +testSentence_char <- + "Uh, there is a lot of sexual, sex in the gay community while on MDMA" +parsedSentence_char <- parse(testSentence_char) + +test_that("we can drop unclassified words", { + expect_equal( + lookup(parsedSentence_char, dropUnmatched = TRUE), + lookup("mdma") + ) +}) diff --git a/tests/testthat/test-lookup_syn.R b/tests/testthat/test-lookup_syn.R index 18ffa7f..4642918 100644 --- a/tests/testthat/test-lookup_syn.R +++ b/tests/testthat/test-lookup_syn.R @@ -1,30 +1,53 @@ ### Test for single category match ### testOut_df <- data.frame( - class = rep(c("cannabis"),each=5), - category_match = rep(c("marijuana"), each= 5), + class = rep("cannabis", each = 5), + category_match = rep("marijuana", each = 5), synonym = c("420", "a-bomb", "acapulco gold", "acapulco red", "ace") ) test_that("single category match works", { - expect_equal(testOut_df, head(lookup_syn("draf weed"),5)) + expect_equal( + testOut_df, + head(lookup_syn("draf weed"), 5) + ) +}) + + +### Test that Multiple Entries are Discarded ### +test_that("Drug vector only uses first entry and throws warning", { + expect_warning( + lookup_syn(c("ritalin", "opium")), + "Only the first element" + ) }) + +### Test Drug Category Search ### +# This is the else if() part +test_that("Drug category matches multiple times handled correctly", { + expect_equal( + data.frame( + class = "stimulant", + category_match = "ritalin", + synonym = "kibbles and bits" + ), + lookup_syn("ritalin") + ) +}) + + ### Test with multiple category matches ### testOut_df2 <- data.frame( - category =c( - "amphetamine" - ,"cocaine" - ,"crack" - ,"crack cocaine" - ,"khat" - ,"methamphetamine" - ,"methcathinone" - ,"methylphenidate" - ,"ritalin" - ,"synthetic cathinone" + category = c( + "amphetamine", "cocaine", "crack", "crack cocaine", "khat", + "methamphetamine", "methcathinone", "methylphenidate", "ritalin", + "synthetic cathinone" ) ) test_that("multiple category match works", { - expect_equal(testOut_df2$category, lookup_syn("stimulant")$category) + expect_equal( + testOut_df2$category, + lookup_syn("stimulant")$category + ) })