From f164c6e9ea53a9f8e48f291095173f70f0b17960 Mon Sep 17 00:00:00 2001
From: gabrielodom <gabriel.odom@fiu.edu>
Date: Tue, 13 Jul 2021 14:30:29 -0400
Subject: [PATCH] Resolved Issue #46; updated and organized package
 dependencies (and removed tibble::, added tidyr:: dependency); fixed various
 formatting/style/workflow errors in lookup*() functions; added warning to
 lookup_syn() for more than one substance; added tests.

---
 DESCRIPTION                      | 10 ++---
 NAMESPACE                        |  5 ++-
 R/lookup.R                       | 46 +++++++++++++++-----
 R/lookup_syn.R                   | 73 +++++++++++++++++++++++---------
 R/parse.R                        |  4 +-
 man/lookup.Rd                    |  6 ++-
 tests/testthat/test-lookup.R     | 16 +++++++
 tests/testthat/test-lookup_syn.R | 53 ++++++++++++++++-------
 8 files changed, 158 insertions(+), 55 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index d9de4c1..73594df 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -38,21 +38,21 @@ Depends:
 Imports: 
     dplyr,
     magrittr,
+    purrr,
     stats,
     stringr,
-    tibble,
+    tidyr,
     tidytext,
     utils
 Suggests: 
-    purrr,
-    knitr,
-    rmarkdown,
     conflicted,
+    knitr,
     readr,
+    rmarkdown,
     rvest,
     sqldf,
-    tidyr,
     testthat,
+    tibble,
     usethis,
     xml2
 VignetteBuilder: knitr
diff --git a/NAMESPACE b/NAMESPACE
index 0bbb2a6..45b9d17 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -8,10 +8,13 @@ importFrom(dplyr,anti_join)
 importFrom(dplyr,case_when)
 importFrom(dplyr,filter)
 importFrom(dplyr,mutate)
+importFrom(dplyr,rename)
+importFrom(dplyr,select)
 importFrom(magrittr,"%>%")
+importFrom(purrr,map_df)
 importFrom(stats,na.omit)
 importFrom(stringr,str_detect)
 importFrom(stringr,str_replace_all)
-importFrom(tibble,tibble)
+importFrom(tidyr,drop_na)
 importFrom(tidytext,unnest_tokens)
 importFrom(utils,data)
diff --git a/R/lookup.R b/R/lookup.R
index 8281de3..10b35d8 100644
--- a/R/lookup.R
+++ b/R/lookup.R
@@ -12,9 +12,15 @@
 #'   for in column \code{category}? Defaults to TRUE.
 #' @param searchSynonym Should the substances listed in \code{...} be searched
 #'   for in column \code{synonym}? Defaults to TRUE.
+#' @param dropUnmatched Should words in \code{drug_vec} which do not match
+#'   substances in our database be dropped from the table? Defaults to FALSE.
 #'
 #' @return A lookup table with category \code{data.frame} having four columns:
 #'   original search term, drug class, drug category, and drug street name.
+#'
+#' @importFrom purrr map_df
+#' @importFrom tidyr drop_na
+#'
 #' @export
 #'
 #' @examples
@@ -23,23 +29,34 @@
 lookup <- function(drug_vec = NULL, ...,
                    searchClass = TRUE,
                    searchCategory = TRUE,
-                   searchSynonym = TRUE) {
+                   searchSynonym = TRUE,
+                   dropUnmatched = FALSE) {
+  # browser()
 
   thingy <- c(drug_vec, as.character(list(...)))
   thingy_char <- vapply(
     X = as.character(thingy),
     FUN = tolower,
     FUN.VALUE = character(1),
-    USE.NAMES = FALSE)
+    USE.NAMES = FALSE
+  )
 
   # lookup individual words
 
-  answer <- purrr::map_df(thingy_char, .lookup,
-                          searchClass,
-                          searchCategory,
-                          searchSynonym)
+  answer <- map_df(
+    .x = thingy_char,
+    .f = .lookup,
+    searchClass,
+    searchCategory,
+    searchSynonym
+  )
 
   row.names(answer) <- NULL
+
+  if (dropUnmatched) {
+    answer <- drop_na(answer, "class":"synonym")
+  }
+
   answer
 
 }
@@ -68,13 +85,22 @@ lookup <- function(drug_vec = NULL, ...,
   # Combine row match logic (use OR for base FALSE layer)
   matches_lgl <- classRowMatches | categoryRowMatches | synonymRowMatches
 
-  answer <-  DOPE::lookup_df[matches_lgl,, drop = FALSE]
+  answer <-  DOPE::lookup_df[matches_lgl, , drop = FALSE]
 
   if (nrow(answer) == 0){
-    answer <- data.frame(original_word = x, class = NA_character_,  category = NA_character_,
-                         synonym = NA_character_)
+    answer <- data.frame(
+      original_word = x,
+      class = NA_character_,
+      category = NA_character_,
+      synonym = NA_character_
+    )
   } else{
-    answer <- data.frame(original_word = x, answer)
+    answer <- data.frame(
+      original_word = x,
+      answer
+    )
   }
+
   answer
+
 }
diff --git a/R/lookup_syn.R b/R/lookup_syn.R
index 02a4ac4..ca88796 100644
--- a/R/lookup_syn.R
+++ b/R/lookup_syn.R
@@ -7,58 +7,89 @@
 #'
 #' @return A lookup table with category \code{data.frame} having three columns:
 #'   drug class, drug category match, and synonym name.
+#'
+#' @importFrom magrittr %>%
+#' @importFrom dplyr rename select filter
+#'
 #' @export
 #'
 #' @examples
 #'   lookup_syn("zip")
 
 lookup_syn <- function(drug_name) {
+  # browser()
 
   #binding vars to function
   category <- original_word <- synonym <- NULL
 
+  # Make sure only a single word is supplied (this function is not designed to
+  #   be vectorised)
+  if (length(drug_name) > 1L) {
+    drug_name <- drug_name[1]
+    warning("Only the first element of drug_name will be used.
+  If you have multiple substances to query, please use lookup() instead.")
+  }
+
   # Make sure drug_name is a string
   if (is.character(drug_name)){
+
     # make lowercase and remove leading and trailing whitespace
-    drug_name <- tolower(drug_name) %>% trimws()
+    drug_name <-
+      tolower(drug_name) %>%
+      trimws()
+
   } else {
     stop("drug_name should be a string of one drug name")
   }
 
   # lookup individual words
-  matches <- DOPE::lookup(drug_name, searchClass=TRUE,
-                          searchCategory=TRUE,
-                          searchSynonym=TRUE)
+  matches_df <- DOPE::lookup(
+    drug_name,
+    searchClass = TRUE,
+    searchCategory = TRUE,
+    searchSynonym = TRUE
+  )
 
-  cat_matches <- unique(matches[,c("category")])
+  drugCateg_char <- matches_df[ , "category", drop = TRUE]
+  cat_matches <- unique(drugCateg_char)
+
+  if (length(cat_matches) == 1) {
+
+    answer <- lookup(
+      matches_df$category,
+      searchClass = TRUE,
+      searchCategory = TRUE,
+      searchSynonym = FALSE
+    )
 
-  if (length(cat_matches) == 1){
-    answer <- lookup(matches$category, searchClass=TRUE,
-                     searchCategory=TRUE,
-                     searchSynonym = FALSE)
-    answer <- answer %>%
-      dplyr::rename("category_match" = category) %>%
-      dplyr::select(-original_word)
-    answer <- subset(answer, synonym != drug_name)
-  } else if(drug_name %in% matches[,c("category")] ){
-    answer <- matches[matches$category == drug_name, ]
     answer <- answer %>%
-      dplyr::rename("category_match" = category) %>%
-      dplyr::select(-original_word)
-    answer <- subset(answer, synonym != drug_name)
+      rename("category_match" = category) %>%
+      select(-original_word) %>%
+      filter(synonym != drug_name)
+
+  } else if (drug_name %in% drugCateg_char) {
+
+    answer <- matches_df %>%
+      filter(category == drug_name) %>%
+      rename("category_match" = category) %>%
+      select(-original_word) %>%
+      filter(synonym != drug_name)
+
   } else {
+
     message(
 "Your search matched multiple categories. Please choose one \n",
 "from the following list and refine your search. Example: \n",
-"lookup_syn('", unique(matches[c("category")])[1,1], "') or ",
-"lookup_syn('", unique(matches[c("category")])[2,1], "')"
+"lookup_syn('", cat_matches[1], "') or ",
+"lookup_syn('", cat_matches[2], "')"
 )
 
-    answer <- unique(matches[c("category")])
+    answer <- unique(matches_df["category"])
 
   }
 
   answer
+
 }
 
 
diff --git a/R/parse.R b/R/parse.R
index bf03580..168791c 100644
--- a/R/parse.R
+++ b/R/parse.R
@@ -13,7 +13,7 @@
 #' @importFrom stringr str_detect str_replace_all
 #' @importFrom tidytext unnest_tokens
 #' @importFrom dplyr mutate filter anti_join case_when
-#' @importFrom tibble tibble
+#'
 #' @export
 #'
 #' @examples
@@ -37,7 +37,7 @@ parse <- function(drug_vec){
   #ex. and combination of "speedball" also preserved
 
   unnestedDrugs_df <-
-    tibble(drug_vec) %>%
+    data.frame(drug_vec) %>%
     unnest_tokens(
       word, drug_vec, token = "regex", pattern = "[,|-|+|&| ]", to_lower = TRUE
     ) %>%
diff --git a/man/lookup.Rd b/man/lookup.Rd
index 1be69cc..6c35036 100644
--- a/man/lookup.Rd
+++ b/man/lookup.Rd
@@ -9,7 +9,8 @@ lookup(
   ...,
   searchClass = TRUE,
   searchCategory = TRUE,
-  searchSynonym = TRUE
+  searchSynonym = TRUE,
+  dropUnmatched = FALSE
 )
 }
 \arguments{
@@ -25,6 +26,9 @@ for in column \code{category}? Defaults to TRUE.}
 
 \item{searchSynonym}{Should the substances listed in \code{...} be searched
 for in column \code{synonym}? Defaults to TRUE.}
+
+\item{dropUnmatched}{Should words in \code{drug_vec} which do not match
+substances in our database be dropped from the table? Defaults to FALSE.}
 }
 \value{
 A lookup table with category \code{data.frame} having four columns:
diff --git a/tests/testthat/test-lookup.R b/tests/testthat/test-lookup.R
index 2cdc121..672d502 100644
--- a/tests/testthat/test-lookup.R
+++ b/tests/testthat/test-lookup.R
@@ -34,6 +34,7 @@ test_that("lookup with dots works", {
   expect_equal(testOut2_df, lookup("zip", "shrooms"))
 })
 
+
 ###  Test Vector input  ###
 test_vector <- c("zip", "shrooms")
 
@@ -41,6 +42,7 @@ test_that("lookup with vector as first argument works", {
   expect_equal(testOut2_df, lookup(test_vector))
 })
 
+
 ###  Test return of original word  ###
 testOut_df3 <- data.frame(
   original_word = c("zip"),
@@ -53,3 +55,17 @@ testOut_df3 <- data.frame(
 test_that("the original lookup word is returned in the dataframe", {
   expect_equal(testOut_df3, lookup("zip"))
 })
+
+
+###  Test Dropping Rows with No Classification  ###
+# See https://github.com/CTN-0094/DOPE/issues/46
+testSentence_char <-
+  "Uh, there is a lot of sexual, sex in the gay community while on MDMA"
+parsedSentence_char <- parse(testSentence_char)
+
+test_that("we can drop unclassified words", {
+  expect_equal(
+    lookup(parsedSentence_char, dropUnmatched = TRUE),
+    lookup("mdma")
+  )
+})
diff --git a/tests/testthat/test-lookup_syn.R b/tests/testthat/test-lookup_syn.R
index 18ffa7f..4642918 100644
--- a/tests/testthat/test-lookup_syn.R
+++ b/tests/testthat/test-lookup_syn.R
@@ -1,30 +1,53 @@
 ### Test for single category match ###
 testOut_df <- data.frame(
-  class = rep(c("cannabis"),each=5),
-  category_match = rep(c("marijuana"), each= 5),
+  class = rep("cannabis", each = 5),
+  category_match = rep("marijuana", each = 5),
   synonym = c("420", "a-bomb", "acapulco gold", "acapulco red", "ace")
 )
 
 test_that("single category match works", {
-  expect_equal(testOut_df, head(lookup_syn("draf weed"),5))
+  expect_equal(
+    testOut_df,
+    head(lookup_syn("draf weed"), 5)
+  )
+})
+
+
+###  Test that Multiple Entries are Discarded  ###
+test_that("Drug vector only uses first entry and throws warning", {
+  expect_warning(
+    lookup_syn(c("ritalin", "opium")),
+    "Only the first element"
+  )
 })
 
+
+###  Test Drug Category Search  ###
+# This is the else if() part
+test_that("Drug category matches multiple times handled correctly", {
+  expect_equal(
+    data.frame(
+      class = "stimulant",
+      category_match = "ritalin",
+      synonym = "kibbles and bits"
+    ),
+    lookup_syn("ritalin")
+  )
+})
+
+
 ###  Test with multiple category matches  ###
 testOut_df2 <- data.frame(
-  category =c(
-    "amphetamine"
-    ,"cocaine"
-    ,"crack"
-    ,"crack cocaine"
-    ,"khat"
-    ,"methamphetamine"
-    ,"methcathinone"
-    ,"methylphenidate"
-    ,"ritalin"
-    ,"synthetic cathinone"
+  category = c(
+    "amphetamine", "cocaine", "crack", "crack cocaine", "khat",
+    "methamphetamine", "methcathinone", "methylphenidate", "ritalin",
+    "synthetic cathinone"
   )
 )
 
 test_that("multiple category match works", {
-  expect_equal(testOut_df2$category, lookup_syn("stimulant")$category)
+  expect_equal(
+    testOut_df2$category,
+    lookup_syn("stimulant")$category
+  )
 })