From 2986c6d2b4517fe08cae42f7bd25459162457923 Mon Sep 17 00:00:00 2001 From: trvinh Date: Mon, 26 Aug 2024 14:49:08 +0200 Subject: [PATCH] class2tree checks duplicated taxa in higher levels, resolve #934 --- R/class2tree.R | 29 +++++++++++++++++++++++++++++ tests/testthat/test-class2tree.R | 14 ++++++++++++-- 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/R/class2tree.R b/R/class2tree.R index ff97aa40..28bfcec1 100644 --- a/R/class2tree.R +++ b/R/class2tree.R @@ -396,6 +396,35 @@ rank_indexing <- function (rankList) { #' @author Vinh Tran {tran@bio.uni-frankfurt.de} taxonomy_table_creator <- function (nameList, rankList) { colnames(nameList)[1] <- "tip" + # remove duplicated taxa (e.g. taxa with higher levels, that already belong + # to the taxonomy string of other taxa) + duplicatedTaxa <- lapply( + nameList$X1, + function (x) { + matchs <-data.frame(which(nameList[,-1] == x, arr.ind=TRUE)) + checkHigherRank <- lapply( + matchs$row, + function(y) { + if (nameList[y,]$X1 != x) return(1) + } + ) + if (length(unlist(checkHigherRank)) > 0) + return(strsplit(x, "#", fixed = TRUE)[[1]][1]) + } + ) + if (length(unlist(duplicatedTaxa)) > 0) { + msg <- paste("WARNING:", length(unlist(duplicatedTaxa))) + msg <- "duplicated taxa have been ignored!" + if (length(unlist(duplicatedTaxa)) == 1) + msg <- "duplicated taxon has been ignored!" + if (length(unlist(duplicatedTaxa)) < 5) { + msg <- paste(msg, "Including: ") + msg <- c(msg, paste(unlist(duplicatedTaxa), collapse = "; ")) + } + message(msg) + } + nameList <- nameList[!(nameList$tip %in% unlist(duplicatedTaxa)),] + rankList <- rankList[!(rankList$tip %in% unlist(duplicatedTaxa)),] # get indexed rank list index2RankDf <- rank_indexing(rankList) # get ordered rank list diff --git a/tests/testthat/test-class2tree.R b/tests/testthat/test-class2tree.R index 3d8925bd..70ea8caa 100644 --- a/tests/testthat/test-class2tree.R +++ b/tests/testthat/test-class2tree.R @@ -11,7 +11,8 @@ spnames <- c("Klattia flava", "Trollius sibiricus", "Arachis paraguariensis", dupnames <- c("Mus musculus", "Escherichia coli", "Haloferax denitrificans", "Mus musculus") - +duptaxa <- c("Haliotis", "Haliotis cracherodii", "Haliotis rufescens", + "Megabalanus californicus") test_that("internal functions of class2tree", { skip_on_cran() # uses secrets @@ -76,7 +77,7 @@ test_that("class2tree returns the correct value and class", { anyDuplicated(gsub("\\.\\d+$", "", names(tr$classification))), 0) }) -test_that("class2tree will abort when input contains duplicate taxa", { +test_that("class2tree will abort when input contains duplicated taxa", { skip_on_cran() # uses secrets vcr::use_cassette("class2tree_classification_dup_call", { out <- classification(dupnames, db = "ncbi", messages = FALSE) @@ -84,3 +85,12 @@ test_that("class2tree will abort when input contains duplicate taxa", { expect_error(class2tree(out), "Input list of classifications contains duplicates") }) + +test_that("class2tree detects duplicated taxa in higher levels", { + skip_on_cran() # uses secrets + vcr::use_cassette("class2tree_classification_dup_high_level", { + out <- classification(duptaxa, db = "ncbi", messages = FALSE) + }) + tree <- class2tree(out) + expect_true(nrow(tree$classification) < length(duptaxa)) +}) \ No newline at end of file