diff --git a/DESCRIPTION b/DESCRIPTION index d4d83ea..7a8001d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: coMethDMR Title: Accurate identification of co-methylated and differentially methylated regions in epigenome-wide association studies -Version: 0.99.15 +Version: 0.99.16 Authors@R: c( person("Fernanda", "Veitzman", email = "fveit001@fiu.edu", role = c("cre")), person("Lissette", "Gomez", email = "lxg255@miami.edu", role = c("aut")), @@ -30,13 +30,11 @@ Imports: AnnotationHub, BiocParallel, bumphunter, - dplyr, ExperimentHub, GenomicRanges, IRanges, lmerTest, methods, - rlang, stats, utils Suggests: diff --git a/NAMESPACE b/NAMESPACE index 753c817..7a3148e 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -36,12 +36,8 @@ importFrom(IRanges,IRanges) importFrom(IRanges,subsetByOverlaps) importFrom(bumphunter,clusterMaker) importFrom(bumphunter,getSegments) -importFrom(dplyr,filter) -importFrom(dplyr,group_by) -importFrom(dplyr,n) importFrom(lmerTest,lmer) importFrom(methods,is) -importFrom(rlang,.data) importFrom(stats,as.formula) importFrom(stats,coef) importFrom(stats,cor) diff --git a/R/WriteCloseByAllRegions.R b/R/WriteCloseByAllRegions.R index eaf075f..149c59f 100644 --- a/R/WriteCloseByAllRegions.R +++ b/R/WriteCloseByAllRegions.R @@ -5,10 +5,6 @@ #' @param regions GRanges of input genomic regions #' @param genome Human genome of reference: hg19 or hg38 #' @param arrayType Type of array: "450k" or "EPIC" -#' @param manifest_gr A GRanges object with the genome manifest (as returned by -#' \code{\link[ExperimentHub]{ExperimentHub}} or by -#' \code{\link{ImportSesameData}}). This function by default ignores this -#' argument in favour of the \code{genome} and \code{arrayType} arguments. #' @param ignoreStrand Whether strand can be ignored, default is TRUE #' @param maxGap an integer, genomic locations within maxGap from each other #' are placed into the same cluster @@ -25,8 +21,6 @@ #' #' @export #' -#' @importFrom dplyr group_by filter n -#' @importFrom rlang .data #' @importFrom GenomicRanges findOverlaps #' @importFrom methods is #' @examples @@ -54,19 +48,20 @@ WriteCloseByAllRegions <- function( regions, genome = c("hg19","hg38"), arrayType = c("450k","EPIC"), - manifest_gr = NULL, ignoreStrand = TRUE, maxGap = 200, minCpGs = 3, ... ){ + # browser() ### Check Inputs ### if (maxGap == 200 & minCpGs == 3) { - stop( + warning( "A file of close by CpGs for maxGap = 200 and minCpGs = 3 for genic and - intergenic regions already exist at /inst/extdata/", - call. = FALSE + intergenic regions already exists in /inst/extdata/ or at + ", + call. = FALSE, immediate. = TRUE ) } @@ -77,77 +72,67 @@ WriteCloseByAllRegions <- function( ) } - arrayType <- match.arg(arrayType) - genome <- match.arg(genome) - ### The GRanges Object ### - if(!is.null(manifest_gr)) { - CpGlocations.gr <- manifest_gr - } else { - - manifest <- paste( - switch(arrayType, "450k" = "HM450", "EPIC" = "EPIC"), - genome, "manifest", - sep = "." - ) - CpGlocations.gr <- ImportSesameData(manifest) - - } + arrayType <- match.arg(arrayType) + genome <- match.arg(genome) + manifest <- paste( + switch(arrayType, "450k" = "HM450", "EPIC" = "EPIC"), + genome, "manifest", + sep = "." + ) + CpGlocations.gr <- ImportSesameData(manifest) - ### Convert input from GRanges to list of vectors of CpGs ### - hits <- as.data.frame( findOverlaps(regions, CpGlocations.gr) ) - # NOTE 2021-10-22: we need to re-write this line to remove the dplyr - # dependency. This is the only place in the whole package where we import - # dplyr or plyr functions. - hits <- dplyr::filter( - dplyr::group_by(hits, .data$queryHits), - dplyr::n() >= 3 + ### Convert input from GRanges to list of vectors of CpGs ### + hits_df <- as.data.frame( + findOverlaps(query = regions, subject = CpGlocations.gr) ) - hits$probes <- names(CpGlocations.gr)[hits$subjectHits] - region3CpGs_ls <- split(hits$probes, hits$queryHits) + hits_df$probes <- names(CpGlocations.gr)[hits_df$subjectHits] + regionCpGs_ls <- split(hits_df$probes, hits_df$queryHits) + regionCpGs_ls[lengths(regionCpGs_ls) < minCpGs] <- NULL + - ### Find close by clusters in all the regions ### - ### 45.92571 secs for 1000 regions + ### Find close by clusters in all the regions ### + # 45.92571 secs for 1000 regions closeByRegions_ls <- lapply( - X = region3CpGs_ls, + X = regionCpGs_ls, FUN = CloseBySingleRegion, - genome, - arrayType, - maxGap, - minCpGs + manifest_gr = CpGlocations.gr, + maxGap = maxGap, + minCpGs = minCpGs ) - ### Remove 'NULL' elements of the list ### + # Remove 'NULL' elements of the list closeByRegionsNoNull_ls <- unlist(closeByRegions_ls, recursive = FALSE) - ### Order CpGs in each cluster by location to name the cluster ### - ### 8.202557 secs for 162 regions, after unlisting 1000 regions from previous step + + ### Order CpGs in each cluster by location to name the cluster ### + # 8.202557 secs for 162 regions, after unlisting 1000 regions closeByRegionsOrderedDF_ls <- lapply( X = closeByRegionsNoNull_ls, FUN = OrderCpGsByLocation, - genome, - arrayType, - manifest_gr, - ignoreStrand, + manifest_gr = CpGlocations.gr, + ignoreStrand = ignoreStrand, output = "dataframe" ) - ### Name each cluster with genomic region ### - closeByRegionsNames_ls <- lapply( - closeByRegionsOrderedDF_ls, NameRegion + # Name each cluster with genomic region + closeByRegionsNames_char <- vapply( + closeByRegionsOrderedDF_ls, + FUN = NameRegion, + FUN.VALUE = character(1) ) - ### Order CpGs in each cluster by location ### + + ### Order CpGs in each cluster by location ### closeByRegionsOrdered_ls <- lapply( - closeByRegionsOrderedDF_ls, function(x) x[ , "cpg"] + closeByRegionsOrderedDF_ls, FUN = `[[`, "cpg" ) - - names(closeByRegionsOrdered_ls) <- closeByRegionsNames_ls + names(closeByRegionsOrdered_ls) <- closeByRegionsNames_char - ### Select and return output ### + ### Select and return output ### message("Writing to file ", fileName) saveRDS(closeByRegionsOrdered_ls, fileName) diff --git a/R/util1_OrderCpGsByLocation.R b/R/util1_OrderCpGsByLocation.R index 68146f5..ebadc75 100644 --- a/R/util1_OrderCpGsByLocation.R +++ b/R/util1_OrderCpGsByLocation.R @@ -50,7 +50,6 @@ OrderCpGsByLocation <- function( # Available manifest files are # "EPIC.hg19.manifest" "EPIC.hg38.manifest" - # "HM27.hg19.manifest" "HM27.hg38.manifest" # "HM450.hg19.manifest" "HM450.hg38.manifest" manifest <- paste( switch(arrayType, "450k" = "HM450", "EPIC" = "EPIC"), @@ -73,12 +72,12 @@ OrderCpGsByLocation <- function( # Snow workers can't find the sort() method for GRanges objects; see # https://github.com/TransBioInfoLab/coMethDMR/issues/13 CpGs.gr <- sort.GenomicRanges( - CpGlocations.gr[ CpGs_char[goodCpGs_lgl] ], + x = CpGlocations.gr[ CpGs_char[goodCpGs_lgl] ], ignore.strand = ignoreStrand ) - ### Select and return output ### + ### Select and return output ### if (output == "dataframe") { CpGsOrdered_df <- as.data.frame(CpGs.gr)[ , c("seqnames", "start")] @@ -89,4 +88,5 @@ OrderCpGsByLocation <- function( } else { as.character(names(CpGs.gr)) } + } diff --git a/R/util3_SplitCpGsSubregionsDataFrameToList.R b/R/util3_SplitCpGsSubregionsDataFrameToList.R index f64cf72..663a3cf 100644 --- a/R/util3_SplitCpGsSubregionsDataFrameToList.R +++ b/R/util3_SplitCpGsSubregionsDataFrameToList.R @@ -55,17 +55,17 @@ SplitCpGDFbyRegion <- function( CpGsSubregions_df$ProbeID, CpGsSubregions_df$Subregion ) - # Output dataframes with annotation for each subregions + # Order each subregion subRegionAnnotationDF_ls <- lapply( X = subRegion_ls, FUN = OrderCpGsByLocation, - genome, - arrayType, - manifest_gr, + genome = genome, + arrayType = arrayType, + manifest_gr = manifest_gr, output = "dataframe" ) - ### Name the comethylated subregions ### + # Name the comethylated subregions names(subRegion_ls) <- lapply(subRegionAnnotationDF_ls, NameRegion) subRegion_ls diff --git a/R/util4_RegionsToRanges.R b/R/util4_RegionsToRanges.R index c955730..c0d7bab 100644 --- a/R/util4_RegionsToRanges.R +++ b/R/util4_RegionsToRanges.R @@ -22,7 +22,7 @@ RegionsToRanges <- function(regionName_char){ GRanges( seqnames = as.factor(chr), - ranges = IRanges(as.numeric(start), as.numeric(end)) + ranges = IRanges(start = as.numeric(start), end = as.numeric(end)) ) } diff --git a/man/WriteCloseByAllRegions.Rd b/man/WriteCloseByAllRegions.Rd index 5051591..ad88fb7 100644 --- a/man/WriteCloseByAllRegions.Rd +++ b/man/WriteCloseByAllRegions.Rd @@ -9,7 +9,6 @@ WriteCloseByAllRegions( regions, genome = c("hg19", "hg38"), arrayType = c("450k", "EPIC"), - manifest_gr = NULL, ignoreStrand = TRUE, maxGap = 200, minCpGs = 3, @@ -26,11 +25,6 @@ saved.} \item{arrayType}{Type of array: "450k" or "EPIC"} -\item{manifest_gr}{A GRanges object with the genome manifest (as returned by -\code{\link[ExperimentHub]{ExperimentHub}} or by -\code{\link{ImportSesameData}}). This function by default ignores this -argument in favour of the \code{genome} and \code{arrayType} arguments.} - \item{ignoreStrand}{Whether strand can be ignored, default is TRUE} \item{maxGap}{an integer, genomic locations within maxGap from each other