I think this is all we need for Issue #14; more edits per Issue #18 a…

…nd Issue #19
TransBioInfoLab · Mar 29, 2022 · b3bc8c0 · b3bc8c0
1 parent 77522f6
commit b3bc8c0
Show file tree

Hide file tree

Showing 7 changed files with 52 additions and 79 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: coMethDMR
 Title: Accurate identification of co-methylated and differentially methylated
   regions in epigenome-wide association studies
-Version: 0.99.15
+Version: 0.99.16
 Authors@R: c(
   person("Fernanda", "Veitzman", email = "[email protected]", role = c("cre")),
   person("Lissette", "Gomez", email = "[email protected]", role = c("aut")),
@@ -30,13 +30,11 @@ Imports:
   AnnotationHub,
   BiocParallel,
   bumphunter,
-  dplyr,
   ExperimentHub,
   GenomicRanges,
   IRanges,
   lmerTest,
   methods,
-  rlang,
   stats,
   utils
 Suggests: 

diff --git a/NAMESPACE b/NAMESPACE
@@ -36,12 +36,8 @@ importFrom(IRanges,IRanges)
 importFrom(IRanges,subsetByOverlaps)
 importFrom(bumphunter,clusterMaker)
 importFrom(bumphunter,getSegments)
-importFrom(dplyr,filter)
-importFrom(dplyr,group_by)
-importFrom(dplyr,n)
 importFrom(lmerTest,lmer)
 importFrom(methods,is)
-importFrom(rlang,.data)
 importFrom(stats,as.formula)
 importFrom(stats,coef)
 importFrom(stats,cor)

diff --git a/R/WriteCloseByAllRegions.R b/R/WriteCloseByAllRegions.R
@@ -5,10 +5,6 @@
 #' @param regions GRanges of input genomic regions
 #' @param genome Human genome of reference: hg19 or hg38
 #' @param arrayType Type of array: "450k" or "EPIC"
-#' @param manifest_gr A GRanges object with the genome manifest (as returned by
-#'   \code{\link[ExperimentHub]{ExperimentHub}} or by
-#'   \code{\link{ImportSesameData}}). This function by default ignores this
-#'   argument in favour of the \code{genome} and \code{arrayType} arguments.
 #' @param ignoreStrand Whether strand can be ignored, default is TRUE
 #' @param maxGap an integer, genomic locations within maxGap from each other
 #'    are placed into the same cluster
@@ -25,8 +21,6 @@
 #'
 #' @export
 #'
-#' @importFrom dplyr group_by filter n
-#' @importFrom rlang .data
 #' @importFrom GenomicRanges findOverlaps
 #' @importFrom methods is
 #' @examples
@@ -54,19 +48,20 @@ WriteCloseByAllRegions <- function(
   regions,
   genome = c("hg19","hg38"),
   arrayType = c("450k","EPIC"),
-  manifest_gr = NULL,
   ignoreStrand = TRUE,
   maxGap = 200,
   minCpGs = 3,
   ...
 ){
+  # browser()
 
   ###  Check Inputs  ###
   if (maxGap == 200 & minCpGs == 3) {
-    stop(
+    warning(
       "A file of close by CpGs for maxGap = 200 and minCpGs = 3 for genic and
-      intergenic regions already exist at /inst/extdata/",
-      call. = FALSE
+  intergenic regions already exists in /inst/extdata/ or at 
+  <https://github.com/TransBioInfoLab/coMethDMR_data/tree/main/data>",
+      call. = FALSE, immediate. = TRUE
     )
   }
 
@@ -77,77 +72,67 @@ WriteCloseByAllRegions <- function(
     )
   }
 
-  arrayType <- match.arg(arrayType)
-  genome <- match.arg(genome)
-
 
   ###  The GRanges Object  ###
-  if(!is.null(manifest_gr)) {
-    CpGlocations.gr <- manifest_gr
-  } else {
-
-    manifest <- paste(
-      switch(arrayType, "450k" = "HM450", "EPIC" = "EPIC"),
-      genome, "manifest",
-      sep = "."
-    )
-    CpGlocations.gr <- ImportSesameData(manifest)  
-
-  }
+  arrayType <- match.arg(arrayType)
+  genome <- match.arg(genome)
+  manifest <- paste(
+    switch(arrayType, "450k" = "HM450", "EPIC" = "EPIC"),
+    genome, "manifest",
+    sep = "."
+  )
+  CpGlocations.gr <- ImportSesameData(manifest)  
 
 
-  ### Convert input from GRanges to list of vectors of CpGs ###
-  hits <- as.data.frame( findOverlaps(regions, CpGlocations.gr) )
-  # NOTE 2021-10-22: we need to re-write this line to remove the dplyr
-  #   dependency. This is the only place in the whole package where we import
-  #   dplyr or plyr functions.
-  hits <- dplyr::filter(
-    dplyr::group_by(hits, .data$queryHits),
-    dplyr::n() >= 3
+  ###  Convert input from GRanges to list of vectors of CpGs  ###
+  hits_df <- as.data.frame(
+    findOverlaps(query = regions, subject = CpGlocations.gr)
   )
-  hits$probes <- names(CpGlocations.gr)[hits$subjectHits]
-  region3CpGs_ls <- split(hits$probes, hits$queryHits)
+  hits_df$probes <- names(CpGlocations.gr)[hits_df$subjectHits]
+  regionCpGs_ls <- split(hits_df$probes, hits_df$queryHits)
+  regionCpGs_ls[lengths(regionCpGs_ls) < minCpGs] <- NULL
+
 
-  ### Find close by clusters in all the regions ###
-  ### 45.92571 secs for 1000 regions
+  ###  Find close by clusters in all the regions  ###
+  # 45.92571 secs for 1000 regions
   closeByRegions_ls <- lapply(
-    X = region3CpGs_ls,
+    X = regionCpGs_ls,
     FUN = CloseBySingleRegion,
-    genome,
-    arrayType,
-    maxGap,
-    minCpGs
+    manifest_gr = CpGlocations.gr,
+    maxGap = maxGap,
+    minCpGs = minCpGs
   )
 
-  ### Remove 'NULL' elements of the list ###
+  # Remove 'NULL' elements of the list
   closeByRegionsNoNull_ls <- unlist(closeByRegions_ls, recursive = FALSE)
 
-  ### Order CpGs in each cluster by location to name the cluster ###
-  ### 8.202557 secs for 162 regions, after unlisting 1000 regions from previous step
+
+  ###  Order CpGs in each cluster by location to name the cluster  ###
+  # 8.202557 secs for 162 regions, after unlisting 1000 regions
   closeByRegionsOrderedDF_ls <- lapply(
     X = closeByRegionsNoNull_ls,
     FUN = OrderCpGsByLocation,
-    genome,
-    arrayType,
-    manifest_gr,
-    ignoreStrand,
+    manifest_gr = CpGlocations.gr,
+    ignoreStrand = ignoreStrand,
     output = "dataframe"
   )
 
-  ### Name each cluster with genomic region ###
-  closeByRegionsNames_ls <- lapply(
-    closeByRegionsOrderedDF_ls, NameRegion
+  # Name each cluster with genomic region
+  closeByRegionsNames_char <- vapply(
+    closeByRegionsOrderedDF_ls,
+    FUN = NameRegion,
+    FUN.VALUE = character(1)
   )
 
-  ### Order CpGs in each cluster by location ###
+
+  ###  Order CpGs in each cluster by location  ###
   closeByRegionsOrdered_ls <- lapply(
-    closeByRegionsOrderedDF_ls, function(x) x[ , "cpg"]
+    closeByRegionsOrderedDF_ls, FUN = `[[`, "cpg"
   )
-
-  names(closeByRegionsOrdered_ls) <- closeByRegionsNames_ls
+  names(closeByRegionsOrdered_ls) <- closeByRegionsNames_char
 
 
-  ### Select and return output ###
+  ###  Select and return output  ###
   message("Writing to file ", fileName)
   saveRDS(closeByRegionsOrdered_ls, fileName)
 

diff --git a/R/util1_OrderCpGsByLocation.R b/R/util1_OrderCpGsByLocation.R
@@ -50,7 +50,6 @@ OrderCpGsByLocation <- function(
 
     # Available manifest files are
     # "EPIC.hg19.manifest"  "EPIC.hg38.manifest"
-    # "HM27.hg19.manifest"  "HM27.hg38.manifest"
     # "HM450.hg19.manifest" "HM450.hg38.manifest"
     manifest <- paste(
       switch(arrayType, "450k" = "HM450", "EPIC" = "EPIC"),
@@ -73,12 +72,12 @@ OrderCpGsByLocation <- function(
   # Snow workers can't find the sort() method for GRanges objects; see
   #   https://github.com/TransBioInfoLab/coMethDMR/issues/13
   CpGs.gr <- sort.GenomicRanges(
-    CpGlocations.gr[ CpGs_char[goodCpGs_lgl] ],
+    x = CpGlocations.gr[ CpGs_char[goodCpGs_lgl] ],
     ignore.strand = ignoreStrand
   )
 
 
-  ### Select and return output ###
+  ###  Select and return output  ###
   if (output == "dataframe") {
 
     CpGsOrdered_df <- as.data.frame(CpGs.gr)[ , c("seqnames", "start")]
@@ -89,4 +88,5 @@ OrderCpGsByLocation <- function(
   } else {
     as.character(names(CpGs.gr))
   }
+
 }
diff --git a/R/util3_SplitCpGsSubregionsDataFrameToList.R b/R/util3_SplitCpGsSubregionsDataFrameToList.R
@@ -55,17 +55,17 @@ SplitCpGDFbyRegion <- function(
     CpGsSubregions_df$ProbeID, CpGsSubregions_df$Subregion
   )
 
-  # Output dataframes with annotation for each subregions
+  # Order each subregion
   subRegionAnnotationDF_ls <- lapply(
     X = subRegion_ls,
     FUN = OrderCpGsByLocation,
-    genome,
-    arrayType,
-    manifest_gr,
+    genome = genome,
+    arrayType = arrayType,
+    manifest_gr = manifest_gr,
     output = "dataframe"
   )
 
-  ### Name the comethylated subregions ###
+  # Name the comethylated subregions
   names(subRegion_ls) <- lapply(subRegionAnnotationDF_ls, NameRegion)
 
   subRegion_ls

diff --git a/R/util4_RegionsToRanges.R b/R/util4_RegionsToRanges.R
@@ -22,7 +22,7 @@ RegionsToRanges <- function(regionName_char){
 
   GRanges(
     seqnames = as.factor(chr),
-    ranges = IRanges(as.numeric(start), as.numeric(end))
+    ranges = IRanges(start = as.numeric(start), end = as.numeric(end))
   )
 
 }
diff --git a/man/WriteCloseByAllRegions.Rd b/man/WriteCloseByAllRegions.Rd