Skip to content

Commit

Permalink
I think this is all we need for Issue #14; more edits per Issue #18 a…
Browse files Browse the repository at this point in the history
…nd Issue #19
  • Loading branch information
gabrielodom committed Mar 29, 2022
1 parent 77522f6 commit b3bc8c0
Show file tree
Hide file tree
Showing 7 changed files with 52 additions and 79 deletions.
4 changes: 1 addition & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: coMethDMR
Title: Accurate identification of co-methylated and differentially methylated
regions in epigenome-wide association studies
Version: 0.99.15
Version: 0.99.16
Authors@R: c(
person("Fernanda", "Veitzman", email = "[email protected]", role = c("cre")),
person("Lissette", "Gomez", email = "[email protected]", role = c("aut")),
Expand Down Expand Up @@ -30,13 +30,11 @@ Imports:
AnnotationHub,
BiocParallel,
bumphunter,
dplyr,
ExperimentHub,
GenomicRanges,
IRanges,
lmerTest,
methods,
rlang,
stats,
utils
Suggests:
Expand Down
4 changes: 0 additions & 4 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,8 @@ importFrom(IRanges,IRanges)
importFrom(IRanges,subsetByOverlaps)
importFrom(bumphunter,clusterMaker)
importFrom(bumphunter,getSegments)
importFrom(dplyr,filter)
importFrom(dplyr,group_by)
importFrom(dplyr,n)
importFrom(lmerTest,lmer)
importFrom(methods,is)
importFrom(rlang,.data)
importFrom(stats,as.formula)
importFrom(stats,coef)
importFrom(stats,cor)
Expand Down
99 changes: 42 additions & 57 deletions R/WriteCloseByAllRegions.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,6 @@
#' @param regions GRanges of input genomic regions
#' @param genome Human genome of reference: hg19 or hg38
#' @param arrayType Type of array: "450k" or "EPIC"
#' @param manifest_gr A GRanges object with the genome manifest (as returned by
#' \code{\link[ExperimentHub]{ExperimentHub}} or by
#' \code{\link{ImportSesameData}}). This function by default ignores this
#' argument in favour of the \code{genome} and \code{arrayType} arguments.
#' @param ignoreStrand Whether strand can be ignored, default is TRUE
#' @param maxGap an integer, genomic locations within maxGap from each other
#' are placed into the same cluster
Expand All @@ -25,8 +21,6 @@
#'
#' @export
#'
#' @importFrom dplyr group_by filter n
#' @importFrom rlang .data
#' @importFrom GenomicRanges findOverlaps
#' @importFrom methods is
#' @examples
Expand Down Expand Up @@ -54,19 +48,20 @@ WriteCloseByAllRegions <- function(
regions,
genome = c("hg19","hg38"),
arrayType = c("450k","EPIC"),
manifest_gr = NULL,
ignoreStrand = TRUE,
maxGap = 200,
minCpGs = 3,
...
){
# browser()

### Check Inputs ###
if (maxGap == 200 & minCpGs == 3) {
stop(
warning(
"A file of close by CpGs for maxGap = 200 and minCpGs = 3 for genic and
intergenic regions already exist at /inst/extdata/",
call. = FALSE
intergenic regions already exists in /inst/extdata/ or at
<https://github.com/TransBioInfoLab/coMethDMR_data/tree/main/data>",
call. = FALSE, immediate. = TRUE
)
}

Expand All @@ -77,77 +72,67 @@ WriteCloseByAllRegions <- function(
)
}

arrayType <- match.arg(arrayType)
genome <- match.arg(genome)


### The GRanges Object ###
if(!is.null(manifest_gr)) {
CpGlocations.gr <- manifest_gr
} else {

manifest <- paste(
switch(arrayType, "450k" = "HM450", "EPIC" = "EPIC"),
genome, "manifest",
sep = "."
)
CpGlocations.gr <- ImportSesameData(manifest)

}
arrayType <- match.arg(arrayType)
genome <- match.arg(genome)
manifest <- paste(
switch(arrayType, "450k" = "HM450", "EPIC" = "EPIC"),
genome, "manifest",
sep = "."
)
CpGlocations.gr <- ImportSesameData(manifest)


### Convert input from GRanges to list of vectors of CpGs ###
hits <- as.data.frame( findOverlaps(regions, CpGlocations.gr) )
# NOTE 2021-10-22: we need to re-write this line to remove the dplyr
# dependency. This is the only place in the whole package where we import
# dplyr or plyr functions.
hits <- dplyr::filter(
dplyr::group_by(hits, .data$queryHits),
dplyr::n() >= 3
### Convert input from GRanges to list of vectors of CpGs ###
hits_df <- as.data.frame(
findOverlaps(query = regions, subject = CpGlocations.gr)
)
hits$probes <- names(CpGlocations.gr)[hits$subjectHits]
region3CpGs_ls <- split(hits$probes, hits$queryHits)
hits_df$probes <- names(CpGlocations.gr)[hits_df$subjectHits]
regionCpGs_ls <- split(hits_df$probes, hits_df$queryHits)
regionCpGs_ls[lengths(regionCpGs_ls) < minCpGs] <- NULL


### Find close by clusters in all the regions ###
### 45.92571 secs for 1000 regions
### Find close by clusters in all the regions ###
# 45.92571 secs for 1000 regions
closeByRegions_ls <- lapply(
X = region3CpGs_ls,
X = regionCpGs_ls,
FUN = CloseBySingleRegion,
genome,
arrayType,
maxGap,
minCpGs
manifest_gr = CpGlocations.gr,
maxGap = maxGap,
minCpGs = minCpGs
)

### Remove 'NULL' elements of the list ###
# Remove 'NULL' elements of the list
closeByRegionsNoNull_ls <- unlist(closeByRegions_ls, recursive = FALSE)

### Order CpGs in each cluster by location to name the cluster ###
### 8.202557 secs for 162 regions, after unlisting 1000 regions from previous step

### Order CpGs in each cluster by location to name the cluster ###
# 8.202557 secs for 162 regions, after unlisting 1000 regions
closeByRegionsOrderedDF_ls <- lapply(
X = closeByRegionsNoNull_ls,
FUN = OrderCpGsByLocation,
genome,
arrayType,
manifest_gr,
ignoreStrand,
manifest_gr = CpGlocations.gr,
ignoreStrand = ignoreStrand,
output = "dataframe"
)

### Name each cluster with genomic region ###
closeByRegionsNames_ls <- lapply(
closeByRegionsOrderedDF_ls, NameRegion
# Name each cluster with genomic region
closeByRegionsNames_char <- vapply(
closeByRegionsOrderedDF_ls,
FUN = NameRegion,
FUN.VALUE = character(1)
)

### Order CpGs in each cluster by location ###

### Order CpGs in each cluster by location ###
closeByRegionsOrdered_ls <- lapply(
closeByRegionsOrderedDF_ls, function(x) x[ , "cpg"]
closeByRegionsOrderedDF_ls, FUN = `[[`, "cpg"
)

names(closeByRegionsOrdered_ls) <- closeByRegionsNames_ls
names(closeByRegionsOrdered_ls) <- closeByRegionsNames_char


### Select and return output ###
### Select and return output ###
message("Writing to file ", fileName)
saveRDS(closeByRegionsOrdered_ls, fileName)

Expand Down
6 changes: 3 additions & 3 deletions R/util1_OrderCpGsByLocation.R
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ OrderCpGsByLocation <- function(

# Available manifest files are
# "EPIC.hg19.manifest" "EPIC.hg38.manifest"
# "HM27.hg19.manifest" "HM27.hg38.manifest"
# "HM450.hg19.manifest" "HM450.hg38.manifest"
manifest <- paste(
switch(arrayType, "450k" = "HM450", "EPIC" = "EPIC"),
Expand All @@ -73,12 +72,12 @@ OrderCpGsByLocation <- function(
# Snow workers can't find the sort() method for GRanges objects; see
# https://github.com/TransBioInfoLab/coMethDMR/issues/13
CpGs.gr <- sort.GenomicRanges(
CpGlocations.gr[ CpGs_char[goodCpGs_lgl] ],
x = CpGlocations.gr[ CpGs_char[goodCpGs_lgl] ],
ignore.strand = ignoreStrand
)


### Select and return output ###
### Select and return output ###
if (output == "dataframe") {

CpGsOrdered_df <- as.data.frame(CpGs.gr)[ , c("seqnames", "start")]
Expand All @@ -89,4 +88,5 @@ OrderCpGsByLocation <- function(
} else {
as.character(names(CpGs.gr))
}

}
10 changes: 5 additions & 5 deletions R/util3_SplitCpGsSubregionsDataFrameToList.R
Original file line number Diff line number Diff line change
Expand Up @@ -55,17 +55,17 @@ SplitCpGDFbyRegion <- function(
CpGsSubregions_df$ProbeID, CpGsSubregions_df$Subregion
)

# Output dataframes with annotation for each subregions
# Order each subregion
subRegionAnnotationDF_ls <- lapply(
X = subRegion_ls,
FUN = OrderCpGsByLocation,
genome,
arrayType,
manifest_gr,
genome = genome,
arrayType = arrayType,
manifest_gr = manifest_gr,
output = "dataframe"
)

### Name the comethylated subregions ###
# Name the comethylated subregions
names(subRegion_ls) <- lapply(subRegionAnnotationDF_ls, NameRegion)

subRegion_ls
Expand Down
2 changes: 1 addition & 1 deletion R/util4_RegionsToRanges.R
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ RegionsToRanges <- function(regionName_char){

GRanges(
seqnames = as.factor(chr),
ranges = IRanges(as.numeric(start), as.numeric(end))
ranges = IRanges(start = as.numeric(start), end = as.numeric(end))
)

}
6 changes: 0 additions & 6 deletions man/WriteCloseByAllRegions.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit b3bc8c0

Please sign in to comment.