nsheff · joseverdezoto · Oct 21, 2019 · Oct 21, 2019 · Oct 22, 2019 · Oct 22, 2019
diff --git a/.travis.yml b/.travis.yml
@@ -3,6 +3,7 @@ language: r
 r: devel
 sudo: false
 cache: packages
+warnings_are_errors: false
 use_bioc: true
 bioc_required: true
 
diff --git a/R/loadPEPdb.R b/R/loadPEPdb.R
@@ -0,0 +1,73 @@
+############################################################################
+# PEP-loading function 
+
+############################################################################
+
+
+#' Given a PEP-formatted database, this function will display its metadata and 
+#' load the genomic regions for runLOLA() overlap calculations
+#' @param configPath Psth to the PEP config.yaml file  
+#' @param useCache   uses simpleCache to cache and load the results    
+#' @return A list containing the config location, config file,   
+#' samples annotation sheet and the regions Granges list 
+#' @export
+#' @examples 
+#' ConfigPath = system.file("extdata/hg19/ucsc_examplePEP/project_config.yaml",
+#'                         package="LOLA")
+#' PEPdb = loadPEPdb(ConfigPath)
+#' 
+
+
+
+loadPEPdb = function(configPath, useCache=TRUE){
+	configLoc = file.path(path=configPath)
+	if (!file.exists(configLoc)) {
+		stop("Could not find .yaml config file")
+	} else {
+		# Use pepr to read in the PEP metadata
+		pepObject = pepr::Project(file = configLoc)
+		configFile = pepr::config(pepObject)
+		smpl = pepr::samples(pepObject)
+		# Rename columns to make annotation LOLA compatible 
+		colnames(smpl)[colnames(smpl)=="file_name"] = "filename"
+		colnames(smpl)[colnames(smpl)=="cell_type"] = "cellType"
+		colnames(smpl)[colnames(smpl)=="exp_protocol"] = "collection"
+		colnames(smpl)[colnames(smpl)=="data_source"] = "dataSource"
+		# Make samples annotation a df and lapply through each file path
+		samplesdf = as.data.frame(smpl)
+		configFolder = dirname(configPath)
+		if (useCache & requireNamespace("simpleCache", quietly=TRUE)){
+			simpleCache::simpleCache("chromRanges", { 
+			lapply(samplesdf$output_file_path, LOLA::readBed)}, 
+			cacheDir=file.path(path=configFolder), 
+			recreate=FALSE) 
+		} else {
+			if (nrow(samplesAnnotation) > 100) {
+				# tell the user they should install simpleCache 
+				message("You should install simpleCache 
+					to save time when you load the 
+					database next")
+			}
+			chromRanges = lapply(samplesdf$output_file_path, LOLA::readBed)    
+		}
+		regions = GRangesList(chromRanges)
+		igdDBlocation = configFile$iGD_db
+		igdIndex = fread(configFile$iGD_index)
+		igdIndexdf = as.data.frame(igdIndex)
+		return(list(configLocation = configLoc,
+				configYAML = configFile,
+				regionAnno = smpl[, -c("genome")],
+				regionGRL = regions,
+				iGDRefDatabase = igdDBlocation,
+				iGDRefIndex = igdIndexdf))
+	}
+}
+
+
+
+
+
+
+
+
+
diff --git a/R/runLOLA_iGD.R b/R/runLOLA_iGD.R
@@ -0,0 +1,248 @@
+######################################################################
+# ENRICHMENT - Actual workhorse enrichment calculation functions
+######################################################################
+
+#' Enrichment Calculation
+#'
+#' Workhorse function that uses iGD to calculate overlaps between userSets,
+#' and then uses a fisher's exact test rank them by significance
+#' of the overlap.
+#'
+#' @param userSets		Regions of interest (Can be GRanges objects or a list)
+#' @param userUniverse	Regions tested for inclusion in userSets (Can be a GRanges object)
+#' @param pepRegionDB	Region DB to check for overlap, output list object from loadPEPdb()
+#' @param cores	Number of processors
+#' @param redefineUserSets	run redefineUserSets() on your userSets?
+#' @param direction    Defaults to "enrichment", but may also accept
+#'     "depletion", which will swap the direction of the fisher test (use
+#'     'greater' or less' value passed to the 'alternative' option of
+#'     fisher.test)
+#' @return Data.table with enrichment results. Rows correspond to individual
+#' pairwise fisher's tests comparing a single userSet with a single databaseSet.
+#' The columns in this data.table are: userSet and dbSet: index into their
+#' respective input region sets. pvalueLog: -log10(pvalue) from the fisher's exact
+#' result; oddsRatio: result from the fisher's exact test; support: number of
+#' regions in userSet overlapping databaseSet; rnkPV, rnkOR, rnkSup: rank in this
+#' table of p-value, oddsRatio, and Support respectively. The --value is the
+#' negative natural log of the p-value returned from a one-sided fisher's exact
+#' test. maxRnk, meanRnk: max and mean of the 3 previous ranks, providing a
+#' combined ranking system. b, c, d: 3 other values completing the 2x2 contingency
+#' table (with support). The remaining columns describe the dbSet for the row.
+#'
+#' If you have the qvalue package installed from bioconductor, runLOLA will add
+#' a q-value transformation to provide FDR scores automatically.
+#' @export
+#' @example
+#' R/examples/example.R
+
+
+
+####### Define new runLOLA function
+runLOLA2 = function(userSets, userUniverse, pepRegionDB, cores=1,
+                   redefineUserSets=FALSE, direction="enrichment") {
+
+  ### Data sanity checks ###
+  #Confirm we received GRangesList objects, convert from GRanges obj if necessary.
+
+  if(!(is(userSets, "GRanges") || is(userSets, "GRangesList"))) {
+    stop("userSets should be a GRanges object or GRanges list. Check object class")
+  } else if (is(userSets, "GRanges")) {
+    userSets = GRangesList(userSets)
+  }
+  if(!(is(userUniverse, "GRanges"))) {
+    stop("userUniverse should be a GRanges object. Check object class")
+  }
+
+
+  # Silence R CMD check Notes:
+  support=d=b=userSet=pValueLog=rnkSup=rnkPV=rnkOR=NULL
+  oddsRatio=maxRnk=meanRnk=dbSet=description=NULL
+  annotationDT = pepRegionDB$regionAnno
+  IGDreferenceLoc = pepRegionDB$iGDRefDatabase
+  IGDindex = pepRegionDB$iGDRefIndex
+
+  if (direction == "depletion") {
+    fisherAlternative = "less"
+  } else {
+    fisherAlternative = "greater"
+  }
+
+  annotationDT[, dbSet := seq_len(nrow(annotationDT))]
+  setkey(annotationDT, dbSet)
+
+
+  #setLapplyAlias(cores)
+
+
+  #if (any(is.null(names(testSetsGRL)))) {
+    #names(testSetsGRL) = seq_along(testSetsGRL)
+  #}
+
+  if (redefineUserSets) { #redefine user sets in terms of universe?
+    userSets =	redefineUserSets(userSets, userUniverse, cores=cores)
+    userSets = listToGRangesList(userSets)
+  }
+
+  if (! any( isDisjoint( userSets) ) ) {
+    message("You have non-disjoint userSets.")
+  }
+
+  ### Construct significance tests ###
+  message("Calculating unit set overlaps...")
+
+
+  ##########################
+  # iGD OVERLAP CALCULATIONS
+
+  # OLD WAY geneSetDatabaseOverlap =
+  #lapplyAlias( (userSets), countOverlapsRev, testSetsGRL, minoverlap=minOverlap)
+
+
+  # Produce a vector to match order of overlaps with order of files in annotation
+  igdIndexFiles = IGDindex$File
+  annotation = pepRegionDB$regionAnno
+  AnnoFileNames = sapply(annotation$output_file_path, basename)
+  correctRefOrder = match(AnnoFileNames, igdIndexFiles)
+
+  # Iterate through userSets to convert each GRanges obj into a df.
+  # Convert iGD db to an IGDr object and perform the overlap calculation.
+  # For each user set, the following overlap calculation will return a vector 
+  # of length nrow(IGDindex) with the number of regions in each set overlapping 
+  # bins in the iGD database.
+
+  userSetsLength = unlist(lapply((userSets), length))
+  userSetsData = lapply(userSets, as.data.frame)
+  IGDrefDB = IGDr::IGDr(IGDreferenceLoc)
+  IGDoverlapList = list()
+  for (i in seq_along(userSetsData)) {
+    chroms = userSetsData[[i]]$seqnames
+    starts = userSetsData[[i]]$start
+    ends = userSetsData[[i]]$end
+    userSetLength = nrow(userSetsData[[i]])
+    IGDoverlapList[[i]] = IGDr::search_nr(IGDrefDB, userSetLength, chroms, starts, ends)
+    IGDoverlapList[[i]] = IGDoverlapList[[i]][correctRefOrder]
+  }
+
+
+  # This will become "support" -- the number of regions in the
+  # userSet (which I implicitly assume is ALSO the number of regions
+  # in the universe) that overlap anything in each database set.
+  # Turn results into an overlap matrix. It is
+  # dbSets (rows) by userSets (columns), counting overlap.
+
+  #overlapMatrix = do.call(cbind, IGDrefDatabaseOverlap)
+  IGDoverlapMatrix = do.call(cbind, IGDoverlapList)
+
+  message("Calculating universe set overlaps...")
+  # Now for each test set, how many items *in the universe* does
+  # it overlap? This will go into the calculation for c
+
+  #faster. Returns number of items in userUniverse.
+
+  # Convert useruniverse into a df and perform overlap calculations with igd ref database.
+
+  universeData = as.data.frame(userUniverse, row.names=seq_along(userUniverse)) # universe names req below
+  universeNames = universeData$seqnames
+  universeStart = universeData$start
+  universeEnd = universeData$end
+  universeRegionN = nrow(universeData)
+
+  IGDuniverseOverlap = IGDr::search_nr(IGDrefDB, universeRegionN, 
+                                       universeNames, universeStart, universeEnd)
+  IGDuniverseOverlap = IGDuniverseOverlap[correctRefOrder]
+  names(IGDuniverseOverlap) = seq_along(IGDuniverseOverlap)
+
+  # Returns number of items in test set (not used:)
+  #testSetsOverlapUniverse = countOverlapsAny(testSetsGRL, userUniverse)
+  # Total size of the universe (could get size of either original GRanges or df since only 1 file)
+  universeLength = length(userUniverse)
+
+  # To build the fisher matrix, support is 'a'
+
+  scoreTable = data.table(reshape2::melt(t(IGDoverlapMatrix), variable.factor=FALSE))
+
+  setnames(scoreTable, c("Var1", "Var2", "value"), c("userSet", "dbSet", "support"))
+
+  # reshape2 has an annoying habit of converting strings into factors, which
+  # is undesirable. If the userSets are named with strings, make sure they stay
+  # character. Integers are already handled appropriately.
+
+  if ("factor" %in% class(scoreTable[, userSet])) {
+    scoreTable$userSet = as.character(scoreTable$userSet)
+  }
+
+
+  message("Calculating Fisher scores...")
+  # b = the # of items *in the universe* (1 bed file) that overlap each dbSet (bed file),
+  # less the support; This is the number of items in the universe
+  # that are in the dbSet ONLY (not in userSet)
+  # c = the size of userSet, less the support; This is the opposite:
+  # Items in the userSet ONLY (not in the dbSet)
+
+  scoreTable[,c("b", "c"):=list(
+    b=IGDuniverseOverlap[match(dbSet, names(IGDuniverseOverlap))]-support, c=userSetsLength-support)]
+
+
+  # This is the regions in the universe, but not in dbSet nor userSet.
+  scoreTable[,d:=universeLength-support-b-c]
+  if( scoreTable[,any(b<0)] ) { # Inappropriate universe.
+    warning(cleanws("Negative b entry in table. This means either: 1) Your user sets
+                    contain items outside your universe; or 2) your universe has a region that
+                    overlaps multiple user set regions, interfering with the universe set overlap
+                    calculation."))
+
+    return(scoreTable)
+  }
+  if( scoreTable[,any(c<0)] ) {
+    warning("Negative c entry in table. Bug with userSetsLength; this should not happen.")
+    return(scoreTable)
+  }
+
+  # Code for Fisher test looks fine as it is.
+  scoreTable[,c("pValueLog", "oddsRatio") := 
+               fisher.test(matrix(c(support,b,c,d), 2, 2), alternative=fisherAlternative)[c("p.value", "estimate")], by=list(userSet,dbSet)]
+
+  # Include qvalue if package exists.
+  if (requireNamespace("qvalue", quietly=TRUE)) {
+    # Wrap in try block since this is not vital.
+    # if you want qvalues...
+    tryCatch( {
+      scoreTable[,qValue:=qvalue::qvalue(pValueLog)$qvalue]
+    }, error = function(e) { warning("Problem in FDR calculation with qvalue.") })
+  } else {
+    # Another possibility for the future:
+    # scoreTable[,qValue:=qValues = pmin(pValues*length(pValues),1)]
+  }
+  scoreTable[, pValueLog:=-log10(pValueLog + 10^-322)]
+  ### Finalize and Rank results ###
+  scoreTable[, rnkSup:=rank(-support, ties.method="min"), by=userSet]
+  scoreTable[, rnkPV:=rank(-pValueLog, ties.method="min"), by=userSet]
+  scoreTable[, rnkOR:=rank(-oddsRatio, ties.method="min"), by=userSet]
+  scoreTable[, maxRnk:=max(c(rnkSup, rnkPV, rnkOR)), by=list(userSet,dbSet)]
+  scoreTable[, meanRnk:=signif(mean(c(rnkSup, rnkPV, rnkOR)), 3), by=list(userSet,dbSet)]
+
+  # Append description column
+  setkeyv(scoreTable, "dbSet")
+  scoreTable = scoreTable[annotationDT]
+
+  # limit description to 80 characters
+  scoreTable[,description:=substr(description, 0, 80)]
+
+  orderedCols = c("userSet", "dbSet", "collection", "pValueLog", "oddsRatio",
+                  "support", "rnkPV", "rnkOR", "rnkSup", "maxRnk", "meanRnk", "b", "c", "d",
+                  "description", "cellType", "tissue", "antibody", "treatment", "dataSource", "filename")
+  unorderedCols = setdiff(colnames(scoreTable), orderedCols)
+
+  setcolorder(scoreTable, c(orderedCols, unorderedCols))
+
+  scoreTable[order(pValueLog, -meanRnk, decreasing=TRUE),]
+}
+
+
+
+
+
+
+
+
+
diff --git a/inst/extdata/hg19/ucsc_examplePEP/project_config.yaml b/inst/extdata/hg19/ucsc_examplePEP/project_config.yaml
@@ -0,0 +1,7 @@
+metadata:
+        sample_table: samples_annotation.csv
+
+derived_attributes: [output_file_path]
+data_sources:
+        source1: ../ucsc_example/regions/{file_name}
+
diff --git a/inst/extdata/hg19/ucsc_examplePEP/samples_annotation.csv b/inst/extdata/hg19/ucsc_examplePEP/samples_annotation.csv
@@ -0,0 +1,6 @@
+sample_name,file_name,genome,exp_protocol,cell_type,description,data_source,output_file_path 
+region1,cpgIslandExt.bed,hg19,ChiPseq,NA,CpGislands from UCSC annotation,UCSC db,source1
+region2,laminB1Lads.bed,hg19,ChiPseq,NA,laminB1 data from UCSC annotation,UCSC db,source1
+region3,numtSAssembled.bed,ChiPseq,NA,hg19,numt data from UCSC annotation,UCSC db,source1
+region4,vistaEnhancers.bed,ChiPseq,NA,hg19,enhancers data from UCSC annotation,UCSC db,source1
+region5,vistaEnhancers_colNames.bed,ChiPseq,NA,hg19,enhancers col_names data,UCSC db,source1