From a9999d7d0e49a6374419bd3e0cefc121017375db Mon Sep 17 00:00:00 2001 From: Mervin Fansler Date: Mon, 20 Sep 2021 02:22:05 -0400 Subject: [PATCH] closes issue #2 --- DESCRIPTION | 6 +- NAMESPACE | 7 ++ R/io.R | 30 ++++++++ R/merge.R | 150 ++++++++++++++++++++++++++++++++++++ man/dot-propagateMap.Rd | 20 +++++ man/exportMergeTable.Rd | 37 +++++++++ man/generateMergeTable.Rd | 49 ++++++++++++ tests/testthat/test-merge.R | 97 +++++++++++++++++++++++ 8 files changed, 394 insertions(+), 2 deletions(-) create mode 100644 R/merge.R create mode 100644 man/dot-propagateMap.Rd create mode 100644 man/exportMergeTable.Rd create mode 100644 man/generateMergeTable.Rd create mode 100644 tests/testthat/test-merge.R diff --git a/DESCRIPTION b/DESCRIPTION index c97cd6b..5419bb0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -6,7 +6,7 @@ Description: Various mRNA sequencing library preparation methods generate when assigning such reads to isoforms. This implements some convenience methods for readily generating such truncated annotations and their corresponding sequences. -Version: 0.2.2 +Version: 0.3.0 Authors@R: person(given = "Mervin", family = "Fansler", @@ -25,7 +25,9 @@ Imports: S4Vectors, rtracklayer, BiocParallel, - methods + stats, + methods, + utils Roxygen: list(markdown = TRUE) RoxygenNote: 7.1.1 biocViews: diff --git a/NAMESPACE b/NAMESPACE index ae2bda3..dfce842 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,8 +2,11 @@ export(exportFASTA) export(exportGTF) +export(exportMergeTable) +export(generateMergeTable) export(truncateTxome) export(txdbToGRangesList) +exportMethods(generateMergeTable) exportMethods(truncateTxome) importFrom(AnnotationDbi,select) importFrom(BiocGenerics,paste) @@ -21,9 +24,11 @@ importFrom(GenomicRanges,"start<-") importFrom(GenomicRanges,GRanges) importFrom(GenomicRanges,GRangesList) importFrom(GenomicRanges,end) +importFrom(GenomicRanges,findOverlaps) importFrom(GenomicRanges,intersect) importFrom(GenomicRanges,mcols) importFrom(GenomicRanges,reduce) +importFrom(GenomicRanges,resize) importFrom(GenomicRanges,seqnames) importFrom(GenomicRanges,start) importFrom(GenomicRanges,strand) @@ -35,3 +40,5 @@ importFrom(S4Vectors,nrun) importFrom(methods,setGeneric) importFrom(methods,setMethod) importFrom(rtracklayer,export) +importFrom(stats,setNames) +importFrom(utils,write.table) diff --git a/R/io.R b/R/io.R index 1b9a204..27b90bc 100644 --- a/R/io.R +++ b/R/io.R @@ -93,3 +93,33 @@ exportFASTA <- function (txdb, genome, file) { writeXStringSet(seqs, file, format="fasta") invisible(txdb) } + +#' Export Merge Table for Transcriptome +#' +#' @param txdb a \code{TxDb} object representing a transcriptome annotation +#' @param file output TSV file +#' @param minDistance the minimum separation to regard overlapping transcripts +#' as unique. +#' @return The \code{txdb} argument is invisibly returned. +#' +#' @examples +#' library(TxDb.Scerevisiae.UCSC.sacCer3.sgdGene) +#' +#' ## load annotation +#' txdb <- TxDb.Scerevisiae.UCSC.sacCer3.sgdGene +#' +#' ## restrict to 'chrI' transcripts (makes for briefer example runtime) +#' seqlevels(txdb) <- c("chrI") +#' +#' ## last 500 nts per tx +#' txdb_w500 <- truncateTxome(txdb) +#' +#' exportMergeTable(txdb_w500, "sacCer3.sgdGene.w500.merge.tsv") +#' +#' @importFrom utils write.table +#' @export +exportMergeTable <- function (txdb, file, minDistance=200L) { + df <- generateMergeTable(txdb, minDistance=minDistance) + write.table(df, file, sep="\t", row.names=FALSE, quote=FALSE) + invisible(txdb) +} diff --git a/R/merge.R b/R/merge.R new file mode 100644 index 0000000..331d824 --- /dev/null +++ b/R/merge.R @@ -0,0 +1,150 @@ +#' @rdname generateMergeTable +#' @param txdb an object representing a transcriptome +#' @param minDistance the minimum separation to regard overlapping transcripts +#' as unique +#' +#' @return a \code{data.frame} with three columns +#' - \code{tx_in} the input transcript +#' - \code{tx_out} the transcript merged into +#' - \code{gene_out} the gene merged into +#' +#' @importFrom methods setGeneric +#' @export +setGeneric("generateMergeTable", signature=c("txdb", "minDistance"), + function(txdb, minDistance=200) standardGeneric("generateMergeTable") +) + +#' Generate Merge Table +#' +#' @rdname generateMergeTable +#' @param txdb an object representing a transcriptome +#' @param minDistance the minimum separation to regard overlapping transcripts +#' as unique +#' +#' @return a \code{data.frame} with three columns +#' - \code{tx_in} the input transcript +#' - \code{tx_out} the transcript merged into +#' - \code{gene_out} the gene merged into +#' +#' @examples +#' library(TxDb.Scerevisiae.UCSC.sacCer3.sgdGene) +#' +#' ## load annotation +#' txdb <- TxDb.Scerevisiae.UCSC.sacCer3.sgdGene +#' +#' ## restrict to 'chrI' transcripts +#' seqlevels(txdb) <- c("chrI") +#' +#' ## last 500 nts per tx +#' txdb_w500 <- truncateTxome(txdb) +#' txdb_w500 +#' +#' ## last 100 nts per tx +#' txdb_w100 <- truncateTxome(txdb, maxTxLength=100) +#' txdb_w100 +#' +#' @importFrom GenomicRanges mcols resize findOverlaps +#' @importFrom GenomicFeatures transcripts +#' @importFrom methods setMethod +#' @importFrom stats setNames +#' @export +setMethod("generateMergeTable", "TxDb", function(txdb, minDistance=200L) { + grTxs <- transcripts(txdb, + columns=c("gene_id", "tx_id", "tx_name")) + + ## use only ends + grTxs <- resize(grTxs, width=minDistance, fix="end", ignore.strand=FALSE) + + ## generate raw overlaps + overlaps <- as.data.frame(findOverlaps(grTxs, ignore.strand=FALSE, + drop.self=TRUE)) + overlaps["tx_in"] <- unlist(grTxs$tx_name[overlaps$queryHits]) + overlaps["tx_out"] <- unlist(grTxs$tx_name[overlaps$subjectHits]) + overlaps["gene_in"] <- unlist(grTxs$gene_id[overlaps$queryHits]) + overlaps["gene_out"] <- unlist(grTxs$gene_id[overlaps$subjectHits]) + + ## filter unmatched genes + overlaps <- overlaps[overlaps$gene_in == overlaps$gene_out,] + + ## keep downstream + overlaps["strand"] <- as.character(strand(grTxs))[overlaps$queryHits] + overlaps["end_in"] <- ifelse(overlaps$strand == "+", + end(grTxs[overlaps$queryHits]), + -start(grTxs[overlaps$queryHits])) + overlaps["end_out"] <- ifelse(overlaps$strand == "+", + end(grTxs[overlaps$subjectHits]), + -start(grTxs[overlaps$subjectHits])) + overlaps <- overlaps[overlaps$end_in <= overlaps$end_out,] + + ## if equal, prioritize early transcript + ## e.g., a transcript ENSMUST000000001 will replace ENSMUST000001000 + idxEqual <- which(overlaps$end_in == overlaps$end_out) + if (length(idxEqual) > 0) { + isLaterTx <- overlaps[idxEqual, "tx_in"] < overlaps[idxEqual, "tx_out"] + overlaps <- overlaps[-idxEqual[isLaterTx], ] + } + + ## pick unique out + groupedOverlaps <- split(overlaps, overlaps$queryHits) + singleOverlaps <- lapply(groupedOverlaps, + function (df) { + df[with(df, order(-end_out, tx_out)),][1,] + }) + dfMerge <- do.call(rbind, singleOverlaps)[, c("tx_in", "tx_out")] + dfMerge <- .propagateMap(dfMerge) + + ## include self-maps + unmergedTxs <- grTxs$tx_name[!(grTxs$tx_name %in% dfMerge$tx_in)] + dfMerge <- rbind(dfMerge, data.frame(tx_in=unmergedTxs, tx_out=unmergedTxs)) + + ## append gene + txToGeneMap <- setNames(object=unlist(grTxs$gene_id), nm=grTxs$tx_name) + dfMerge['gene_out'] <- txToGeneMap[as.character(dfMerge$tx_out)] + + ## reorder and drop rownames + dfMerge <- dfMerge[with(dfMerge, order(tx_in)),] + rownames(dfMerge) <- NULL + + dfMerge +} +) + + +#' Propagate Transcript Merge Map +#' +#' @param df a \code{data.frame} with columns \code{tx_in} and \code{tx_out} +#' @param MAXITERS a numeric controlling the maximum number of iterations +#' +#' @return a converged \code{data.frame}, such that, \code{tx_out} is not +#' present in any \code{tx_in} +#' +#' @importFrom stats setNames +.propagateMap <- function (df, MAXITERS=1000) { + ## NOTE: This method assumes no loops. To provide execution safety, we + ## enforce a maximum number of iterations. + iteration <- 0 + + ## while there are any outputs in the inputs, + while(any(df$tx_out %in% df$tx_in) && iteration < MAXITERS) { + iteration <- iteration + 1 + + ## generate merge map for simple translation + mergeMap <- setNames(object=df$tx_out, nm=df$tx_in) + + ## these will be propagated + idxMappable <- df$tx_out %in% df$tx_in + + ## outputs to translate + oldOut <- as.character(df$tx_out[idxMappable]) + + ## translations + newOut <- mergeMap[oldOut] + + ## replace old with new + df[idxMappable, "tx_out"] <- newOut + } + ## did we fail to converge? + stopifnot(iteration <= MAXITERS && !any(df$tx_out %in% df$tx_in)) + + df +} diff --git a/man/dot-propagateMap.Rd b/man/dot-propagateMap.Rd new file mode 100644 index 0000000..a9ec203 --- /dev/null +++ b/man/dot-propagateMap.Rd @@ -0,0 +1,20 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/merge.R +\name{.propagateMap} +\alias{.propagateMap} +\title{Propagate Transcript Merge Map} +\usage{ +.propagateMap(df, MAXITERS = 1000) +} +\arguments{ +\item{df}{a \code{data.frame} with columns \code{tx_in} and \code{tx_out}} + +\item{MAXITERS}{a numeric controlling the maximum number of iterations} +} +\value{ +a converged \code{data.frame}, such that, \code{tx_out} is not +present in any \code{tx_in} +} +\description{ +Propagate Transcript Merge Map +} diff --git a/man/exportMergeTable.Rd b/man/exportMergeTable.Rd new file mode 100644 index 0000000..ef57042 --- /dev/null +++ b/man/exportMergeTable.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/io.R +\name{exportMergeTable} +\alias{exportMergeTable} +\title{Export Merge Table for Transcriptome} +\usage{ +exportMergeTable(txdb, file, minDistance = 200L) +} +\arguments{ +\item{txdb}{a \code{TxDb} object representing a transcriptome annotation} + +\item{file}{output TSV file} + +\item{minDistance}{the minimum separation to regard overlapping transcripts +as unique.} +} +\value{ +The \code{txdb} argument is invisibly returned. +} +\description{ +Export Merge Table for Transcriptome +} +\examples{ +library(TxDb.Scerevisiae.UCSC.sacCer3.sgdGene) + +## load annotation +txdb <- TxDb.Scerevisiae.UCSC.sacCer3.sgdGene + +## restrict to 'chrI' transcripts (makes for briefer example runtime) +seqlevels(txdb) <- c("chrI") + +## last 500 nts per tx +txdb_w500 <- truncateTxome(txdb) + +exportMergeTable(txdb_w500, "sacCer3.sgdGene.w500.merge.tsv") + +} diff --git a/man/generateMergeTable.Rd b/man/generateMergeTable.Rd new file mode 100644 index 0000000..e616fdd --- /dev/null +++ b/man/generateMergeTable.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/merge.R +\name{generateMergeTable} +\alias{generateMergeTable} +\alias{generateMergeTable,TxDb-method} +\title{Generate Merge Table} +\usage{ +generateMergeTable(txdb, minDistance = 200) + +\S4method{generateMergeTable}{TxDb}(txdb, minDistance = 200L) +} +\arguments{ +\item{txdb}{an object representing a transcriptome} + +\item{minDistance}{the minimum separation to regard overlapping transcripts +as unique} +} +\value{ +a \code{data.frame} with three columns +- \code{tx_in} the input transcript +- \code{tx_out} the transcript merged into +- \code{gene_out} the gene merged into + +a \code{data.frame} with three columns +- \code{tx_in} the input transcript +- \code{tx_out} the transcript merged into +- \code{gene_out} the gene merged into +} +\description{ +Generate Merge Table +} +\examples{ +library(TxDb.Scerevisiae.UCSC.sacCer3.sgdGene) + +## load annotation +txdb <- TxDb.Scerevisiae.UCSC.sacCer3.sgdGene + +## restrict to 'chrI' transcripts +seqlevels(txdb) <- c("chrI") + +## last 500 nts per tx +txdb_w500 <- truncateTxome(txdb) +txdb_w500 + +## last 100 nts per tx +txdb_w100 <- truncateTxome(txdb, maxTxLength=100) +txdb_w100 + +} diff --git a/tests/testthat/test-merge.R b/tests/testthat/test-merge.R new file mode 100644 index 0000000..6976611 --- /dev/null +++ b/tests/testthat/test-merge.R @@ -0,0 +1,97 @@ +library(GenomicRanges) +library(GenomicFeatures) + +############ +## Mock Data +############ + +## Single Exon Gene +gr_contig <- GRanges( + seqnames=rep("chr1", 5), + strand="+", + ranges=IRanges(start=5000, + width=c(1000, 1000, 1000, 900, 900)), + type=c("gene", "transcript", "exon", "transcript", "exon"), + ID=c("gene_1", "tx_1", "exon_1", "tx_2", "exon_2"), + Parent=c(NA, "gene_1", "tx_1", "gene_1", "tx_2"), + gene_id="gene_1", + tx_id=c(NA, "tx_1", "tx_1", "tx_2", "tx_2"), + exon_id=c(NA, NA, "exon_1", NA, "exon_2")) + +txdb_contig <- makeTxDbFromGRanges(gr_contig) + +## Negative Strand +gr_contig_neg <- GRanges( + seqnames=rep("chr1", 5), + strand="-", + ranges=IRanges(end=5000, + width=c(1000, 1000, 1000, 900, 900)), + type=c("gene", "transcript", "exon", "transcript", "exon"), + ID=c("gene_1", "tx_1", "exon_1", "tx_2", "exon_2"), + Parent=c(NA, "gene_1", "tx_1", "gene_1", "tx_2"), + gene_id="gene_1", + tx_id=c(NA, "tx_1", "tx_1", "tx_2", "tx_2"), + exon_id=c(NA, NA, "exon_1", NA, "exon_2")) + +txdb_contig_neg <- makeTxDbFromGRanges(gr_contig_neg) + +## Overlapping Genes +gr_multigene <- GRanges( + seqnames=rep("chr1", 6), + strand="+", + ranges=IRanges(end=5000, + width=c(1000, 1000, 1000, 1200, 1200, 1200)), + type=c("gene", "transcript", "exon", + "gene", "transcript", "exon"), + ID=c("gene_1", "tx_1", "exon_1", + "gene_2", "tx_2", "exon_2"), + Parent=c(NA, "gene_1", "tx_1", + NA, "gene_2", "tx_2"), + gene_id=c("gene_1", "gene_1", "gene_1", + "gene_2", "gene_2", "gene_2"), + tx_id=c(NA, "tx_1", "tx_1", + NA, "tx_2", "tx_2"), + exon_id=c(NA, NA, "exon_1", + NA, NA, "exon_2")) + +txdb_multigene <- makeTxDbFromGRanges(gr_multigene) + +######## +## Tests +######## + +test_that("nearby transcripts are merged, positive strand", { + df <- generateMergeTable(txdb_contig, minDistance=200) + n_txdb_txs <- length(transcripts(txdb_contig)) + n_txs_in <- length(unique(df$tx_in)) + n_txs_out <- length(unique(df$tx_out)) + expect_equal(n_txs_in, n_txdb_txs) + expect_equal(n_txs_out, 1) +}) + +test_that("nearby transcripts are merged, negative strand", { + df <- generateMergeTable(txdb_contig_neg, minDistance=200) + n_txdb_txs <- length(transcripts(txdb_contig)) + n_txs_in <- length(unique(df$tx_in)) + n_txs_out <- length(unique(df$tx_out)) + expect_equal(n_txs_in, n_txdb_txs) + expect_equal(n_txs_out, 1) +}) + +test_that("far transcripts are unmerged, positive strand", { + df <- generateMergeTable(txdb_contig_neg, minDistance=50) + n_txdb_txs <- length(transcripts(txdb_contig)) + n_txs_in <- length(unique(df$tx_in)) + n_txs_out <- length(unique(df$tx_out)) + expect_equal(n_txs_in, n_txdb_txs) + expect_equal(n_txs_out, 2) +}) + +test_that("far transcripts are unmerged, negative strand", { + df <- generateMergeTable(txdb_contig_neg, minDistance=50) + n_txdb_txs <- length(transcripts(txdb_contig)) + n_txs_in <- length(unique(df$tx_in)) + n_txs_out <- length(unique(df$tx_out)) + expect_equal(n_txs_in, n_txdb_txs) + expect_equal(n_txs_out, 2) +})