Skip to content

Commit

Permalink
closes issue #2
Browse files Browse the repository at this point in the history
  • Loading branch information
mfansler committed Sep 20, 2021
1 parent 4f515b2 commit a9999d7
Show file tree
Hide file tree
Showing 8 changed files with 394 additions and 2 deletions.
6 changes: 4 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Description: Various mRNA sequencing library preparation methods generate
when assigning such reads to isoforms. This implements some convenience
methods for readily generating such truncated annotations and their
corresponding sequences.
Version: 0.2.2
Version: 0.3.0
Authors@R:
person(given = "Mervin",
family = "Fansler",
Expand All @@ -25,7 +25,9 @@ Imports:
S4Vectors,
rtracklayer,
BiocParallel,
methods
stats,
methods,
utils
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.1
biocViews:
Expand Down
7 changes: 7 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@

export(exportFASTA)
export(exportGTF)
export(exportMergeTable)
export(generateMergeTable)
export(truncateTxome)
export(txdbToGRangesList)
exportMethods(generateMergeTable)
exportMethods(truncateTxome)
importFrom(AnnotationDbi,select)
importFrom(BiocGenerics,paste)
Expand All @@ -21,9 +24,11 @@ importFrom(GenomicRanges,"start<-")
importFrom(GenomicRanges,GRanges)
importFrom(GenomicRanges,GRangesList)
importFrom(GenomicRanges,end)
importFrom(GenomicRanges,findOverlaps)
importFrom(GenomicRanges,intersect)
importFrom(GenomicRanges,mcols)
importFrom(GenomicRanges,reduce)
importFrom(GenomicRanges,resize)
importFrom(GenomicRanges,seqnames)
importFrom(GenomicRanges,start)
importFrom(GenomicRanges,strand)
Expand All @@ -35,3 +40,5 @@ importFrom(S4Vectors,nrun)
importFrom(methods,setGeneric)
importFrom(methods,setMethod)
importFrom(rtracklayer,export)
importFrom(stats,setNames)
importFrom(utils,write.table)
30 changes: 30 additions & 0 deletions R/io.R
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,33 @@ exportFASTA <- function (txdb, genome, file) {
writeXStringSet(seqs, file, format="fasta")
invisible(txdb)
}

#' Export Merge Table for Transcriptome
#'
#' @param txdb a \code{TxDb} object representing a transcriptome annotation
#' @param file output TSV file
#' @param minDistance the minimum separation to regard overlapping transcripts
#' as unique.
#' @return The \code{txdb} argument is invisibly returned.
#'
#' @examples
#' library(TxDb.Scerevisiae.UCSC.sacCer3.sgdGene)
#'
#' ## load annotation
#' txdb <- TxDb.Scerevisiae.UCSC.sacCer3.sgdGene
#'
#' ## restrict to 'chrI' transcripts (makes for briefer example runtime)
#' seqlevels(txdb) <- c("chrI")
#'
#' ## last 500 nts per tx
#' txdb_w500 <- truncateTxome(txdb)
#'
#' exportMergeTable(txdb_w500, "sacCer3.sgdGene.w500.merge.tsv")
#'
#' @importFrom utils write.table
#' @export
exportMergeTable <- function (txdb, file, minDistance=200L) {
df <- generateMergeTable(txdb, minDistance=minDistance)
write.table(df, file, sep="\t", row.names=FALSE, quote=FALSE)
invisible(txdb)
}
150 changes: 150 additions & 0 deletions R/merge.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
#' @rdname generateMergeTable
#' @param txdb an object representing a transcriptome
#' @param minDistance the minimum separation to regard overlapping transcripts
#' as unique
#'
#' @return a \code{data.frame} with three columns
#' - \code{tx_in} the input transcript
#' - \code{tx_out} the transcript merged into
#' - \code{gene_out} the gene merged into
#'
#' @importFrom methods setGeneric
#' @export
setGeneric("generateMergeTable", signature=c("txdb", "minDistance"),
function(txdb, minDistance=200) standardGeneric("generateMergeTable")
)

#' Generate Merge Table
#'
#' @rdname generateMergeTable
#' @param txdb an object representing a transcriptome
#' @param minDistance the minimum separation to regard overlapping transcripts
#' as unique
#'
#' @return a \code{data.frame} with three columns
#' - \code{tx_in} the input transcript
#' - \code{tx_out} the transcript merged into
#' - \code{gene_out} the gene merged into
#'
#' @examples
#' library(TxDb.Scerevisiae.UCSC.sacCer3.sgdGene)
#'
#' ## load annotation
#' txdb <- TxDb.Scerevisiae.UCSC.sacCer3.sgdGene
#'
#' ## restrict to 'chrI' transcripts
#' seqlevels(txdb) <- c("chrI")
#'
#' ## last 500 nts per tx
#' txdb_w500 <- truncateTxome(txdb)
#' txdb_w500
#'
#' ## last 100 nts per tx
#' txdb_w100 <- truncateTxome(txdb, maxTxLength=100)
#' txdb_w100
#'
#' @importFrom GenomicRanges mcols resize findOverlaps
#' @importFrom GenomicFeatures transcripts
#' @importFrom methods setMethod
#' @importFrom stats setNames
#' @export
setMethod("generateMergeTable", "TxDb", function(txdb, minDistance=200L) {
grTxs <- transcripts(txdb,
columns=c("gene_id", "tx_id", "tx_name"))

## use only ends
grTxs <- resize(grTxs, width=minDistance, fix="end", ignore.strand=FALSE)

## generate raw overlaps
overlaps <- as.data.frame(findOverlaps(grTxs, ignore.strand=FALSE,
drop.self=TRUE))
overlaps["tx_in"] <- unlist(grTxs$tx_name[overlaps$queryHits])
overlaps["tx_out"] <- unlist(grTxs$tx_name[overlaps$subjectHits])
overlaps["gene_in"] <- unlist(grTxs$gene_id[overlaps$queryHits])
overlaps["gene_out"] <- unlist(grTxs$gene_id[overlaps$subjectHits])

## filter unmatched genes
overlaps <- overlaps[overlaps$gene_in == overlaps$gene_out,]

## keep downstream
overlaps["strand"] <- as.character(strand(grTxs))[overlaps$queryHits]
overlaps["end_in"] <- ifelse(overlaps$strand == "+",
end(grTxs[overlaps$queryHits]),
-start(grTxs[overlaps$queryHits]))
overlaps["end_out"] <- ifelse(overlaps$strand == "+",
end(grTxs[overlaps$subjectHits]),
-start(grTxs[overlaps$subjectHits]))
overlaps <- overlaps[overlaps$end_in <= overlaps$end_out,]

## if equal, prioritize early transcript
## e.g., a transcript ENSMUST000000001 will replace ENSMUST000001000
idxEqual <- which(overlaps$end_in == overlaps$end_out)
if (length(idxEqual) > 0) {
isLaterTx <- overlaps[idxEqual, "tx_in"] < overlaps[idxEqual, "tx_out"]
overlaps <- overlaps[-idxEqual[isLaterTx], ]
}

## pick unique out
groupedOverlaps <- split(overlaps, overlaps$queryHits)
singleOverlaps <- lapply(groupedOverlaps,
function (df) {
df[with(df, order(-end_out, tx_out)),][1,]
})
dfMerge <- do.call(rbind, singleOverlaps)[, c("tx_in", "tx_out")]
dfMerge <- .propagateMap(dfMerge)

## include self-maps
unmergedTxs <- grTxs$tx_name[!(grTxs$tx_name %in% dfMerge$tx_in)]
dfMerge <- rbind(dfMerge, data.frame(tx_in=unmergedTxs, tx_out=unmergedTxs))

## append gene
txToGeneMap <- setNames(object=unlist(grTxs$gene_id), nm=grTxs$tx_name)
dfMerge['gene_out'] <- txToGeneMap[as.character(dfMerge$tx_out)]

## reorder and drop rownames
dfMerge <- dfMerge[with(dfMerge, order(tx_in)),]
rownames(dfMerge) <- NULL

dfMerge
}
)


#' Propagate Transcript Merge Map
#'
#' @param df a \code{data.frame} with columns \code{tx_in} and \code{tx_out}
#' @param MAXITERS a numeric controlling the maximum number of iterations
#'
#' @return a converged \code{data.frame}, such that, \code{tx_out} is not
#' present in any \code{tx_in}
#'
#' @importFrom stats setNames
.propagateMap <- function (df, MAXITERS=1000) {
## NOTE: This method assumes no loops. To provide execution safety, we
## enforce a maximum number of iterations.
iteration <- 0

## while there are any outputs in the inputs,
while(any(df$tx_out %in% df$tx_in) && iteration < MAXITERS) {
iteration <- iteration + 1

## generate merge map for simple translation
mergeMap <- setNames(object=df$tx_out, nm=df$tx_in)

## these will be propagated
idxMappable <- df$tx_out %in% df$tx_in

## outputs to translate
oldOut <- as.character(df$tx_out[idxMappable])

## translations
newOut <- mergeMap[oldOut]

## replace old with new
df[idxMappable, "tx_out"] <- newOut
}
## did we fail to converge?
stopifnot(iteration <= MAXITERS && !any(df$tx_out %in% df$tx_in))

df
}
20 changes: 20 additions & 0 deletions man/dot-propagateMap.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

37 changes: 37 additions & 0 deletions man/exportMergeTable.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

49 changes: 49 additions & 0 deletions man/generateMergeTable.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit a9999d7

Please sign in to comment.