From 2645d8e63c15a880edb1d17ae6f9b35f40576f3e Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Sun, 11 Dec 2022 01:32:59 -0500 Subject: [PATCH 01/16] New tool written. Normalize Coverage. needs basic and unit testing... --- .../personal/yfarjoun/NormalizeCoverage.scala | 158 ++++++++++++++++++ 1 file changed, 158 insertions(+) create mode 100644 src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala new file mode 100644 index 000000000..e002f6d4a --- /dev/null +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala @@ -0,0 +1,158 @@ +/* + * The MIT License + * + * Copyright (c) 2022 Fulcrum Genomics + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ +package com.fulcrumgenomics.personal.yfarjoun + +import com.fulcrumgenomics.FgBioDef._ +import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord, SamSource, SamWriter} +import com.fulcrumgenomics.bam.{Bams, Template} +import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool} +import com.fulcrumgenomics.commons.util.LazyLogging +import com.fulcrumgenomics.personal.yfarjoun.NormalizeCoverageOptions._ +import com.fulcrumgenomics.sopt._ +import com.fulcrumgenomics.util.NumericTypes.PhredScore +import com.fulcrumgenomics.util.{IntervalListSource, Io} +import htsjdk.samtools.util.{Interval, OverlapDetector} + +import scala.collection.mutable + +object NormalizeCoverageOptions { + /** Various default values for the Coverage Normalizer. */ + val DefaultMinMapQ: PhredScore = 30.toByte +} + +@clp(description = + """ + Normalizes coverage of an input bam to be at a given coverage for every base in the reference/input interval list. + """, + group = ClpGroups.SamOrBam) +class NormalizeCoverage( + @arg(flag = 'i', doc = "The input SAM or BAM file.") val input: PathToBam, + @arg(flag = 'o', doc = "Output BAM file to write.") val output: PathToBam, + @arg(flag = 'l', doc = "The interval list over which to normalize coverage. " + + "If provided, templates whose primary reads do not overlap with this region will be dropped entirely.") val targetIntervals: Option[PathToIntervals] = None, + @arg(flag = 'm', doc = "MAPQ lower bound for considering read-template.") val minMQ: PhredScore = DefaultMinMapQ, + @arg(flag = 'c', doc = "Target coverage.") val coverageTarget: Int, + @arg(flag = 's', doc = "Order in which to emit the records.") val sortOrder: SamOrder = SamOrder.Coordinate, + @arg(flag = 'm', doc = "Max records in RAM.") val maxRecordsInRam: Int = SamWriter.DefaultMaxRecordsInRam + + ) extends FgBioTool with LazyLogging { + + + def templateToCoverageTuple(template: Template): (Int, Int) = { + template + //take primary reads + .primaryReads + //make sure they are good + .filter(readFilterForCoverage) + + //replace each read with its start/end tuple + .map(r => (r.start, r.end)) reduce ((_, _) => { + case ((r1_s: Int, r1_e: Int), (r2_s: Int, r2_e: Int)) => + (Seq(r1_s, r2_s).min, Seq(r1_e, r2_e).max) + }) + } + + def primariesOverlapTargets(template: Template, overlapDetector: Option[OverlapDetector[Interval]]): Boolean = { + if (overlapDetector.isEmpty) { + true + } else { + template.primaryReads + .map(r => overlapDetector.get.overlapsAny(r.asSam)) + //if any read overlaps target + .reduce(_ || _) + } + } + + def minCoverage(template: Template, cov: Map[String, mutable.IndexedSeq[Short]]): Short = { + val coverage: IndexedSeq[Short] = cov.getOrElse(template.r1.get.refName, Nil).toIndexedSeq + templateToCoverageTuple(template) match { + case (start: Int, end: Int) => { + coverage.slice(start - 1, end).minOption.getOrElse(0) + } + } + } + + def incrementCoverage(template: Template, cov: Map[String, mutable.IndexedSeq[Short]]): Unit = { + val coverage: mutable.IndexedSeq[Short] = cov.getOrElse(template.r1.get.refName, Nil) + val test: Short = coverage(0) + coverage.update(1, test) + + templateToCoverageTuple(template) match { + case (start: Int, end: Int) => + Range(start - 1, end) + .map(i => i.toShort) + .foreach(j => coverage.update(j) = (coverage(j) + 1).toShort) + } + } + + def readFilterForCoverage(record: SamRecord): Boolean = { + record.mapped && + record.properlyPaired && + !record.duplicate && + record.pf + } + + def minMapQ(template: Template): PhredScore = { + template.primaryReads.filter(readFilterForCoverage) + .map(_.mapq) + .minOption + .getOrElse(0) + .toByte + } + + override def execute(): Unit = { + Io.assertReadable(input) + Io.assertCanWriteFile(output) + targetIntervals.foreach(Io.assertReadable) + assert(coverageTarget > 0, s"Target must be >0 found $coverageTarget.") + + val in = SamSource(input) + val out = SamWriter(output, in.header.clone(), sort = Some(sortOrder), maxRecordsInRam = maxRecordsInRam) + + val overlapDetector: Option[OverlapDetector[Interval]] = targetIntervals.map(f => IntervalListSource(f)) + .map(i => OverlapDetector.create[Interval](i.toIntervalList.getIntervals)) + + val templateIterator = Bams.templateSortedIterator(in, maxRecordsInRam) + val coverages = in.dict.map(seq => seq.name -> mutable.IndexedSeq[Short](seq.length.toShort)).toMap + + templateIterator + //see if primary reads overlap region of interest + .filter(t => primariesOverlapTargets(t, overlapDetector)) + //check that all the reads in the template pass minMQ + .filter(t => minMapQ(t) >= minMQ) + // only include template whose primary reads are all on the same reference + .filter(t => t.primaryReads.map(r => r.refName).distinct.length == 1) + //only consider reads that cover regions that need coverage, i.e. that + //at least one base that they cover is less than the target + .filter(t => minCoverage(t, coverages) < coverageTarget) + // + .foreach(t => { + //increment coverages and emit template + incrementCoverage(t, coverages) + //write + t.allReads.foreach(out.write) + }) + } +} From aaf18b7e57411ed59df288e4b9208c17749bf8d2 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Sun, 11 Dec 2022 01:33:45 -0500 Subject: [PATCH 02/16] braces --- .../fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala index e002f6d4a..fb4c3d0d1 100644 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala @@ -88,9 +88,8 @@ class NormalizeCoverage( def minCoverage(template: Template, cov: Map[String, mutable.IndexedSeq[Short]]): Short = { val coverage: IndexedSeq[Short] = cov.getOrElse(template.r1.get.refName, Nil).toIndexedSeq templateToCoverageTuple(template) match { - case (start: Int, end: Int) => { + case (start: Int, end: Int) => coverage.slice(start - 1, end).minOption.getOrElse(0) - } } } From b4dc937aae13cea01cfa4040274083f1ebd220ad Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Sun, 11 Dec 2022 16:15:56 -0500 Subject: [PATCH 03/16] -fixed a few things that prevented compilation. -cleaned up signature of functions -made the target specify the exact region of interest --- .../personal/yfarjoun/NormalizeCoverage.scala | 108 +++++++++++------- 1 file changed, 66 insertions(+), 42 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala index fb4c3d0d1..1cca76bac 100644 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala @@ -29,12 +29,14 @@ import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord, SamSource, SamWriter} import com.fulcrumgenomics.bam.{Bams, Template} import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool} import com.fulcrumgenomics.commons.util.LazyLogging +import com.fulcrumgenomics.fasta.SequenceDictionary import com.fulcrumgenomics.personal.yfarjoun.NormalizeCoverageOptions._ import com.fulcrumgenomics.sopt._ import com.fulcrumgenomics.util.NumericTypes.PhredScore import com.fulcrumgenomics.util.{IntervalListSource, Io} -import htsjdk.samtools.util.{Interval, OverlapDetector} +import htsjdk.samtools.util.{Interval, Locatable, OverlapDetector} +import scala.jdk.CollectionConverters._ import scala.collection.mutable object NormalizeCoverageOptions { @@ -44,7 +46,7 @@ object NormalizeCoverageOptions { @clp(description = """ - Normalizes coverage of an input bam to be at a given coverage for every base in the reference/input interval list. +Normalizes coverage of an input bam to be at a given coverage for every base in the reference/input interval list. """, group = ClpGroups.SamOrBam) class NormalizeCoverage( @@ -52,58 +54,78 @@ class NormalizeCoverage( @arg(flag = 'o', doc = "Output BAM file to write.") val output: PathToBam, @arg(flag = 'l', doc = "The interval list over which to normalize coverage. " + "If provided, templates whose primary reads do not overlap with this region will be dropped entirely.") val targetIntervals: Option[PathToIntervals] = None, - @arg(flag = 'm', doc = "MAPQ lower bound for considering read-template.") val minMQ: PhredScore = DefaultMinMapQ, + @arg(name = "mq" , doc = "MAPQ lower bound for considering read-template.") val minMQ: PhredScore = DefaultMinMapQ, @arg(flag = 'c', doc = "Target coverage.") val coverageTarget: Int, @arg(flag = 's', doc = "Order in which to emit the records.") val sortOrder: SamOrder = SamOrder.Coordinate, @arg(flag = 'm', doc = "Max records in RAM.") val maxRecordsInRam: Int = SamWriter.DefaultMaxRecordsInRam ) extends FgBioTool with LazyLogging { + var overlapDetector: OverlapDetector[Interval] = _ + val coverage: mutable.Map[String, mutable.IndexedSeq[Short]] = mutable.Map() - def templateToCoverageTuple(template: Template): (Int, Int) = { - template + def initialize(bam: PathToBam, intervalsMaybe: Option[PathToIntervals]): Unit = { + val in = SamSource(bam) + + //QUESTIOM: do I need to close and 'f'? + overlapDetector = intervalsMaybe.map(f => IntervalListSource(f)) + .map(i => OverlapDetector.create[Interval](i.toIntervalList.getIntervals)) + .getOrElse(makeOverlapDetectorFromDictionary(in.dict)) + in.dict.foreach(seq => coverage.update(seq.name, mutable.IndexedSeq[Short](seq.length.toShort))) + in.close() + } + + def union(i1: Locatable, i2: Locatable): Locatable = { + new Interval(i1.getContig, Math.min(i1.getStart, i2.getStart), Math.max(i1.getEnd, i2.getEnd)) + } + + def readToSubsettedLocus(r: Locatable): Option[Locatable] = { + val overlappingIntervals: Set[Interval] = overlapDetector.getOverlaps(r).asScala.toSet + val unionOverlapMaybe = overlappingIntervals.reduceOption(union) + unionOverlapMaybe.map(unionOverlap => new Interval( + r.getContig, + Math.max(r.getStart, unionOverlap.getStart), + Math.min(r.getEnd, unionOverlap.getEnd))) + } + + def templateToCoverageTuple(template: Template): Option[Locatable] = { + val locus: Locatable = template //take primary reads .primaryReads //make sure they are good .filter(readFilterForCoverage) //replace each read with its start/end tuple - .map(r => (r.start, r.end)) reduce ((_, _) => { - case ((r1_s: Int, r1_e: Int), (r2_s: Int, r2_e: Int)) => - (Seq(r1_s, r2_s).min, Seq(r1_e, r2_e).max) - }) + .map[Locatable](r => r.asSam) + + //find the union of all the reads + .reduce(union) + + //Subset the read to the overlapping intervals + readToSubsettedLocus(locus) } - def primariesOverlapTargets(template: Template, overlapDetector: Option[OverlapDetector[Interval]]): Boolean = { - if (overlapDetector.isEmpty) { - true - } else { - template.primaryReads - .map(r => overlapDetector.get.overlapsAny(r.asSam)) - //if any read overlaps target - .reduce(_ || _) - } + def primariesOverlapTargets(template: Template): Boolean = { + template.primaryReads.exists(r => overlapDetector.overlapsAny(r.asSam)) } - def minCoverage(template: Template, cov: Map[String, mutable.IndexedSeq[Short]]): Short = { - val coverage: IndexedSeq[Short] = cov.getOrElse(template.r1.get.refName, Nil).toIndexedSeq - templateToCoverageTuple(template) match { - case (start: Int, end: Int) => - coverage.slice(start - 1, end).minOption.getOrElse(0) - } + //returns Short.MaxValue if template doesn't overlap with requested intervals + def minCoverage(template: Template): Short = { + val cov: IndexedSeq[Short] = coverage(template.r1.get.refName).toIndexedSeq + val covLocusMaybe: Option[Locatable] = templateToCoverageTuple(template) + + val retVal:Short=covLocusMaybe.map[Short](covLocus => cov.slice(covLocus.getStart - 1, covLocus.getEnd).minOption[Short].getOrElse(0)).getOrElse(Short.MaxValue) + retVal } - def incrementCoverage(template: Template, cov: Map[String, mutable.IndexedSeq[Short]]): Unit = { - val coverage: mutable.IndexedSeq[Short] = cov.getOrElse(template.r1.get.refName, Nil) - val test: Short = coverage(0) - coverage.update(1, test) - - templateToCoverageTuple(template) match { - case (start: Int, end: Int) => - Range(start - 1, end) - .map(i => i.toShort) - .foreach(j => coverage.update(j) = (coverage(j) + 1).toShort) - } + def incrementCoverage(template: Template): Unit = { + val cov: mutable.IndexedSeq[Short] = coverage(template.r1.get.refName) + val covLocusMaybe: Option[Locatable] = templateToCoverageTuple(template) + + covLocusMaybe.foreach(covLocus => + Range(covLocus.getStart - 1, covLocus.getEnd) + .map(i => i.toShort) + .foreach(j => cov.update(j, (cov(j) + 1).toShort))) } def readFilterForCoverage(record: SamRecord): Boolean = { @@ -121,35 +143,37 @@ class NormalizeCoverage( .toByte } + def makeOverlapDetectorFromDictionary(dict: SequenceDictionary): OverlapDetector[Interval] = { + OverlapDetector.create[Interval](dict.iterator.map(seq => new Interval(seq.name, 1, seq.length)).toList.asJava) + } + override def execute(): Unit = { Io.assertReadable(input) Io.assertCanWriteFile(output) targetIntervals.foreach(Io.assertReadable) assert(coverageTarget > 0, s"Target must be >0 found $coverageTarget.") + initialize(input, targetIntervals) + val in = SamSource(input) val out = SamWriter(output, in.header.clone(), sort = Some(sortOrder), maxRecordsInRam = maxRecordsInRam) - val overlapDetector: Option[OverlapDetector[Interval]] = targetIntervals.map(f => IntervalListSource(f)) - .map(i => OverlapDetector.create[Interval](i.toIntervalList.getIntervals)) - val templateIterator = Bams.templateSortedIterator(in, maxRecordsInRam) - val coverages = in.dict.map(seq => seq.name -> mutable.IndexedSeq[Short](seq.length.toShort)).toMap templateIterator //see if primary reads overlap region of interest - .filter(t => primariesOverlapTargets(t, overlapDetector)) + .filter(t => primariesOverlapTargets(t)) //check that all the reads in the template pass minMQ .filter(t => minMapQ(t) >= minMQ) // only include template whose primary reads are all on the same reference .filter(t => t.primaryReads.map(r => r.refName).distinct.length == 1) //only consider reads that cover regions that need coverage, i.e. that //at least one base that they cover is less than the target - .filter(t => minCoverage(t, coverages) < coverageTarget) + .filter(t => minCoverage(t) < coverageTarget) // .foreach(t => { //increment coverages and emit template - incrementCoverage(t, coverages) + incrementCoverage(t) //write t.allReads.foreach(out.write) }) From e05a510384a4b40d8d03fec9f534ae0556812a7e Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Sun, 11 Dec 2022 23:04:28 -0500 Subject: [PATCH 04/16] - file streams are closed when done (output most importantly...) - passes snifftest. - needs tests. --- .../personal/yfarjoun/NormalizeCoverage.scala | 85 +++++++++++-------- 1 file changed, 48 insertions(+), 37 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala index 1cca76bac..5a737ddd2 100644 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala @@ -30,11 +30,12 @@ import com.fulcrumgenomics.bam.{Bams, Template} import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool} import com.fulcrumgenomics.commons.util.LazyLogging import com.fulcrumgenomics.fasta.SequenceDictionary -import com.fulcrumgenomics.personal.yfarjoun.NormalizeCoverageOptions._ +import com.fulcrumgenomics.personal.yfarjoun.NormalizeCoverageOptions.DefaultMinMapQ import com.fulcrumgenomics.sopt._ import com.fulcrumgenomics.util.NumericTypes.PhredScore import com.fulcrumgenomics.util.{IntervalListSource, Io} import htsjdk.samtools.util.{Interval, Locatable, OverlapDetector} +import scala.util.Using import scala.jdk.CollectionConverters._ import scala.collection.mutable @@ -44,6 +45,7 @@ object NormalizeCoverageOptions { val DefaultMinMapQ: PhredScore = 30.toByte } + @clp(description = """ Normalizes coverage of an input bam to be at a given coverage for every base in the reference/input interval list. @@ -54,8 +56,8 @@ class NormalizeCoverage( @arg(flag = 'o', doc = "Output BAM file to write.") val output: PathToBam, @arg(flag = 'l', doc = "The interval list over which to normalize coverage. " + "If provided, templates whose primary reads do not overlap with this region will be dropped entirely.") val targetIntervals: Option[PathToIntervals] = None, - @arg(name = "mq" , doc = "MAPQ lower bound for considering read-template.") val minMQ: PhredScore = DefaultMinMapQ, - @arg(flag = 'c', doc = "Target coverage.") val coverageTarget: Int, + @arg(name = "mq", doc = "MAPQ lower bound for considering read-template.") val minMQ: PhredScore = DefaultMinMapQ, + @arg(flag = 'c', doc = "Target coverage.") val coverageTarget: Short, @arg(flag = 's', doc = "Order in which to emit the records.") val sortOrder: SamOrder = SamOrder.Coordinate, @arg(flag = 'm', doc = "Max records in RAM.") val maxRecordsInRam: Int = SamWriter.DefaultMaxRecordsInRam @@ -65,14 +67,20 @@ class NormalizeCoverage( val coverage: mutable.Map[String, mutable.IndexedSeq[Short]] = mutable.Map() def initialize(bam: PathToBam, intervalsMaybe: Option[PathToIntervals]): Unit = { - val in = SamSource(bam) - - //QUESTIOM: do I need to close and 'f'? - overlapDetector = intervalsMaybe.map(f => IntervalListSource(f)) - .map(i => OverlapDetector.create[Interval](i.toIntervalList.getIntervals)) - .getOrElse(makeOverlapDetectorFromDictionary(in.dict)) - in.dict.foreach(seq => coverage.update(seq.name, mutable.IndexedSeq[Short](seq.length.toShort))) - in.close() + Using(SamSource(bam)) { in => { + + Using.Manager { ils => { + overlapDetector = intervalsMaybe.map(f => ils(IntervalListSource(f))) + .map(i => OverlapDetector.create[Interval](i.toIntervalList.getIntervals)) + .getOrElse(makeOverlapDetectorFromDictionary(in.dict)) + in.dict.foreach(seq => { + coverage(seq.name) = mutable.IndexedSeq.fill(seq.length)(0.toShort); + } + ) + } + } + } + } } def union(i1: Locatable, i2: Locatable): Locatable = { @@ -114,18 +122,19 @@ class NormalizeCoverage( val cov: IndexedSeq[Short] = coverage(template.r1.get.refName).toIndexedSeq val covLocusMaybe: Option[Locatable] = templateToCoverageTuple(template) - val retVal:Short=covLocusMaybe.map[Short](covLocus => cov.slice(covLocus.getStart - 1, covLocus.getEnd).minOption[Short].getOrElse(0)).getOrElse(Short.MaxValue) - retVal + val ret = covLocusMaybe.map(covLocus => cov.slice(covLocus.getStart - 1, covLocus.getEnd).minOption.getOrElse(0.toShort)).getOrElse(Short.MaxValue) + ret } + // increments the coverage over the range of the template. + // will not increment beyond Short.MaxValue def incrementCoverage(template: Template): Unit = { val cov: mutable.IndexedSeq[Short] = coverage(template.r1.get.refName) val covLocusMaybe: Option[Locatable] = templateToCoverageTuple(template) covLocusMaybe.foreach(covLocus => Range(covLocus.getStart - 1, covLocus.getEnd) - .map(i => i.toShort) - .foreach(j => cov.update(j, (cov(j) + 1).toShort))) + .foreach(j => cov.update(j, Math.min(Short.MaxValue, cov(j) + 1).toShort))) } def readFilterForCoverage(record: SamRecord): Boolean = { @@ -155,27 +164,29 @@ class NormalizeCoverage( initialize(input, targetIntervals) - val in = SamSource(input) - val out = SamWriter(output, in.header.clone(), sort = Some(sortOrder), maxRecordsInRam = maxRecordsInRam) - - val templateIterator = Bams.templateSortedIterator(in, maxRecordsInRam) - - templateIterator - //see if primary reads overlap region of interest - .filter(t => primariesOverlapTargets(t)) - //check that all the reads in the template pass minMQ - .filter(t => minMapQ(t) >= minMQ) - // only include template whose primary reads are all on the same reference - .filter(t => t.primaryReads.map(r => r.refName).distinct.length == 1) - //only consider reads that cover regions that need coverage, i.e. that - //at least one base that they cover is less than the target - .filter(t => minCoverage(t) < coverageTarget) - // - .foreach(t => { - //increment coverages and emit template - incrementCoverage(t) - //write - t.allReads.foreach(out.write) - }) + Using.Manager { use => + val in = use(SamSource(input)) + val out = use(SamWriter(output, in.header.clone(), sort = Some(sortOrder), maxRecordsInRam = maxRecordsInRam)) + + val templateIterator = Bams.templateSortedIterator(in, maxRecordsInRam) + + templateIterator + //see if primary reads overlap region of interest + .filter(t => primariesOverlapTargets(t)) + //check that all the reads in the template pass minMQ + .filter(t => minMapQ(t) >= minMQ) + // only include template whose primary reads are all on the same reference + .filter(t => t.primaryReads.map(r => r.refName).distinct.length == 1) + //only consider reads that cover regions that need coverage, i.e. that + //at least one base that they cover is less than the target + .filter(t => minCoverage(t) < coverageTarget) + // + .foreach(t => { + //increment coverages and emit template + incrementCoverage(t) + //write + t.allReads.foreach(out.write) + }) + } } } From f9a45d7e8504a6386012ec254d97b6ad871ccf01 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Mon, 12 Dec 2022 00:04:39 -0500 Subject: [PATCH 05/16] - refactoring and trying to write tests... --- .../personal/yfarjoun/NormalizeCoverage.scala | 164 ++++++++++-------- .../NormalizeCoverageOptionsTest.scala | 39 +++++ .../yfarjoun/NormalizeCoverageTest.scala | 56 ++++++ 3 files changed, 185 insertions(+), 74 deletions(-) create mode 100644 src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageOptionsTest.scala create mode 100644 src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageTest.scala diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala index 5a737ddd2..c7bcf720e 100644 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala @@ -30,17 +30,52 @@ import com.fulcrumgenomics.bam.{Bams, Template} import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool} import com.fulcrumgenomics.commons.util.LazyLogging import com.fulcrumgenomics.fasta.SequenceDictionary -import com.fulcrumgenomics.personal.yfarjoun.NormalizeCoverageOptions.DefaultMinMapQ +import com.fulcrumgenomics.personal.yfarjoun.NormalizeCoverageOptions._ import com.fulcrumgenomics.sopt._ import com.fulcrumgenomics.util.NumericTypes.PhredScore import com.fulcrumgenomics.util.{IntervalListSource, Io} import htsjdk.samtools.util.{Interval, Locatable, OverlapDetector} -import scala.util.Using +import htsjdk.tribble.SimpleFeature +import java.util +import scala.util.Using import scala.jdk.CollectionConverters._ import scala.collection.mutable object NormalizeCoverageOptions { + + def getIntervalsFromDictionary(dict: SequenceDictionary): util.List[Interval] = { + dict.iterator.map(seq => new Interval(seq.name, 1, seq.length)).toList.asJava + } + + def minMapQ(template: Template): PhredScore = { + template.primaryReads.filter(readFilterForCoverage) + .map(_.mapq) + .minOption + .getOrElse(0) + .toByte + } + + def readFilterForCoverage(record: SamRecord): Boolean = { + record.mapped && + record.properlyPaired && + !record.duplicate && + record.pf + } + + def subsetReadToLocus(r:Locatable,overlappingIntervals: Set[Interval]): Option[Interval] = { + val unionOverlapMaybe = overlappingIntervals.reduceOption(union) + unionOverlapMaybe.map(unionOverlap => new Interval( + r.getContig, + Math.max(r.getStart, unionOverlap.getStart), + Math.min(r.getEnd, unionOverlap.getEnd))) + } + + // calculates the locus covered by two loci (assuming they are on the same reference) + def union(i1: Locatable, i2: Locatable): Locatable = { + new SimpleFeature(i1.getContig, Math.min(i1.getStart, i2.getStart), Math.max(i1.getEnd, i2.getEnd)) + } + /** Various default values for the Coverage Normalizer. */ val DefaultMinMapQ: PhredScore = 30.toByte } @@ -63,43 +98,36 @@ class NormalizeCoverage( ) extends FgBioTool with LazyLogging { - var overlapDetector: OverlapDetector[Interval] = _ + + val overlapDetector: OverlapDetector[Interval] = new OverlapDetector[Interval](0, 0) + + //keeps track of the coverage already added to output val coverage: mutable.Map[String, mutable.IndexedSeq[Short]] = mutable.Map() + //sets up the overlap detector and the coverage tracker def initialize(bam: PathToBam, intervalsMaybe: Option[PathToIntervals]): Unit = { - Using(SamSource(bam)) { in => { - - Using.Manager { ils => { - overlapDetector = intervalsMaybe.map(f => ils(IntervalListSource(f))) - .map(i => OverlapDetector.create[Interval](i.toIntervalList.getIntervals)) - .getOrElse(makeOverlapDetectorFromDictionary(in.dict)) - in.dict.foreach(seq => { - coverage(seq.name) = mutable.IndexedSeq.fill(seq.length)(0.toShort); - } - ) - } + Using.Manager { use => + val in = use(SamSource(bam)) + val intervals: util.List[Interval] = intervalsMaybe.map(f => use(IntervalListSource(f))) + .map(i => i.toIntervalList.getIntervals) + .getOrElse(getIntervalsFromDictionary(in.dict)) + overlapDetector.addAll(intervals, intervals) + + in.dict.foreach(seq => { + coverage(seq.name) = mutable.IndexedSeq.fill(seq.length)(0.toShort) } + ) } - } - } - - def union(i1: Locatable, i2: Locatable): Locatable = { - new Interval(i1.getContig, Math.min(i1.getStart, i2.getStart), Math.max(i1.getEnd, i2.getEnd)) } - def readToSubsettedLocus(r: Locatable): Option[Locatable] = { - val overlappingIntervals: Set[Interval] = overlapDetector.getOverlaps(r).asScala.toSet - val unionOverlapMaybe = overlappingIntervals.reduceOption(union) - unionOverlapMaybe.map(unionOverlap => new Interval( - r.getContig, - Math.max(r.getStart, unionOverlap.getStart), - Math.min(r.getEnd, unionOverlap.getEnd))) - } + def readToSubsettedLocus(r: Locatable): Option[Locatable] = subsetReadToLocus(r, overlapDetector.getOverlaps(r).asScala.toSet) - def templateToCoverageTuple(template: Template): Option[Locatable] = { + def templateToCoverageLocatable(template: Template): Option[Locatable] = { val locus: Locatable = template + //take primary reads .primaryReads + //make sure they are good .filter(readFilterForCoverage) @@ -120,47 +148,24 @@ class NormalizeCoverage( //returns Short.MaxValue if template doesn't overlap with requested intervals def minCoverage(template: Template): Short = { val cov: IndexedSeq[Short] = coverage(template.r1.get.refName).toIndexedSeq - val covLocusMaybe: Option[Locatable] = templateToCoverageTuple(template) + val covLocusMaybe: Option[Locatable] = templateToCoverageLocatable(template) - val ret = covLocusMaybe.map(covLocus => cov.slice(covLocus.getStart - 1, covLocus.getEnd).minOption.getOrElse(0.toShort)).getOrElse(Short.MaxValue) - ret + covLocusMaybe.map(covLocus => cov.slice(covLocus.getStart - 1, covLocus.getEnd).minOption.getOrElse(0.toShort)).getOrElse(Short.MaxValue) } // increments the coverage over the range of the template. // will not increment beyond Short.MaxValue def incrementCoverage(template: Template): Unit = { val cov: mutable.IndexedSeq[Short] = coverage(template.r1.get.refName) - val covLocusMaybe: Option[Locatable] = templateToCoverageTuple(template) + val covLocusMaybe: Option[Locatable] = templateToCoverageLocatable(template) covLocusMaybe.foreach(covLocus => Range(covLocus.getStart - 1, covLocus.getEnd) .foreach(j => cov.update(j, Math.min(Short.MaxValue, cov(j) + 1).toShort))) } - def readFilterForCoverage(record: SamRecord): Boolean = { - record.mapped && - record.properlyPaired && - !record.duplicate && - record.pf - } - - def minMapQ(template: Template): PhredScore = { - template.primaryReads.filter(readFilterForCoverage) - .map(_.mapq) - .minOption - .getOrElse(0) - .toByte - } - - def makeOverlapDetectorFromDictionary(dict: SequenceDictionary): OverlapDetector[Interval] = { - OverlapDetector.create[Interval](dict.iterator.map(seq => new Interval(seq.name, 1, seq.length)).toList.asJava) - } - override def execute(): Unit = { - Io.assertReadable(input) - Io.assertCanWriteFile(output) - targetIntervals.foreach(Io.assertReadable) - assert(coverageTarget > 0, s"Target must be >0 found $coverageTarget.") + checkArguments() initialize(input, targetIntervals) @@ -168,25 +173,36 @@ class NormalizeCoverage( val in = use(SamSource(input)) val out = use(SamWriter(output, in.header.clone(), sort = Some(sortOrder), maxRecordsInRam = maxRecordsInRam)) - val templateIterator = Bams.templateSortedIterator(in, maxRecordsInRam) - - templateIterator - //see if primary reads overlap region of interest - .filter(t => primariesOverlapTargets(t)) - //check that all the reads in the template pass minMQ - .filter(t => minMapQ(t) >= minMQ) - // only include template whose primary reads are all on the same reference - .filter(t => t.primaryReads.map(r => r.refName).distinct.length == 1) - //only consider reads that cover regions that need coverage, i.e. that - //at least one base that they cover is less than the target - .filter(t => minCoverage(t) < coverageTarget) - // - .foreach(t => { - //increment coverages and emit template - incrementCoverage(t) - //write - t.allReads.foreach(out.write) - }) + val templateIterator: Iterator[Template] = Bams.templateSortedIterator(in, maxRecordsInRam) + + templatesToReads(templateIterator).foreach(out.write) } } + + private def checkArguments(): Unit = { + Io.assertReadable(input) + Io.assertCanWriteFile(output) + targetIntervals.foreach(Io.assertReadable) + assert(coverageTarget > 0, s"Target must be >0 found $coverageTarget.") + } + + def templatesToReads(templateIterator: Iterator[Template]): Iterator[SamRecord] = { + templateIterator + //see if primary reads overlap region of interest + .filter(t => primariesOverlapTargets(t)) + //check that all the reads in the template pass minMQ + .filter(t => minMapQ(t) >= minMQ) + // only include template whose primary reads are all on the same reference + .filter(t => t.primaryReads.map(r => r.refName).distinct.length == 1) + //only consider reads that cover regions that need coverage, i.e. that + //at least one base that they cover is less than the target + .filter(t => minCoverage(t) < coverageTarget) + // + .flatMap(t => { + //increment coverages and emit template + incrementCoverage(t) + //output + t.allReads.iterator + }) + } } diff --git a/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageOptionsTest.scala b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageOptionsTest.scala new file mode 100644 index 000000000..b5e9ce213 --- /dev/null +++ b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageOptionsTest.scala @@ -0,0 +1,39 @@ +package com.fulcrumgenomics.personal.yfarjoun + +import com.fulcrumgenomics.FgBioDef.PathToBam +import com.fulcrumgenomics.testing.UnitSpec +import com.fulcrumgenomics.commons.CommonsDef.PathToBam + +class NormalizeCoverageOptionsTest extends UnitSpec { + + "BinomialDistribution.coefficient" should "calculate the correct number of combinations given k=0 or k=n" in { + + } + val nc= new NormalizeCoverage(PathToBam("/dev/null"),Path(""),Path("")) + "Union" should " correctly find the union of two locatables" in { + NormalizeCoverage.union + + + } + + test("testGetIntervalsFromDictionary") { + + } + + test("testReadFilterForCoverage") { + + } + + test("testMinMapQ") { + + } + + test("testDefaultMinMapQ") { + + } + + test("testSubsetReadToLocus") { + + } + +} diff --git a/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageTest.scala b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageTest.scala new file mode 100644 index 000000000..27065360b --- /dev/null +++ b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageTest.scala @@ -0,0 +1,56 @@ +/* + * The MIT License + * + * Copyright (c) 2022 Fulcrum Genomics LLC + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +package com.fulcrumgenomics.personal.yfarjoun + +import com.fulcrumgenomics.coord.LocatableOrderingTest._ +import com.fulcrumgenomics.fasta.{SequenceDictionary, SequenceMetadata} +import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} +import htsjdk.samtools.util.{Interval, Locatable} + +/** Companion object for [[LocatableOrderingTest]]. */ +object LocatableOrderingTest { + + /** The reference sequence name for chromosome 1. */ + private val Chr1: String = "chr1" + + /** The reference sequence name for chromosome 2. */ + private val Chr2: String = "chr2" + + /** The sequence dictionary for the ordering. */ + private val Dict: SequenceDictionary = SequenceDictionary(SequenceMetadata(Chr1, length = 10_000), SequenceMetadata(Chr2, length = 100_000)) + + private val Builder:SamBuilder = new SamBuilder(sd=Some(Dict)) + Builder.addPair() +} + +/** Unit tests for [[NormalizeCoverage]]. */ + +class NormalizeCoverageTest extends UnitSpec { + + + + + +} From 1738b25950b5ba60074734f6bacd03ec8a043e57 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Mon, 12 Dec 2022 22:58:52 -0500 Subject: [PATCH 06/16] WIP --- .../personal/yfarjoun/NormalizeCoverage.scala | 6 +- .../NormalizeCoverageOptionsTest.scala | 80 ++++++++++++++----- 2 files changed, 65 insertions(+), 21 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala index c7bcf720e..ddb9b3529 100644 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala @@ -30,7 +30,7 @@ import com.fulcrumgenomics.bam.{Bams, Template} import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool} import com.fulcrumgenomics.commons.util.LazyLogging import com.fulcrumgenomics.fasta.SequenceDictionary -import com.fulcrumgenomics.personal.yfarjoun.NormalizeCoverageOptions._ +import com.fulcrumgenomics.personal.yfarjoun.NormalizeCoverage._ import com.fulcrumgenomics.sopt._ import com.fulcrumgenomics.util.NumericTypes.PhredScore import com.fulcrumgenomics.util.{IntervalListSource, Io} @@ -42,7 +42,7 @@ import scala.util.Using import scala.jdk.CollectionConverters._ import scala.collection.mutable -object NormalizeCoverageOptions { +object NormalizeCoverage { def getIntervalsFromDictionary(dict: SequenceDictionary): util.List[Interval] = { dict.iterator.map(seq => new Interval(seq.name, 1, seq.length)).toList.asJava @@ -63,7 +63,7 @@ object NormalizeCoverageOptions { record.pf } - def subsetReadToLocus(r:Locatable,overlappingIntervals: Set[Interval]): Option[Interval] = { + def subsetReadToLocus(r:Locatable, overlappingIntervals: Set[Interval]): Option[Interval] = { val unionOverlapMaybe = overlappingIntervals.reduceOption(union) unionOverlapMaybe.map(unionOverlap => new Interval( r.getContig, diff --git a/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageOptionsTest.scala b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageOptionsTest.scala index b5e9ce213..8b94d3b47 100644 --- a/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageOptionsTest.scala +++ b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageOptionsTest.scala @@ -1,39 +1,83 @@ package com.fulcrumgenomics.personal.yfarjoun -import com.fulcrumgenomics.FgBioDef.PathToBam +//import com.fulcrumgenomics.FgBioDef.PathToBam import com.fulcrumgenomics.testing.UnitSpec -import com.fulcrumgenomics.commons.CommonsDef.PathToBam +import htsjdk.samtools.util.Locatable +//import com.fulcrumgenomics.commons.CommonsDef.PathToBam +import htsjdk.samtools.util.Interval + +import org.scalatest._ +import matchers._ + +trait LocatableMatchers { + + class SameLocusAs(right: Locatable) extends Matcher[Locatable] { + + def apply(left: Locatable) = { + MatchResult( + left.getContig.equals(right.getContig) && + left.getStart.equals(right.getStart) && + left.getEnd.equals(right.getEnd), + s"""Locatable $left is not the same as $right"""", + s"""Locatable $left is the same as $right"""" + ) + } + } -class NormalizeCoverageOptionsTest extends UnitSpec { + def haveSameLocusAs(right: Locatable) = new SameLocusAs(right) +} - "BinomialDistribution.coefficient" should "calculate the correct number of combinations given k=0 or k=n" in { +object LocatableMatchers extends LocatableMatchers - } - val nc= new NormalizeCoverage(PathToBam("/dev/null"),Path(""),Path("")) - "Union" should " correctly find the union of two locatables" in { - NormalizeCoverage.union +class NormalizeCoverageOptionsTest extends UnitSpec with LocatableMatchers { +// val nc = new NormalizeCoverage(PathToBam("/dev/null"), Path(""), Path("")) - } + val i1: Interval = new Interval("chr1", 100, 200) + val i2: Interval = new Interval("chr1", 100, 300) + val i3: Interval = new Interval("chr1", 300, 400) - test("testGetIntervalsFromDictionary") { + var union_1_2: Locatable = NormalizeCoverage.union(i1, i2) + var union_2_1: Locatable = NormalizeCoverage.union(i2, i1) + var union_1_3: Locatable = NormalizeCoverage.union(i1, i3) + var union_3_1: Locatable = NormalizeCoverage.union(i3, i1) + var union_2_3: Locatable = NormalizeCoverage.union(i2, i3) + var union_3_2: Locatable = NormalizeCoverage.union(i3, i2) - } + "Union" should " correctly find the union of two locatables" in { - test("testReadFilterForCoverage") { + union_1_2 should haveSameLocusAs(new Interval("chr1",100,300)) + union_1_3 should haveSameLocusAs(new Interval("chr1",100,400)) + union_2_3 should haveSameLocusAs(new Interval("chr1",100,400)) + } + it should "be symmetric" in { + union_1_2 should haveSameLocusAs(union_2_1) + union_1_3 should haveSameLocusAs(union_3_1) + union_3_2 should haveSameLocusAs(union_2_3) } - test("testMinMapQ") { + var subset_1_2: Option[Locatable] = NormalizeCoverage.subsetReadToLocus(i1, Set(i2)) + var subset_2_1: Option[Locatable] = NormalizeCoverage.subsetReadToLocus(i2, Set(i1)) + var subset_1_3: Option[Locatable] = NormalizeCoverage.subsetReadToLocus(i1, Set(i3)) + var subset_3_1: Option[Locatable] = NormalizeCoverage.subsetReadToLocus(i3, Set(i1)) + var subset_2_3: Option[Locatable] = NormalizeCoverage.subsetReadToLocus(i2, Set(i3)) + var subset_3_2: Option[Locatable] = NormalizeCoverage.subsetReadToLocus(i3, Set(i2)) - } + "subsetReadToLocus" should "correctly subset one locatable to another" in { + subset_1_2 should not be empty + subset_1_2.get should haveSameLocusAs(i1) - test("testDefaultMinMapQ") { + subset_2_1 should not be empty + subset_2_1.get should haveSameLocusAs(i1) - } + subset_1_3 shouldBe empty + subset_3_1 shouldBe empty - test("testSubsetReadToLocus") { + subset_2_3 should not be empty + subset_2_3.get should haveSameLocusAs(new Interval("chr1",300,300)) + subset_3_2 should not be empty + subset_3_2.get should haveSameLocusAs(new Interval("chr1",300,300)) } - } From 18ed65bb5b3684a3c15d2fdd69aa10989381f3f5 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Wed, 14 Dec 2022 20:46:22 -0500 Subject: [PATCH 07/16] - moved to Seq rather than Set - removed requirement that reads are all on the same template - now looking at actual reads, not span of the two cannonical --- .../personal/yfarjoun/NormalizeCoverage.scala | 98 +++++++++++++------ 1 file changed, 67 insertions(+), 31 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala index ddb9b3529..8b5287454 100644 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala @@ -49,7 +49,7 @@ object NormalizeCoverage { } def minMapQ(template: Template): PhredScore = { - template.primaryReads.filter(readFilterForCoverage) + template.allReads.filter(readFilterForCoverage) .map(_.mapq) .minOption .getOrElse(0) @@ -57,25 +57,56 @@ object NormalizeCoverage { } def readFilterForCoverage(record: SamRecord): Boolean = { - record.mapped && - record.properlyPaired && + !record.secondary && // I'm assuming here that secondary reads to not count towards coverage, but supplementary do + record.mapped && !record.duplicate && record.pf } - def subsetReadToLocus(r:Locatable, overlappingIntervals: Set[Interval]): Option[Interval] = { - val unionOverlapMaybe = overlappingIntervals.reduceOption(union) - unionOverlapMaybe.map(unionOverlap => new Interval( - r.getContig, - Math.max(r.getStart, unionOverlap.getStart), - Math.min(r.getEnd, unionOverlap.getEnd))) + def subsetReadToLocus(rs: Seq[Locatable], overlappingIntervals: Seq[Locatable]): Seq[Locatable] = { + val unionOverlap: Seq[Locatable] = squish(overlappingIntervals) + rs.flatMap(r => { + unionOverlap.filter(l => l.overlaps(r)) + .map(l => intersect(l, r).getOrElse(Set.empty[Locatable])) + }) + } + + def union(i1: Locatable, i2: Locatable): Set[Locatable] = { + if (i1.withinDistanceOf(i2, 1)) + Set(new SimpleFeature(i1.getContig, Math.min(i1.getStart, i2.getStart), Math.max(i1.getEnd, i2.getEnd))) + else + Set(i1, i2) } // calculates the locus covered by two loci (assuming they are on the same reference) - def union(i1: Locatable, i2: Locatable): Locatable = { - new SimpleFeature(i1.getContig, Math.min(i1.getStart, i2.getStart), Math.max(i1.getEnd, i2.getEnd)) + def union(i1: Locatable, i2: Set[Locatable]): Set[Locatable] = { + i2.flatMap(i => union(i1, i)) + } + + def intersect(i1: Locatable, i2: Locatable): Option[Locatable] = { + if (!i1.contigsMatch(i2)) + None + else + Some(new SimpleFeature( + i1.getContig, + Math.max(i1.getStart, i2.getStart), + Math.min(i1.getEnd, i2.getEnd))) + } + +// //result will not have overlapping locatables, but will cover the same original territory +// def squish(ls: IterableOnce[SamRecord]): Seq[Locatable] = { +// squish(ls.iterator.map(x=>x.asSam)) +// } + //result will not have overlapping locatables, but will cover the same original territory + def squish(ls: IterableOnce[Locatable]): Seq[Locatable] = { + ls.iterator.foldLeft(Seq.empty[Locatable])(union) + } + + def min(s1: Short, s2: Short): Short = { + s1 min s2 } + /** Various default values for the Coverage Normalizer. */ val DefaultMinMapQ: PhredScore = 30.toByte } @@ -120,49 +151,56 @@ class NormalizeCoverage( } } - def readToSubsettedLocus(r: Locatable): Option[Locatable] = subsetReadToLocus(r, overlapDetector.getOverlaps(r).asScala.toSet) + def readToSubsettedLocus(rs: Seq[Locatable]): Seq[Locatable] = { + val squished = squish(rs) + val overlapping: Seq[Locatable] = rs.flatMap(r => overlapDetector.getOverlaps(r).asScala) + subsetReadToLocus(squished, overlapping) + } - def templateToCoverageLocatable(template: Template): Option[Locatable] = { - val locus: Locatable = template + def templateToCoverageLocatable(template: Template): Seq[Locatable] = { + val locus: Seq[Locatable] = template - //take primary reads - .primaryReads + //take all reads + .allReads //make sure they are good .filter(readFilterForCoverage) //replace each read with its start/end tuple - .map[Locatable](r => r.asSam) - - //find the union of all the reads - .reduce(union) + .map[Locatable](r => r.asSam).toSeq //Subset the read to the overlapping intervals readToSubsettedLocus(locus) } + def primariesOverlapTargets(template: Template): Boolean = { - template.primaryReads.exists(r => overlapDetector.overlapsAny(r.asSam)) + template.allReads.filter(readFilterForCoverage).exists(r => overlapDetector.overlapsAny(r.asSam)) } //returns Short.MaxValue if template doesn't overlap with requested intervals def minCoverage(template: Template): Short = { - val cov: IndexedSeq[Short] = coverage(template.r1.get.refName).toIndexedSeq - val covLocusMaybe: Option[Locatable] = templateToCoverageLocatable(template) - - covLocusMaybe.map(covLocus => cov.slice(covLocus.getStart - 1, covLocus.getEnd).minOption.getOrElse(0.toShort)).getOrElse(Short.MaxValue) + //FIX + template.allReads.filter(readFilterForCoverage).map(read => { + val cov: IndexedSeq[Short] = coverage(read.refName).toIndexedSeq + val covLocusSet: Seq[Locatable] = templateToCoverageLocatable(template) + val covPerTemplate = covLocusSet.flatMap(covLocus => cov.slice(covLocus.getStart - 1, covLocus.getEnd).minOption) + covPerTemplate.reduce(min) + }).reduceOption(min) + .getOrElse(Short.MaxValue) } // increments the coverage over the range of the template. // will not increment beyond Short.MaxValue def incrementCoverage(template: Template): Unit = { - val cov: mutable.IndexedSeq[Short] = coverage(template.r1.get.refName) - val covLocusMaybe: Option[Locatable] = templateToCoverageLocatable(template) + squish(template.allReads.filter(readFilterForCoverage).map(x=>x.asSam)).foreach(read=>{ + val cov: mutable.IndexedSeq[Short] = coverage(read.getContig) + val covLocusSet: Seq[Locatable] = templateToCoverageLocatable(template) - covLocusMaybe.foreach(covLocus => + covLocusSet.foreach(covLocus => Range(covLocus.getStart - 1, covLocus.getEnd) .foreach(j => cov.update(j, Math.min(Short.MaxValue, cov(j) + 1).toShort))) - } + })} override def execute(): Unit = { checkArguments() @@ -192,8 +230,6 @@ class NormalizeCoverage( .filter(t => primariesOverlapTargets(t)) //check that all the reads in the template pass minMQ .filter(t => minMapQ(t) >= minMQ) - // only include template whose primary reads are all on the same reference - .filter(t => t.primaryReads.map(r => r.refName).distinct.length == 1) //only consider reads that cover regions that need coverage, i.e. that //at least one base that they cover is less than the target .filter(t => minCoverage(t) < coverageTarget) From cb69b8400b7cb26546b42779080fcacaac72ed80 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Wed, 14 Dec 2022 22:29:28 -0500 Subject: [PATCH 08/16] - randomize reads. --- .../scala/com/fulcrumgenomics/bam/Bams.scala | 19 ++++- .../fulcrumgenomics/bam/RandomizeBam.scala | 59 ++++++++------ .../personal/yfarjoun/NormalizeCoverage.scala | 78 +++++++++---------- .../personal/yfarjoun/RandomTemplate.scala | 45 +++++++++++ .../NormalizeCoverageOptionsTest.scala | 40 +++++----- 5 files changed, 153 insertions(+), 88 deletions(-) create mode 100644 src/main/scala/com/fulcrumgenomics/personal/yfarjoun/RandomTemplate.scala diff --git a/src/main/scala/com/fulcrumgenomics/bam/Bams.scala b/src/main/scala/com/fulcrumgenomics/bam/Bams.scala index 9b5c257bb..a9f7cfd4d 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/Bams.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/Bams.scala @@ -301,21 +301,32 @@ object Bams extends LazyLogging { * @param tmpDir an optional temp directory to use for temporary sorting files if needed * @return an Iterator of Template objects */ + def templateIterator(iterator: Iterator[SamRecord], header: SAMFileHeader, maxInMemory: Int, tmpDir: DirPath): SelfClosingIterator[Template] = { val queryIterator = queryGroupedIterator(iterator, header, maxInMemory, tmpDir) + templateIteratorPresorted(queryIterator) + } + + /** + * Converts a Iterator of SamRecoreds to an iterator of Template without sorting. + * + * @param iterator + */ + def templateIteratorPresorted(readsIterator: SelfClosingIterator[SamRecord]): SelfClosingIterator[Template] = { + + val templatesIterator = new Iterator[Template] { + override def hasNext: Boolean = readsIterator.hasNext - val iter = new Iterator[Template] { - override def hasNext: Boolean = queryIterator.hasNext override def next(): Template = { require(hasNext, "next() called on empty iterator") - Template(queryIterator) + Template(readsIterator) } } - new SelfClosingIterator(iter, () => queryIterator.close()) + new SelfClosingIterator(templatesIterator, () => readsIterator.close()) } /** Return an iterator over records sorted and grouped into [[Template]] objects. Although a queryname sort is diff --git a/src/main/scala/com/fulcrumgenomics/bam/RandomizeBam.scala b/src/main/scala/com/fulcrumgenomics/bam/RandomizeBam.scala index 7c69134f6..eac75ea89 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/RandomizeBam.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/RandomizeBam.scala @@ -28,59 +28,70 @@ package com.fulcrumgenomics.bam * Created by tfenne on 6/23/16. */ -// import com.fulcrumgenomics.FgBioDef._ - - import com.fulcrumgenomics.FgBioDef._ +import com.fulcrumgenomics.bam.RandomizeBam.Randomize import com.fulcrumgenomics.bam.api._ import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool} -import com.fulcrumgenomics.commons.util.LazyLogging +import com.fulcrumgenomics.commons.util.{LazyLogging, Logger} import com.fulcrumgenomics.sopt.{arg, clp} import com.fulcrumgenomics.util.{Io, ProgressLogger, Sorter} +import htsjdk.samtools.SAMFileHeader import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder} +import htsjdk.samtools.SBIIndex.Header import htsjdk.samtools.util.Murmur3 -@clp(group=ClpGroups.SamOrBam, description= -""" - |Randomizes the order of reads in a SAM or BAM file. Randomization is done by sorting - |on a hash of the `queryname` (and bases and quals if not query-grouping). By default - |reads with the same query name are grouped together in the output file; this can be - |turned off by specifying --query-group=false. -""") -class RandomizeBam( - @arg(flag='i', doc="The input SAM or BAM file.") val input: PathToBam = Io.StdIn, - @arg(flag='o', doc="The output SAM or BAM file.") val output: PathToBam = Io.StdOut, - @arg(flag='s', doc="Random seed.") val seed: Int = 42, - @arg(flag='q', doc="Group together reads by queryname.") val queryGroup: Boolean = true -) extends FgBioTool with LazyLogging { +object RandomizeBam { - Io.assertCanWriteFile(input) - Io.assertCanWriteFile(output) - override def execute(): Unit = { - val in = SamSource(input) + def Randomize(in: Iterator[SamRecord], header: SAMFileHeader, seed: Int, queryGroup: Boolean, logger: Logger):Sorter[SamRecord, _] = { val hasher = new Murmur3(seed) val sorter = { if (queryGroup) { val f = (r: SamRecord) => SamOrder.RandomQueryKey(hasher.hashUnencodedChars(r.name), r.name, r.asSam.getFlags) - new Sorter(2e6.toInt, new SamRecordCodec(in.header), f) + new Sorter(2e6.toInt, new SamRecordCodec(header), f) } else { val f = (r: SamRecord) => { val key = s"${r.name}:${r.basesString}:${r.qualsString}" SamOrder.RandomKey(hasher.hashUnencodedChars(key), r.asSam.getFlags) } - new Sorter(2e6.toInt, new SamRecordCodec(in.header), f) + new Sorter(2e6.toInt, new SamRecordCodec(header), f) } } logger.info("Sorting reads into random order.") - val sortProgress = ProgressLogger(logger, verb="sorted", unit=5e6.toInt) + val sortProgress = ProgressLogger(logger, verb = "sorted", unit = 5e6.toInt) in.foreach(rec => { sorter += rec sortProgress.record() }) + sorter + } +} + + +@clp(group = ClpGroups.SamOrBam, description = + """ + |Randomizes the order of reads in a SAM or BAM file. Randomization is done by sorting + |on a hash of the `queryname` (and bases and quals and flags if not query-grouping). By default + |reads with the same query name are grouped together in the output file; this can be + |turned off by specifying --query-group=false. +""") +class RandomizeBam( + @arg(flag = 'i', doc = "The input SAM or BAM file.") val input: PathToBam = Io.StdIn, + @arg(flag = 'o', doc = "The output SAM or BAM file.") val output: PathToBam = Io.StdOut, + @arg(flag = 's', doc = "Random seed.") val seed: Int = 42, + @arg(flag = 'q', doc = "Group together reads by queryname.") val queryGroup: Boolean = true + ) extends FgBioTool with LazyLogging { + + Io.assertCanWriteFile(input) + Io.assertCanWriteFile(output) + + override def execute(): Unit = { + val in = SamSource(input) + + val sorter = Randomize(in.iterator, in.header, seed, queryGroup, this.logger) logger.info("Writing reads out to output.") val outHeader = in.header.clone() diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala index 8b5287454..d4c30a1a1 100644 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala @@ -25,8 +25,8 @@ package com.fulcrumgenomics.personal.yfarjoun import com.fulcrumgenomics.FgBioDef._ +import com.fulcrumgenomics.bam.Template import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord, SamSource, SamWriter} -import com.fulcrumgenomics.bam.{Bams, Template} import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool} import com.fulcrumgenomics.commons.util.LazyLogging import com.fulcrumgenomics.fasta.SequenceDictionary @@ -38,12 +38,13 @@ import htsjdk.samtools.util.{Interval, Locatable, OverlapDetector} import htsjdk.tribble.SimpleFeature import java.util -import scala.util.Using -import scala.jdk.CollectionConverters._ import scala.collection.mutable +import scala.jdk.CollectionConverters._ +import scala.util.Using object NormalizeCoverage { + def getIntervalsFromDictionary(dict: SequenceDictionary): util.List[Interval] = { dict.iterator.map(seq => new Interval(seq.name, 1, seq.length)).toList.asJava } @@ -66,21 +67,20 @@ object NormalizeCoverage { def subsetReadToLocus(rs: Seq[Locatable], overlappingIntervals: Seq[Locatable]): Seq[Locatable] = { val unionOverlap: Seq[Locatable] = squish(overlappingIntervals) rs.flatMap(r => { - unionOverlap.filter(l => l.overlaps(r)) - .map(l => intersect(l, r).getOrElse(Set.empty[Locatable])) + unionOverlap.filter(l => l.overlaps(r)).flatMap(l => intersect(l, r)) }) } - def union(i1: Locatable, i2: Locatable): Set[Locatable] = { + def union(i1: Locatable, i2: Locatable): Seq[Locatable] = { if (i1.withinDistanceOf(i2, 1)) - Set(new SimpleFeature(i1.getContig, Math.min(i1.getStart, i2.getStart), Math.max(i1.getEnd, i2.getEnd))) + Seq(new SimpleFeature(i1.getContig, Math.min(i1.getStart, i2.getStart), Math.max(i1.getEnd, i2.getEnd))) else - Set(i1, i2) + Seq(i1, i2) } // calculates the locus covered by two loci (assuming they are on the same reference) - def union(i1: Locatable, i2: Set[Locatable]): Set[Locatable] = { - i2.flatMap(i => union(i1, i)) + def union(i1: Seq[Locatable], i2: Locatable): Seq[Locatable] = { + i1.flatMap(i => union(i, i2)) } def intersect(i1: Locatable, i2: Locatable): Option[Locatable] = { @@ -93,10 +93,6 @@ object NormalizeCoverage { Math.min(i1.getEnd, i2.getEnd))) } -// //result will not have overlapping locatables, but will cover the same original territory -// def squish(ls: IterableOnce[SamRecord]): Seq[Locatable] = { -// squish(ls.iterator.map(x=>x.asSam)) -// } //result will not have overlapping locatables, but will cover the same original territory def squish(ls: IterableOnce[Locatable]): Seq[Locatable] = { ls.iterator.foldLeft(Seq.empty[Locatable])(union) @@ -106,7 +102,6 @@ object NormalizeCoverage { s1 min s2 } - /** Various default values for the Coverage Normalizer. */ val DefaultMinMapQ: PhredScore = 30.toByte } @@ -193,14 +188,34 @@ class NormalizeCoverage( // increments the coverage over the range of the template. // will not increment beyond Short.MaxValue def incrementCoverage(template: Template): Unit = { - squish(template.allReads.filter(readFilterForCoverage).map(x=>x.asSam)).foreach(read=>{ - val cov: mutable.IndexedSeq[Short] = coverage(read.getContig) - val covLocusSet: Seq[Locatable] = templateToCoverageLocatable(template) + squish(template.allReads.filter(readFilterForCoverage).map(x => x.asSam)).foreach(read => { + val cov: mutable.IndexedSeq[Short] = coverage(read.getContig) + val covLocusSet: Seq[Locatable] = templateToCoverageLocatable(template) + + covLocusSet.foreach(covLocus => + Range(covLocus.getStart - 1, covLocus.getEnd) + .foreach(j => cov.update(j, Math.min(Short.MaxValue, cov(j) + 1).toShort))) + }) + } - covLocusSet.foreach(covLocus => - Range(covLocus.getStart - 1, covLocus.getEnd) - .foreach(j => cov.update(j, Math.min(Short.MaxValue, cov(j) + 1).toShort))) - })} + + def templatesToReads(templateIterator: Iterator[Template]): Iterator[SamRecord] = { + templateIterator + //see if primary reads overlap region of interest + .filter(t => primariesOverlapTargets(t)) + //check that all the reads in the template pass minMQ + .filter(t => minMapQ(t) >= minMQ) + //only consider reads that cover regions that need coverage, i.e. that + //at least one base that they cover is less than the target + .filter(t => minCoverage(t) < coverageTarget) + // + .flatMap(t => { + //increment coverages and emit template + incrementCoverage(t) + //output + t.allReads.iterator + }) + } override def execute(): Unit = { checkArguments() @@ -211,7 +226,7 @@ class NormalizeCoverage( val in = use(SamSource(input)) val out = use(SamWriter(output, in.header.clone(), sort = Some(sortOrder), maxRecordsInRam = maxRecordsInRam)) - val templateIterator: Iterator[Template] = Bams.templateSortedIterator(in, maxRecordsInRam) + val templateIterator: Iterator[Template] = RandomTemplate.templateSortedIterator(in.iterator, in.header, maxRecordsInRam) templatesToReads(templateIterator).foreach(out.write) } @@ -224,21 +239,4 @@ class NormalizeCoverage( assert(coverageTarget > 0, s"Target must be >0 found $coverageTarget.") } - def templatesToReads(templateIterator: Iterator[Template]): Iterator[SamRecord] = { - templateIterator - //see if primary reads overlap region of interest - .filter(t => primariesOverlapTargets(t)) - //check that all the reads in the template pass minMQ - .filter(t => minMapQ(t) >= minMQ) - //only consider reads that cover regions that need coverage, i.e. that - //at least one base that they cover is less than the target - .filter(t => minCoverage(t) < coverageTarget) - // - .flatMap(t => { - //increment coverages and emit template - incrementCoverage(t) - //output - t.allReads.iterator - }) - } } diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/RandomTemplate.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/RandomTemplate.scala new file mode 100644 index 000000000..3d9a34bd2 --- /dev/null +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/RandomTemplate.scala @@ -0,0 +1,45 @@ +package com.fulcrumgenomics.personal.yfarjoun + +import com.fulcrumgenomics.FgBioDef.DirPath +import com.fulcrumgenomics.bam.Bams.templateIterator +import com.fulcrumgenomics.bam.{RandomizeBam, Template} +import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord, SamRecordCodec} +import com.fulcrumgenomics.commons.collection.SelfClosingIterator +import com.fulcrumgenomics.commons.util.Logger +import com.fulcrumgenomics.util.Sorter +import htsjdk.samtools.SAMFileHeader +import htsjdk.samtools.util.Murmur3 + +object RandomTemplate { + + /** Return an iterator over records sorted and grouped into [[Template]] objects. Although a queryname sort is + * guaranteed, the sort order may not be consistent with other queryname sorting implementations, especially in other + * tool kits. See [[templateIterator]] for a [[Template]] iterator which emits templates in a non-guaranteed sort + * order. + * + * @see [[templateIterator]] + * + * @param iterator an iterator from which to consume records + * @param header the header associated with the records + * @param maxInMemory the maximum number of records to keep and sort in memory, if sorting is needed + * @param tmpDir a temp directory to use for temporary sorting files if sorting is needed + * @return an Iterator of queryname sorted Template objects + */ + def templateSortedIterator(iterator: Iterator[SamRecord], + header: SAMFileHeader, + maxInMemory: Int): SelfClosingIterator[Template] = { + + val sorter = RandomizeBam.Randomize(iterator, header, 42, true, new Logger) + val sortedIterator = sorter.iterator + + val iter = new Iterator[Template] { + override def hasNext: Boolean = sortedIterator.hasNext + override def next(): Template = { + require(hasNext, "next() called on empty iterator") + Template(sortedIterator) + } + } + + new SelfClosingIterator[Template](iter, ()=>{sorter.close()}) + } +} diff --git a/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageOptionsTest.scala b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageOptionsTest.scala index 8b94d3b47..d088968e8 100644 --- a/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageOptionsTest.scala +++ b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageOptionsTest.scala @@ -37,12 +37,12 @@ class NormalizeCoverageOptionsTest extends UnitSpec with LocatableMatchers { val i2: Interval = new Interval("chr1", 100, 300) val i3: Interval = new Interval("chr1", 300, 400) - var union_1_2: Locatable = NormalizeCoverage.union(i1, i2) - var union_2_1: Locatable = NormalizeCoverage.union(i2, i1) - var union_1_3: Locatable = NormalizeCoverage.union(i1, i3) - var union_3_1: Locatable = NormalizeCoverage.union(i3, i1) - var union_2_3: Locatable = NormalizeCoverage.union(i2, i3) - var union_3_2: Locatable = NormalizeCoverage.union(i3, i2) + var union_1_2: Locatable = NormalizeCoverage.union(i1, i2).head + var union_2_1: Locatable = NormalizeCoverage.union(i2, i1).head + var union_1_3: Locatable = NormalizeCoverage.union(i1, i3).head + var union_3_1: Locatable = NormalizeCoverage.union(i3, i1).head + var union_2_3: Locatable = NormalizeCoverage.union(i2, i3).head + var union_3_2: Locatable = NormalizeCoverage.union(i3, i2).head "Union" should " correctly find the union of two locatables" in { @@ -57,27 +57,27 @@ class NormalizeCoverageOptionsTest extends UnitSpec with LocatableMatchers { union_3_2 should haveSameLocusAs(union_2_3) } - var subset_1_2: Option[Locatable] = NormalizeCoverage.subsetReadToLocus(i1, Set(i2)) - var subset_2_1: Option[Locatable] = NormalizeCoverage.subsetReadToLocus(i2, Set(i1)) - var subset_1_3: Option[Locatable] = NormalizeCoverage.subsetReadToLocus(i1, Set(i3)) - var subset_3_1: Option[Locatable] = NormalizeCoverage.subsetReadToLocus(i3, Set(i1)) - var subset_2_3: Option[Locatable] = NormalizeCoverage.subsetReadToLocus(i2, Set(i3)) - var subset_3_2: Option[Locatable] = NormalizeCoverage.subsetReadToLocus(i3, Set(i2)) + var subset_1_2: Seq[Locatable] = NormalizeCoverage.subsetReadToLocus(Seq(i1), Seq(i2)) + var subset_2_1: Seq[Locatable] = NormalizeCoverage.subsetReadToLocus(Seq(i2), Seq(i1)) + var subset_1_3: Seq[Locatable] = NormalizeCoverage.subsetReadToLocus(Seq(i1), Seq(i3)) + var subset_3_1: Seq[Locatable] = NormalizeCoverage.subsetReadToLocus(Seq(i3), Seq(i1)) + var subset_2_3: Seq[Locatable] = NormalizeCoverage.subsetReadToLocus(Seq(i2), Seq(i3)) + var subset_3_2: Seq[Locatable] = NormalizeCoverage.subsetReadToLocus(Seq(i3), Seq(i2)) "subsetReadToLocus" should "correctly subset one locatable to another" in { - subset_1_2 should not be empty - subset_1_2.get should haveSameLocusAs(i1) + subset_1_2 should have size 1 + subset_1_2.head should haveSameLocusAs(i1) - subset_2_1 should not be empty - subset_2_1.get should haveSameLocusAs(i1) + subset_2_1 should have size 1 + subset_2_1.head should haveSameLocusAs(i1) subset_1_3 shouldBe empty subset_3_1 shouldBe empty - subset_2_3 should not be empty - subset_2_3.get should haveSameLocusAs(new Interval("chr1",300,300)) + subset_2_3 should have size 1 + subset_2_3.head should haveSameLocusAs(new Interval("chr1",300,300)) - subset_3_2 should not be empty - subset_3_2.get should haveSameLocusAs(new Interval("chr1",300,300)) + subset_3_2 should have size 1 + subset_3_2.head should haveSameLocusAs(new Interval("chr1",300,300)) } } From 2dc21d18855bd9b74fc5d1921ece510843a7375a Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Wed, 14 Dec 2022 22:30:42 -0500 Subject: [PATCH 09/16] - logger --- .../personal/yfarjoun/NormalizeCoverage.scala | 5 ++++- .../fulcrumgenomics/personal/yfarjoun/RandomTemplate.scala | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala index d4c30a1a1..783ee43e4 100644 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala @@ -226,7 +226,10 @@ class NormalizeCoverage( val in = use(SamSource(input)) val out = use(SamWriter(output, in.header.clone(), sort = Some(sortOrder), maxRecordsInRam = maxRecordsInRam)) - val templateIterator: Iterator[Template] = RandomTemplate.templateSortedIterator(in.iterator, in.header, maxRecordsInRam) + val templateIterator: Iterator[Template] = RandomTemplate.templateSortedIterator(in.iterator, + in.header, + maxRecordsInRam, + logger) templatesToReads(templateIterator).foreach(out.write) } diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/RandomTemplate.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/RandomTemplate.scala index 3d9a34bd2..f35003d3b 100644 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/RandomTemplate.scala +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/RandomTemplate.scala @@ -27,9 +27,10 @@ object RandomTemplate { */ def templateSortedIterator(iterator: Iterator[SamRecord], header: SAMFileHeader, - maxInMemory: Int): SelfClosingIterator[Template] = { + maxInMemory: Int, + logger: Logger): SelfClosingIterator[Template] = { - val sorter = RandomizeBam.Randomize(iterator, header, 42, true, new Logger) + val sorter = RandomizeBam.Randomize(iterator, header, 42, true,logger) val sortedIterator = sorter.iterator val iter = new Iterator[Template] { From c48213dccb06ea3c730772d158b1598e0df9ba93 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Thu, 15 Dec 2022 23:10:41 -0500 Subject: [PATCH 10/16] - documentation (start) --- .../personal/yfarjoun/NormalizeCoverage.scala | 55 ++++++++++++++----- 1 file changed, 42 insertions(+), 13 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala index 783ee43e4..e0a3fe047 100644 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala @@ -93,11 +93,18 @@ object NormalizeCoverage { Math.min(i1.getEnd, i2.getEnd))) } - //result will not have overlapping locatables, but will cover the same original territory + /** + * Takes an iterator of Locatables and reduces it to a non-overlapping Seq of the same. + * + * Result will not have overlapping locatables, but will cover the same original territory + */ def squish(ls: IterableOnce[Locatable]): Seq[Locatable] = { ls.iterator.foldLeft(Seq.empty[Locatable])(union) } + /** + * Gets the minimum value of two input Shorts. Used for left-folding + */ def min(s1: Short, s2: Short): Short = { s1 min s2 } @@ -146,12 +153,26 @@ class NormalizeCoverage( } } + /** + * Take a Seq of Locatables and converts them to a set of non-overlapping Locatables that overlap + * the region of interset. + * + * @param rs + * @return + */ def readToSubsettedLocus(rs: Seq[Locatable]): Seq[Locatable] = { val squished = squish(rs) val overlapping: Seq[Locatable] = rs.flatMap(r => overlapDetector.getOverlaps(r).asScala) subsetReadToLocus(squished, overlapping) } + /** + * reduces an input Template to a Seq of locatables that are non-overlapping and cover the ovelap between + * the region of interest and the the template's coverage-worthy reads. + * + * @param template + * @return a Seq[Template] + */ def templateToCoverageLocatable(template: Template): Seq[Locatable] = { val locus: Seq[Locatable] = template @@ -168,25 +189,35 @@ class NormalizeCoverage( readToSubsettedLocus(locus) } - + /** + * For an input template, returns True if one of its coverage-worthy reads overlap the region of interest + * + * @param template input Template + * @return True only if one of the template's coverage-worthy reads coincide with the region of interest + */ def primariesOverlapTargets(template: Template): Boolean = { template.allReads.filter(readFilterForCoverage).exists(r => overlapDetector.overlapsAny(r.asSam)) } - //returns Short.MaxValue if template doesn't overlap with requested intervals + /** + * provides the minimum value in the coverage map in the region that is both + * a region of interest, and covered by one of coverage-worthy reads in the template. + * + * returns Short.MaxValue if template doesn't overlap with requested intervals + */ def minCoverage(template: Template): Short = { - //FIX template.allReads.filter(readFilterForCoverage).map(read => { val cov: IndexedSeq[Short] = coverage(read.refName).toIndexedSeq val covLocusSet: Seq[Locatable] = templateToCoverageLocatable(template) val covPerTemplate = covLocusSet.flatMap(covLocus => cov.slice(covLocus.getStart - 1, covLocus.getEnd).minOption) covPerTemplate.reduce(min) - }).reduceOption(min) - .getOrElse(Short.MaxValue) + }).reduceOption(min) getOrElse Short.MaxValue } - // increments the coverage over the range of the template. - // will not increment beyond Short.MaxValue + /** + * Increments the coverage map over the range of the relevant reads of the template. + * Will not increment beyond Short.MaxValue to avoid overflow + */ def incrementCoverage(template: Template): Unit = { squish(template.allReads.filter(readFilterForCoverage).map(x => x.asSam)).foreach(read => { val cov: mutable.IndexedSeq[Short] = coverage(read.getContig) @@ -206,11 +237,10 @@ class NormalizeCoverage( //check that all the reads in the template pass minMQ .filter(t => minMapQ(t) >= minMQ) //only consider reads that cover regions that need coverage, i.e. that - //at least one base that they cover is less than the target + //at least one base that they cover is less than the target coverage .filter(t => minCoverage(t) < coverageTarget) - // + //increment coverages and emit reads from template .flatMap(t => { - //increment coverages and emit template incrementCoverage(t) //output t.allReads.iterator @@ -239,7 +269,6 @@ class NormalizeCoverage( Io.assertReadable(input) Io.assertCanWriteFile(output) targetIntervals.foreach(Io.assertReadable) - assert(coverageTarget > 0, s"Target must be >0 found $coverageTarget.") + assert(coverageTarget > 0, s"Target must be >0. Found $coverageTarget.") } - } From 1ab215a745a2ad414c9432bb287a255e022e96ed Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Thu, 22 Dec 2022 01:04:29 -0500 Subject: [PATCH 11/16] - cleaned up code. - added some tests - made slices work... --- .../scala/com/fulcrumgenomics/bam/Bams.scala | 47 ++-- .../fulcrumgenomics/bam/RandomizeBam.scala | 13 +- .../personal/yfarjoun/CoverageManager.scala | 133 +++++++++++ .../personal/yfarjoun/LocusTrack.scala | 56 +++++ .../personal/yfarjoun/NormalizeCoverage.scala | 220 ++++-------------- .../personal/yfarjoun/RandomTemplate.scala | 46 ---- .../yfarjoun/CoverageManagerTest.scala | 62 +++++ .../NormalizeCoverageOptionsTest.scala | 68 ++---- 8 files changed, 355 insertions(+), 290 deletions(-) create mode 100644 src/main/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManager.scala create mode 100644 src/main/scala/com/fulcrumgenomics/personal/yfarjoun/LocusTrack.scala delete mode 100644 src/main/scala/com/fulcrumgenomics/personal/yfarjoun/RandomTemplate.scala create mode 100644 src/test/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManagerTest.scala diff --git a/src/main/scala/com/fulcrumgenomics/bam/Bams.scala b/src/main/scala/com/fulcrumgenomics/bam/Bams.scala index a9f7cfd4d..339933a63 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/Bams.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/Bams.scala @@ -301,32 +301,21 @@ object Bams extends LazyLogging { * @param tmpDir an optional temp directory to use for temporary sorting files if needed * @return an Iterator of Template objects */ - def templateIterator(iterator: Iterator[SamRecord], header: SAMFileHeader, maxInMemory: Int, tmpDir: DirPath): SelfClosingIterator[Template] = { val queryIterator = queryGroupedIterator(iterator, header, maxInMemory, tmpDir) - templateIteratorPresorted(queryIterator) - } - - /** - * Converts a Iterator of SamRecoreds to an iterator of Template without sorting. - * - * @param iterator - */ - def templateIteratorPresorted(readsIterator: SelfClosingIterator[SamRecord]): SelfClosingIterator[Template] = { - - val templatesIterator = new Iterator[Template] { - override def hasNext: Boolean = readsIterator.hasNext + val iter = new Iterator[Template] { + override def hasNext: Boolean = queryIterator.hasNext override def next(): Template = { require(hasNext, "next() called on empty iterator") - Template(readsIterator) + Template(queryIterator) } } - new SelfClosingIterator(templatesIterator, () => readsIterator.close()) + new SelfClosingIterator(iter, () => queryIterator.close()) } /** Return an iterator over records sorted and grouped into [[Template]] objects. Although a queryname sort is @@ -377,6 +366,34 @@ object Bams extends LazyLogging { new SelfClosingIterator(_iterator, () => queryIterator.close()) } + /** Return an iterator over records sorted and grouped into [[Template]] objects. Sort order is deterministic, but + * random (based on queryname). See [[templateIterator]] for a [[Template]] iterator which emits templates in a + * non-guaranteed sort order. + * + * @see [[templateIterator]] + * @param iterator an iterator from which to consume records + * @param header the header associated with the records + * @param maxInMemory the maximum number of records to keep and sort in memory, if sorting is needed + * @param tmpDir a temp directory to use for temporary sorting files if sorting is needed + * @return an Iterator of queryname sorted Template objects + */ + def queryGroupedRandomIterator(iterator: Iterator[SamRecord], + header: SAMFileHeader, + maxInMemory: Int, + tmpDir:DirPath=Io.tmpDir): SelfClosingIterator[Template] = { + + val randomSeed=42 + val sorter = RandomizeBam.Randomize(iterator, header, randomSeed, maxInMemory, queryGroup = true, tmpDir) + val sortedIterator = sorter.iterator + + val sortedHead = header.clone() + sortedHead.setGroupOrder(GroupOrder.query) + sortedHead.setSortOrder(SortOrder.unsorted) + + Bams.templateIterator(sortedIterator, sortedHead, maxInMemory, tmpDir) + + } + /** Returns an iterator over the records in the given iterator such that the order of the records returned is * determined by the value of the given SAM tag, which can optionally be transformed. * diff --git a/src/main/scala/com/fulcrumgenomics/bam/RandomizeBam.scala b/src/main/scala/com/fulcrumgenomics/bam/RandomizeBam.scala index eac75ea89..c7143025b 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/RandomizeBam.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/RandomizeBam.scala @@ -37,26 +37,25 @@ import com.fulcrumgenomics.sopt.{arg, clp} import com.fulcrumgenomics.util.{Io, ProgressLogger, Sorter} import htsjdk.samtools.SAMFileHeader import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder} -import htsjdk.samtools.SBIIndex.Header import htsjdk.samtools.util.Murmur3 object RandomizeBam { - def Randomize(in: Iterator[SamRecord], header: SAMFileHeader, seed: Int, queryGroup: Boolean, logger: Logger):Sorter[SamRecord, _] = { + def Randomize(in: Iterator[SamRecord], header: SAMFileHeader, seed: Int, maxNumInRam:Int, queryGroup: Boolean, tmpDir:DirPath= Io.tmpDir): Sorter[SamRecord, _] = { val hasher = new Murmur3(seed) - + val logger: Logger = new Logger(this.getClass) val sorter = { if (queryGroup) { val f = (r: SamRecord) => SamOrder.RandomQueryKey(hasher.hashUnencodedChars(r.name), r.name, r.asSam.getFlags) - new Sorter(2e6.toInt, new SamRecordCodec(header), f) + new Sorter(maxNumInRam, new SamRecordCodec(header), f, tmpDir) } else { val f = (r: SamRecord) => { val key = s"${r.name}:${r.basesString}:${r.qualsString}" SamOrder.RandomKey(hasher.hashUnencodedChars(key), r.asSam.getFlags) } - new Sorter(2e6.toInt, new SamRecordCodec(header), f) + new Sorter(maxNumInRam, new SamRecordCodec(header), f, tmpDir) } } @@ -85,13 +84,13 @@ class RandomizeBam( @arg(flag = 'q', doc = "Group together reads by queryname.") val queryGroup: Boolean = true ) extends FgBioTool with LazyLogging { - Io.assertCanWriteFile(input) + Io.assertReadable(input) Io.assertCanWriteFile(output) override def execute(): Unit = { val in = SamSource(input) - val sorter = Randomize(in.iterator, in.header, seed, queryGroup, this.logger) + val sorter = Randomize(in.iterator, in.header, seed, 2e6.toInt, queryGroup) logger.info("Writing reads out to output.") val outHeader = in.header.clone() diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManager.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManager.scala new file mode 100644 index 000000000..75e911e5f --- /dev/null +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManager.scala @@ -0,0 +1,133 @@ +package com.fulcrumgenomics.personal.yfarjoun + +import com.fulcrumgenomics.FgBioDef.javaIterableToIterator +import com.fulcrumgenomics.bam.Template +import com.fulcrumgenomics.bam.api.SamRecord +import com.fulcrumgenomics.personal.yfarjoun.CoverageManager.{coverageReadsFromTemplate, readFilterForCoverage} +import com.fulcrumgenomics.util.NumericTypes.PhredScore +import htsjdk.samtools.util.{Interval, IntervalList, OverlapDetector} + +import scala.jdk.CollectionConverters.{IteratorHasAsJava, IteratorHasAsScala} + +object CoverageManager { + + def minMapQ(template: Template): PhredScore = { + template.allReads.filter(readFilterForCoverage) + .map(_.mapq) + .minOption + .getOrElse(0) + .toByte + } + + def readFilterForCoverage(record: SamRecord): Boolean = { + !record.secondary && // I'm assuming here that secondary reads to not count towards coverage, but supplementary do + record.mapped && + !record.duplicate && + record.pf + } + + /** + * Takes an iterator of Intervals and reduces it to a non-overlapping Seq of the same. + * + * Result will not have overlapping Intervals, but will cover the same original territory + */ + def squish(ls: IterableOnce[Interval]): IterableOnce[Interval] = { + new IntervalList.IntervalMergerIterator(ls.iterator.asJava,true,false,false) + }.asScala + + /** + * Gets the minimum value of two input Shorts. Used for left-folding + */ + def min(s1: Short, s2: Short): Short = { + s1 min s2 + } + + def coverageReadsFromTemplate(t: Template): Iterator[SamRecord] = { + t.allReads.filter(readFilterForCoverage) + } +} + +/** + * + */ +class CoverageManager(val intervals: IntervalList) { + + private val coverage: OverlapDetector[LocusTrack] = new OverlapDetector[LocusTrack](0, 0) + + IntervalList.getUniqueIntervals(intervals, false) + .foreach(i => coverage.addLhs(new LocusTrack(i), i)) + + /** + * For an input template, returns True if one of its coverage-worthy reads overlap the region of interest + * + * @param template input Template + * @return True only if one of the template's coverage-worthy reads coincide with the region of interest + */ + def primariesOverlapTargets(template: Template): Boolean = { + coverageReadsFromTemplate(template).exists(r => overlapsAny(r)) + } + + def overlapsAny(read: SamRecord): Boolean = { + coverage.overlapsAny(convertSamToInterval(read)) + } + + def overlapsAny(Interval: Interval): Boolean = { + coverage.overlapsAny(Interval) + } + + def getCoverageTracks(locus: Interval): Iterator[LocusTrack] = { + coverage.getOverlaps(locus).map(lt => lt.sliceToLocus(locus)) + } + + /** + * provides the minimum value in the coverage map in the region that is both + * a region of interest, and covered by one of coverage-worthy reads in the template. + * + * returns Short.MaxValue if template doesn't overlap with requested intervals + */ + def getMinCoverage(template: Template): Short = { + template.allReads.filter(readFilterForCoverage).map(read => { + getMinCoverage(convertSamToInterval(read)) + }).minOption.getOrElse(Short.MaxValue) + } + + /** + * returns the minimum value in the region of interest and in the locus + * + * returns Short.MaxValue if template doesn't overlap with requested intervals + */ + def getMinCoverage(locus: Interval): Short = { + getCoverageTracks(locus) + .map(lt => lt.track.min) + .minOption. + getOrElse(Short.MaxValue) + } + + def convertSamToInterval(samRecord: SamRecord): Interval = { + new Interval(samRecord.refName, samRecord.start, samRecord.end) + } + + def incrementCoverage(interval: Interval): Unit = { + getCoverageTracks(interval).foreach(locusTrack => + locusTrack.track.indices + .foreach(j => locusTrack.track(j) = Math.min(Short.MaxValue, locusTrack.track(j) + 1).toShort)) + } + + /** + * Increments the coverage map over the range of the relevant reads of the template. + * Will not increment beyond Short.MaxValue to avoid overflow + */ + def incrementCoverage(template: Template): Unit = { + CoverageManager + .squish(template.allReads.filter(readFilterForCoverage).map(convertSamToInterval)) + .iterator.foreach(read => incrementCoverage(read)) + } + + /** mostly for testing, but might come in handy */ + def resetCoverage(): Unit = { + coverage.getAll.foreach(locusTrack => + locusTrack.track.indices + .foreach(j => locusTrack.track.update(j, 0.toShort))) + } +} + diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/LocusTrack.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/LocusTrack.scala new file mode 100644 index 000000000..37b9b7c30 --- /dev/null +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/LocusTrack.scala @@ -0,0 +1,56 @@ +package com.fulcrumgenomics.personal.yfarjoun + +import htsjdk.samtools.util.{Interval, Locatable} + +import scala.collection.mutable + +object LocusTrack { + val Empty = new LocusTrack(None) + + class ConnectedSlice[A](val mySeq: mutable.Seq[A], val from: Int, val to: Int) extends mutable.Seq[A] { + + assert(from >= 0) + assert(to >= from) + assert(to <= mySeq.size) + + override def update(idx: Int, elem: A): Unit = mySeq.update(idx + from, elem) + + override def apply(i: Int): A = mySeq.apply(i + from) + + override def length: Int = to - from + + override def iterator: Iterator[A] = mySeq.slice(from, to).iterator + } +} + +class LocusTrack(val locus: Option[Interval], val track: mutable.Seq[Short]) { + def this(l: Option[Interval]) = { + this(l, l match { + case None => mutable.Seq.empty + case Some(loc) => new Array[Short](loc.length()) + }) + } + + def this(l: Interval) = { + this(Some(l)) + } + + // return a slice of the track which matches a region overlapping a + // given locus + def sliceToLocus(l: Locatable): LocusTrack = { + locus match { + case None => LocusTrack.Empty + case Some(loc) if !loc.contigsMatch(l) => LocusTrack.Empty + case Some(loc) => + val start: Int = Math.max(loc.getStart, l.getStart) + val end: Int = Math.min(loc.getEnd, l.getEnd) + if (start > end) { + LocusTrack.Empty + } else { + val overlapLocus = new Interval(loc.getContig, start, end) + val overlapTrack = new LocusTrack.ConnectedSlice(track, start - loc.getStart, end + 1 - loc.getStart) + new LocusTrack(Some(overlapLocus), overlapTrack) + } + } + } +} diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala index e0a3fe047..00eddd588 100644 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala @@ -25,95 +25,31 @@ package com.fulcrumgenomics.personal.yfarjoun import com.fulcrumgenomics.FgBioDef._ -import com.fulcrumgenomics.bam.Template import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord, SamSource, SamWriter} +import com.fulcrumgenomics.bam.{Bams, Template} import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool} import com.fulcrumgenomics.commons.util.LazyLogging import com.fulcrumgenomics.fasta.SequenceDictionary import com.fulcrumgenomics.personal.yfarjoun.NormalizeCoverage._ import com.fulcrumgenomics.sopt._ +import com.fulcrumgenomics.util.Io import com.fulcrumgenomics.util.NumericTypes.PhredScore -import com.fulcrumgenomics.util.{IntervalListSource, Io} -import htsjdk.samtools.util.{Interval, Locatable, OverlapDetector} -import htsjdk.tribble.SimpleFeature +import htsjdk.samtools.SAMFileHeader +import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder} +import htsjdk.samtools.util.{Interval, IntervalList} -import java.util -import scala.collection.mutable import scala.jdk.CollectionConverters._ -import scala.util.Using object NormalizeCoverage { + /** Various default values for the Coverage Normalizer. */ + val DefaultMinMapQ: PhredScore = 30.toByte - def getIntervalsFromDictionary(dict: SequenceDictionary): util.List[Interval] = { - dict.iterator.map(seq => new Interval(seq.name, 1, seq.length)).toList.asJava - } - - def minMapQ(template: Template): PhredScore = { - template.allReads.filter(readFilterForCoverage) - .map(_.mapq) - .minOption - .getOrElse(0) - .toByte - } - - def readFilterForCoverage(record: SamRecord): Boolean = { - !record.secondary && // I'm assuming here that secondary reads to not count towards coverage, but supplementary do - record.mapped && - !record.duplicate && - record.pf - } - - def subsetReadToLocus(rs: Seq[Locatable], overlappingIntervals: Seq[Locatable]): Seq[Locatable] = { - val unionOverlap: Seq[Locatable] = squish(overlappingIntervals) - rs.flatMap(r => { - unionOverlap.filter(l => l.overlaps(r)).flatMap(l => intersect(l, r)) - }) - } - - def union(i1: Locatable, i2: Locatable): Seq[Locatable] = { - if (i1.withinDistanceOf(i2, 1)) - Seq(new SimpleFeature(i1.getContig, Math.min(i1.getStart, i2.getStart), Math.max(i1.getEnd, i2.getEnd))) - else - Seq(i1, i2) - } - - // calculates the locus covered by two loci (assuming they are on the same reference) - def union(i1: Seq[Locatable], i2: Locatable): Seq[Locatable] = { - i1.flatMap(i => union(i, i2)) - } - - def intersect(i1: Locatable, i2: Locatable): Option[Locatable] = { - if (!i1.contigsMatch(i2)) - None - else - Some(new SimpleFeature( - i1.getContig, - Math.max(i1.getStart, i2.getStart), - Math.min(i1.getEnd, i2.getEnd))) - } - - /** - * Takes an iterator of Locatables and reduces it to a non-overlapping Seq of the same. - * - * Result will not have overlapping locatables, but will cover the same original territory - */ - def squish(ls: IterableOnce[Locatable]): Seq[Locatable] = { - ls.iterator.foldLeft(Seq.empty[Locatable])(union) + def getIntervalsFromDictionary(dict: SequenceDictionary): List[Interval] = { + dict.iterator.map(seq => new Interval(seq.name, 1, seq.length)).toList } - - /** - * Gets the minimum value of two input Shorts. Used for left-folding - */ - def min(s1: Short, s2: Short): Short = { - s1 min s2 - } - - /** Various default values for the Coverage Normalizer. */ - val DefaultMinMapQ: PhredScore = 30.toByte } - @clp(description = """ Normalizes coverage of an input bam to be at a given coverage for every base in the reference/input interval list. @@ -132,116 +68,36 @@ class NormalizeCoverage( ) extends FgBioTool with LazyLogging { - val overlapDetector: OverlapDetector[Interval] = new OverlapDetector[Interval](0, 0) - - //keeps track of the coverage already added to output - val coverage: mutable.Map[String, mutable.IndexedSeq[Short]] = mutable.Map() - //sets up the overlap detector and the coverage tracker - def initialize(bam: PathToBam, intervalsMaybe: Option[PathToIntervals]): Unit = { - Using.Manager { use => - val in = use(SamSource(bam)) - val intervals: util.List[Interval] = intervalsMaybe.map(f => use(IntervalListSource(f))) - .map(i => i.toIntervalList.getIntervals) - .getOrElse(getIntervalsFromDictionary(in.dict)) - overlapDetector.addAll(intervals, intervals) - - in.dict.foreach(seq => { - coverage(seq.name) = mutable.IndexedSeq.fill(seq.length)(0.toShort) - } - ) + def createCoverageManager(bam: PathToBam, intervalsMaybe: Option[PathToIntervals]): CoverageManager = { + val intervals: IntervalList = intervalsMaybe match { + case None => + val dict = SequenceDictionary.extract(bam) + val il: IntervalList = new IntervalList(dict.toSam) + il.addall(getIntervalsFromDictionary(dict).asJava) + il + case Some(intervals) => IntervalList.fromPath(intervals) } + new CoverageManager(intervals) } - /** - * Take a Seq of Locatables and converts them to a set of non-overlapping Locatables that overlap - * the region of interset. - * - * @param rs - * @return - */ - def readToSubsettedLocus(rs: Seq[Locatable]): Seq[Locatable] = { - val squished = squish(rs) - val overlapping: Seq[Locatable] = rs.flatMap(r => overlapDetector.getOverlaps(r).asScala) - subsetReadToLocus(squished, overlapping) - } - - /** - * reduces an input Template to a Seq of locatables that are non-overlapping and cover the ovelap between - * the region of interest and the the template's coverage-worthy reads. - * - * @param template - * @return a Seq[Template] - */ - def templateToCoverageLocatable(template: Template): Seq[Locatable] = { - val locus: Seq[Locatable] = template - - //take all reads - .allReads - //make sure they are good - .filter(readFilterForCoverage) - - //replace each read with its start/end tuple - .map[Locatable](r => r.asSam).toSeq - - //Subset the read to the overlapping intervals - readToSubsettedLocus(locus) + def templateMinMapQ(t: Template): PhredScore = { + CoverageManager.coverageReadsFromTemplate(t).map(r=>r.mapq).min.toByte } - /** - * For an input template, returns True if one of its coverage-worthy reads overlap the region of interest - * - * @param template input Template - * @return True only if one of the template's coverage-worthy reads coincide with the region of interest - */ - def primariesOverlapTargets(template: Template): Boolean = { - template.allReads.filter(readFilterForCoverage).exists(r => overlapDetector.overlapsAny(r.asSam)) - } - - /** - * provides the minimum value in the coverage map in the region that is both - * a region of interest, and covered by one of coverage-worthy reads in the template. - * - * returns Short.MaxValue if template doesn't overlap with requested intervals - */ - def minCoverage(template: Template): Short = { - template.allReads.filter(readFilterForCoverage).map(read => { - val cov: IndexedSeq[Short] = coverage(read.refName).toIndexedSeq - val covLocusSet: Seq[Locatable] = templateToCoverageLocatable(template) - val covPerTemplate = covLocusSet.flatMap(covLocus => cov.slice(covLocus.getStart - 1, covLocus.getEnd).minOption) - covPerTemplate.reduce(min) - }).reduceOption(min) getOrElse Short.MaxValue - } - - /** - * Increments the coverage map over the range of the relevant reads of the template. - * Will not increment beyond Short.MaxValue to avoid overflow - */ - def incrementCoverage(template: Template): Unit = { - squish(template.allReads.filter(readFilterForCoverage).map(x => x.asSam)).foreach(read => { - val cov: mutable.IndexedSeq[Short] = coverage(read.getContig) - val covLocusSet: Seq[Locatable] = templateToCoverageLocatable(template) - - covLocusSet.foreach(covLocus => - Range(covLocus.getStart - 1, covLocus.getEnd) - .foreach(j => cov.update(j, Math.min(Short.MaxValue, cov(j) + 1).toShort))) - }) - } - - - def templatesToReads(templateIterator: Iterator[Template]): Iterator[SamRecord] = { + def filterTemplatesToCoverage(templateIterator: Iterator[Template], coverageManager: CoverageManager, minMapQ: PhredScore): Iterator[SamRecord] = { templateIterator //see if primary reads overlap region of interest - .filter(t => primariesOverlapTargets(t)) + .filter(t => coverageManager.primariesOverlapTargets(t)) //check that all the reads in the template pass minMQ - .filter(t => minMapQ(t) >= minMQ) + .filter(t => templateMinMapQ(t) >= minMapQ) //only consider reads that cover regions that need coverage, i.e. that //at least one base that they cover is less than the target coverage - .filter(t => minCoverage(t) < coverageTarget) + .filter(t => coverageManager.getMinCoverage(t) < coverageTarget) //increment coverages and emit reads from template .flatMap(t => { - incrementCoverage(t) + coverageManager.incrementCoverage(t) //output t.allReads.iterator }) @@ -250,21 +106,27 @@ class NormalizeCoverage( override def execute(): Unit = { checkArguments() - initialize(input, targetIntervals) + val coverageManager = createCoverageManager(input, targetIntervals) - Using.Manager { use => - val in = use(SamSource(input)) - val out = use(SamWriter(output, in.header.clone(), sort = Some(sortOrder), maxRecordsInRam = maxRecordsInRam)) - val templateIterator: Iterator[Template] = RandomTemplate.templateSortedIterator(in.iterator, - in.header, - maxRecordsInRam, - logger) + val in = SamSource(input) + val header:SAMFileHeader=in.header.clone() + header.setGroupOrder(GroupOrder.query) + header.setSortOrder(SortOrder.unsorted) - templatesToReads(templateIterator).foreach(out.write) - } + val out = SamWriter(output, header, sort = Some(sortOrder), maxRecordsInRam = maxRecordsInRam) + + val templateIterator: Iterator[Template] = Bams.queryGroupedRandomIterator(in.iterator, + in.header, + maxRecordsInRam) + + filterTemplatesToCoverage(templateIterator,coverageManager,minMQ).foreach(out.write) + + out.close() + in.close() } + private def checkArguments(): Unit = { Io.assertReadable(input) Io.assertCanWriteFile(output) diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/RandomTemplate.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/RandomTemplate.scala deleted file mode 100644 index f35003d3b..000000000 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/RandomTemplate.scala +++ /dev/null @@ -1,46 +0,0 @@ -package com.fulcrumgenomics.personal.yfarjoun - -import com.fulcrumgenomics.FgBioDef.DirPath -import com.fulcrumgenomics.bam.Bams.templateIterator -import com.fulcrumgenomics.bam.{RandomizeBam, Template} -import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord, SamRecordCodec} -import com.fulcrumgenomics.commons.collection.SelfClosingIterator -import com.fulcrumgenomics.commons.util.Logger -import com.fulcrumgenomics.util.Sorter -import htsjdk.samtools.SAMFileHeader -import htsjdk.samtools.util.Murmur3 - -object RandomTemplate { - - /** Return an iterator over records sorted and grouped into [[Template]] objects. Although a queryname sort is - * guaranteed, the sort order may not be consistent with other queryname sorting implementations, especially in other - * tool kits. See [[templateIterator]] for a [[Template]] iterator which emits templates in a non-guaranteed sort - * order. - * - * @see [[templateIterator]] - * - * @param iterator an iterator from which to consume records - * @param header the header associated with the records - * @param maxInMemory the maximum number of records to keep and sort in memory, if sorting is needed - * @param tmpDir a temp directory to use for temporary sorting files if sorting is needed - * @return an Iterator of queryname sorted Template objects - */ - def templateSortedIterator(iterator: Iterator[SamRecord], - header: SAMFileHeader, - maxInMemory: Int, - logger: Logger): SelfClosingIterator[Template] = { - - val sorter = RandomizeBam.Randomize(iterator, header, 42, true,logger) - val sortedIterator = sorter.iterator - - val iter = new Iterator[Template] { - override def hasNext: Boolean = sortedIterator.hasNext - override def next(): Template = { - require(hasNext, "next() called on empty iterator") - Template(sortedIterator) - } - } - - new SelfClosingIterator[Template](iter, ()=>{sorter.close()}) - } -} diff --git a/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManagerTest.scala b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManagerTest.scala new file mode 100644 index 000000000..8c9c4d4b5 --- /dev/null +++ b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManagerTest.scala @@ -0,0 +1,62 @@ +package com.fulcrumgenomics.personal.yfarjoun + +import com.fulcrumgenomics.fasta.Converters.ToSAMSequenceDictionary +import com.fulcrumgenomics.fasta.{SequenceDictionary, SequenceMetadata} +import com.fulcrumgenomics.testing.UnitSpec +import htsjdk.samtools.util.{Interval, IntervalList} + +import scala.jdk.CollectionConverters.SeqHasAsJava + +class CoverageManagerTest extends UnitSpec { + + private val dict = SequenceDictionary( + SequenceMetadata(name = "chr1", length = 10000), + SequenceMetadata(name = "chr2", length = 50000) + ).asSam + + private val intervals = new IntervalList(this.dict) + + val i1: Interval = new Interval("chr1", 100, 200) + val i2: Interval = new Interval("chr1", 100, 300) + val i3: Interval = new Interval("chr1", 300, 400) + + val i4: Interval = new Interval("chr2", 300, 400) + + val il: IntervalList = new IntervalList(dict) + il.addall(List(i1, i2, i3).asJava) + val covMan: CoverageManager = new CoverageManager(il) + + "CoverageManager" should "start its life with coverage = zero" in { + covMan.getMinCoverage(i1) shouldEqual 0 + covMan.getMinCoverage(i2) shouldEqual 0 + covMan.getMinCoverage(i3) shouldEqual 0 + } + + "CoverageManager.getMinCoverage" should "return Short.MaxValue when out of range" in { + covMan.getMinCoverage(i4) shouldEqual Short.MaxValue + } + + "CoverageManager" should "remain coverage zero when only partially covered" in { + covMan.resetCoverage() + covMan.incrementCoverage(i1) + covMan.getMinCoverage(i2) shouldEqual 0 + covMan.getMinCoverage(i3) shouldEqual 0 + covMan.getMinCoverage(i1) shouldEqual 1 + } + + "CoverageManager.getMinCoverage" should "remain Short.MaxValue when out of range" in { + covMan.getMinCoverage(i4) shouldEqual Short.MaxValue + } + + + "CoverageManager" should "get get to 2 when adding coverage twice" in { + covMan.resetCoverage() + covMan.incrementCoverage(i1) + covMan.incrementCoverage(i1) + covMan.getMinCoverage(i2) shouldEqual 0 + covMan.getMinCoverage(i3) shouldEqual 0 + covMan.getMinCoverage(i1) shouldEqual 2 + } + + +} diff --git a/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageOptionsTest.scala b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageOptionsTest.scala index d088968e8..178a85282 100644 --- a/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageOptionsTest.scala +++ b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageOptionsTest.scala @@ -13,7 +13,7 @@ trait LocatableMatchers { class SameLocusAs(right: Locatable) extends Matcher[Locatable] { - def apply(left: Locatable) = { + def apply(left: Locatable): MatchResult = { MatchResult( left.getContig.equals(right.getContig) && left.getStart.equals(right.getStart) && @@ -37,47 +37,29 @@ class NormalizeCoverageOptionsTest extends UnitSpec with LocatableMatchers { val i2: Interval = new Interval("chr1", 100, 300) val i3: Interval = new Interval("chr1", 300, 400) - var union_1_2: Locatable = NormalizeCoverage.union(i1, i2).head - var union_2_1: Locatable = NormalizeCoverage.union(i2, i1).head - var union_1_3: Locatable = NormalizeCoverage.union(i1, i3).head - var union_3_1: Locatable = NormalizeCoverage.union(i3, i1).head - var union_2_3: Locatable = NormalizeCoverage.union(i2, i3).head - var union_3_2: Locatable = NormalizeCoverage.union(i3, i2).head - "Union" should " correctly find the union of two locatables" in { - - union_1_2 should haveSameLocusAs(new Interval("chr1",100,300)) - union_1_3 should haveSameLocusAs(new Interval("chr1",100,400)) - union_2_3 should haveSameLocusAs(new Interval("chr1",100,400)) - } - - it should "be symmetric" in { - union_1_2 should haveSameLocusAs(union_2_1) - union_1_3 should haveSameLocusAs(union_3_1) - union_3_2 should haveSameLocusAs(union_2_3) - } - - var subset_1_2: Seq[Locatable] = NormalizeCoverage.subsetReadToLocus(Seq(i1), Seq(i2)) - var subset_2_1: Seq[Locatable] = NormalizeCoverage.subsetReadToLocus(Seq(i2), Seq(i1)) - var subset_1_3: Seq[Locatable] = NormalizeCoverage.subsetReadToLocus(Seq(i1), Seq(i3)) - var subset_3_1: Seq[Locatable] = NormalizeCoverage.subsetReadToLocus(Seq(i3), Seq(i1)) - var subset_2_3: Seq[Locatable] = NormalizeCoverage.subsetReadToLocus(Seq(i2), Seq(i3)) - var subset_3_2: Seq[Locatable] = NormalizeCoverage.subsetReadToLocus(Seq(i3), Seq(i2)) - - "subsetReadToLocus" should "correctly subset one locatable to another" in { - subset_1_2 should have size 1 - subset_1_2.head should haveSameLocusAs(i1) - - subset_2_1 should have size 1 - subset_2_1.head should haveSameLocusAs(i1) - - subset_1_3 shouldBe empty - subset_3_1 shouldBe empty - - subset_2_3 should have size 1 - subset_2_3.head should haveSameLocusAs(new Interval("chr1",300,300)) - - subset_3_2 should have size 1 - subset_3_2.head should haveSameLocusAs(new Interval("chr1",300,300)) - } +// +// var subset_1_2: Seq[Locatable] = CoverageManager.subsetReadToLocus(Seq(i1), Seq(i2)) +// var subset_2_1: Seq[Locatable] = CoverageManager.subsetReadToLocus(Seq(i2), Seq(i1)) +// var subset_1_3: Seq[Locatable] = CoverageManager.subsetReadToLocus(Seq(i1), Seq(i3)) +// var subset_3_1: Seq[Locatable] = CoverageManager.subsetReadToLocus(Seq(i3), Seq(i1)) +// var subset_2_3: Seq[Locatable] = CoverageManager.subsetReadToLocus(Seq(i2), Seq(i3)) +// var subset_3_2: Seq[Locatable] = CoverageManager.subsetReadToLocus(Seq(i3), Seq(i2)) +// +// "subsetReadToLocus" should "correctly subset one locatable to another" in { +// subset_1_2 should have size 1 +// subset_1_2.head should haveSameLocusAs(i1) +// +// subset_2_1 should have size 1 +// subset_2_1.head should haveSameLocusAs(i1) +// +// subset_1_3 shouldBe empty +// subset_3_1 shouldBe empty +// +// subset_2_3 should have size 1 +// subset_2_3.head should haveSameLocusAs(new Interval("chr1",300,300)) +// +// subset_3_2 should have size 1 +// subset_3_2.head should haveSameLocusAs(new Interval("chr1",300,300)) +// } } From 694ddfaeee6c8e26628212f52eee6a9e10a9a66c Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Sat, 24 Dec 2022 23:09:01 -0500 Subject: [PATCH 12/16] - unit tests are pretty complete - need CLP test and we should be good to go.... --- .../personal/yfarjoun/CoverageManager.scala | 75 ++++------ .../personal/yfarjoun/LocusTrack.scala | 6 +- .../personal/yfarjoun/NormalizeCoverage.scala | 29 +--- .../yfarjoun/CoverageManagerTest.scala | 132 ++++++++++++++++-- .../personal/yfarjoun/LocusTrackTest.scala | 68 +++++++++ 5 files changed, 228 insertions(+), 82 deletions(-) create mode 100644 src/test/scala/com/fulcrumgenomics/personal/yfarjoun/LocusTrackTest.scala diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManager.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManager.scala index 75e911e5f..5ed134064 100644 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManager.scala +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManager.scala @@ -3,7 +3,7 @@ package com.fulcrumgenomics.personal.yfarjoun import com.fulcrumgenomics.FgBioDef.javaIterableToIterator import com.fulcrumgenomics.bam.Template import com.fulcrumgenomics.bam.api.SamRecord -import com.fulcrumgenomics.personal.yfarjoun.CoverageManager.{coverageReadsFromTemplate, readFilterForCoverage} +import com.fulcrumgenomics.personal.yfarjoun.CoverageManager.{convertSamToInterval, readFilterForCoverage} import com.fulcrumgenomics.util.NumericTypes.PhredScore import htsjdk.samtools.util.{Interval, IntervalList, OverlapDetector} @@ -11,19 +11,17 @@ import scala.jdk.CollectionConverters.{IteratorHasAsJava, IteratorHasAsScala} object CoverageManager { - def minMapQ(template: Template): PhredScore = { - template.allReads.filter(readFilterForCoverage) - .map(_.mapq) - .minOption - .getOrElse(0) - .toByte - } - def readFilterForCoverage(record: SamRecord): Boolean = { + def readFilterForCoverage(record: SamRecord, minMapQ: PhredScore): Boolean = { !record.secondary && // I'm assuming here that secondary reads to not count towards coverage, but supplementary do record.mapped && !record.duplicate && - record.pf + record.pf && + record.mapq >= minMapQ + } + + def convertSamToInterval(samRecord: SamRecord): Interval = { + new Interval(samRecord.refName, samRecord.start, samRecord.end) } /** @@ -32,19 +30,9 @@ object CoverageManager { * Result will not have overlapping Intervals, but will cover the same original territory */ def squish(ls: IterableOnce[Interval]): IterableOnce[Interval] = { - new IntervalList.IntervalMergerIterator(ls.iterator.asJava,true,false,false) + new IntervalList.IntervalMergerIterator(ls.iterator.asJava, true, false, false) }.asScala - /** - * Gets the minimum value of two input Shorts. Used for left-folding - */ - def min(s1: Short, s2: Short): Short = { - s1 min s2 - } - - def coverageReadsFromTemplate(t: Template): Iterator[SamRecord] = { - t.allReads.filter(readFilterForCoverage) - } } /** @@ -57,24 +45,6 @@ class CoverageManager(val intervals: IntervalList) { IntervalList.getUniqueIntervals(intervals, false) .foreach(i => coverage.addLhs(new LocusTrack(i), i)) - /** - * For an input template, returns True if one of its coverage-worthy reads overlap the region of interest - * - * @param template input Template - * @return True only if one of the template's coverage-worthy reads coincide with the region of interest - */ - def primariesOverlapTargets(template: Template): Boolean = { - coverageReadsFromTemplate(template).exists(r => overlapsAny(r)) - } - - def overlapsAny(read: SamRecord): Boolean = { - coverage.overlapsAny(convertSamToInterval(read)) - } - - def overlapsAny(Interval: Interval): Boolean = { - coverage.overlapsAny(Interval) - } - def getCoverageTracks(locus: Interval): Iterator[LocusTrack] = { coverage.getOverlaps(locus).map(lt => lt.sliceToLocus(locus)) } @@ -85,8 +55,8 @@ class CoverageManager(val intervals: IntervalList) { * * returns Short.MaxValue if template doesn't overlap with requested intervals */ - def getMinCoverage(template: Template): Short = { - template.allReads.filter(readFilterForCoverage).map(read => { + def getMinCoverage(template: Template, minMapQ: PhredScore): Short = { + template.allReads.filter(readFilterForCoverage(_, minMapQ)).map(read => { getMinCoverage(convertSamToInterval(read)) }).minOption.getOrElse(Short.MaxValue) } @@ -103,10 +73,6 @@ class CoverageManager(val intervals: IntervalList) { getOrElse(Short.MaxValue) } - def convertSamToInterval(samRecord: SamRecord): Interval = { - new Interval(samRecord.refName, samRecord.start, samRecord.end) - } - def incrementCoverage(interval: Interval): Unit = { getCoverageTracks(interval).foreach(locusTrack => locusTrack.track.indices @@ -117,9 +83,9 @@ class CoverageManager(val intervals: IntervalList) { * Increments the coverage map over the range of the relevant reads of the template. * Will not increment beyond Short.MaxValue to avoid overflow */ - def incrementCoverage(template: Template): Unit = { + def incrementCoverage(template: Template, minMapQ: PhredScore): Unit = { CoverageManager - .squish(template.allReads.filter(readFilterForCoverage).map(convertSamToInterval)) + .squish(template.allReads.filter(readFilterForCoverage(_, minMapQ)).map(convertSamToInterval)) .iterator.foreach(read => incrementCoverage(read)) } @@ -129,5 +95,20 @@ class CoverageManager(val intervals: IntervalList) { locusTrack.track.indices .foreach(j => locusTrack.track.update(j, 0.toShort))) } + + def processTemplates(templateIterator: Iterator[Template], minMapQ: PhredScore, coverageTarget: Short): Iterator[SamRecord] = { + templateIterator + // only consider templates that contain reads which cover regions + // that need coverage, i.e. that at least one base that they cover is less + // than the target coverage + .filter(t => getMinCoverage(t, minMapQ) < coverageTarget) + //increment coverages and emit reads from template + .flatMap(t => { + incrementCoverage(t, minMapQ) + //output + t.allReads.iterator + }) + } + } diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/LocusTrack.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/LocusTrack.scala index 37b9b7c30..7d7985eaf 100644 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/LocusTrack.scala +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/LocusTrack.scala @@ -39,8 +39,10 @@ class LocusTrack(val locus: Option[Interval], val track: mutable.Seq[Short]) { // given locus def sliceToLocus(l: Locatable): LocusTrack = { locus match { - case None => LocusTrack.Empty - case Some(loc) if !loc.contigsMatch(l) => LocusTrack.Empty + case None => + LocusTrack.Empty + case Some(loc) if !loc.contigsMatch(l) => + LocusTrack.Empty case Some(loc) => val start: Int = Math.max(loc.getStart, l.getStart) val end: Int = Math.min(loc.getEnd, l.getEnd) diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala index 00eddd588..ea57b6dac 100644 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala @@ -22,6 +22,7 @@ * THE SOFTWARE. * */ + package com.fulcrumgenomics.personal.yfarjoun import com.fulcrumgenomics.FgBioDef._ @@ -82,35 +83,13 @@ class NormalizeCoverage( } - def templateMinMapQ(t: Template): PhredScore = { - CoverageManager.coverageReadsFromTemplate(t).map(r=>r.mapq).min.toByte - } - - def filterTemplatesToCoverage(templateIterator: Iterator[Template], coverageManager: CoverageManager, minMapQ: PhredScore): Iterator[SamRecord] = { - templateIterator - //see if primary reads overlap region of interest - .filter(t => coverageManager.primariesOverlapTargets(t)) - //check that all the reads in the template pass minMQ - .filter(t => templateMinMapQ(t) >= minMapQ) - //only consider reads that cover regions that need coverage, i.e. that - //at least one base that they cover is less than the target coverage - .filter(t => coverageManager.getMinCoverage(t) < coverageTarget) - //increment coverages and emit reads from template - .flatMap(t => { - coverageManager.incrementCoverage(t) - //output - t.allReads.iterator - }) - } - override def execute(): Unit = { checkArguments() val coverageManager = createCoverageManager(input, targetIntervals) - val in = SamSource(input) - val header:SAMFileHeader=in.header.clone() + val header: SAMFileHeader = in.header.clone() header.setGroupOrder(GroupOrder.query) header.setSortOrder(SortOrder.unsorted) @@ -120,7 +99,9 @@ class NormalizeCoverage( in.header, maxRecordsInRam) - filterTemplatesToCoverage(templateIterator,coverageManager,minMQ).foreach(out.write) + coverageManager + .processTemplates(templateIterator, minMQ, coverageTarget) + .foreach(out.write) out.close() in.close() diff --git a/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManagerTest.scala b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManagerTest.scala index 8c9c4d4b5..0971c90b0 100644 --- a/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManagerTest.scala +++ b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManagerTest.scala @@ -1,8 +1,11 @@ package com.fulcrumgenomics.personal.yfarjoun +import com.fulcrumgenomics.bam.api.SamOrder +import com.fulcrumgenomics.bam.{Bams, Template} import com.fulcrumgenomics.fasta.Converters.ToSAMSequenceDictionary import com.fulcrumgenomics.fasta.{SequenceDictionary, SequenceMetadata} -import com.fulcrumgenomics.testing.UnitSpec +import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} +import com.fulcrumgenomics.util.NumericTypes.PhredScore import htsjdk.samtools.util.{Interval, IntervalList} import scala.jdk.CollectionConverters.SeqHasAsJava @@ -13,8 +16,8 @@ class CoverageManagerTest extends UnitSpec { SequenceMetadata(name = "chr1", length = 10000), SequenceMetadata(name = "chr2", length = 50000) ).asSam - - private val intervals = new IntervalList(this.dict) + val chr1Index: Int = dict.getSequenceIndex("chr1") + val chr2Index: Int = dict.getSequenceIndex("chr2") val i1: Interval = new Interval("chr1", 100, 200) val i2: Interval = new Interval("chr1", 100, 300) @@ -26,17 +29,17 @@ class CoverageManagerTest extends UnitSpec { il.addall(List(i1, i2, i3).asJava) val covMan: CoverageManager = new CoverageManager(il) - "CoverageManager" should "start its life with coverage = zero" in { + "CoverageManager.getMinCoverage" should "start its life with coverage = zero" in { covMan.getMinCoverage(i1) shouldEqual 0 covMan.getMinCoverage(i2) shouldEqual 0 covMan.getMinCoverage(i3) shouldEqual 0 } - "CoverageManager.getMinCoverage" should "return Short.MaxValue when out of range" in { + it should "return Short.MaxValue when out of range" in { covMan.getMinCoverage(i4) shouldEqual Short.MaxValue } - "CoverageManager" should "remain coverage zero when only partially covered" in { + it should "remain coverage zero when only partially covered" in { covMan.resetCoverage() covMan.incrementCoverage(i1) covMan.getMinCoverage(i2) shouldEqual 0 @@ -44,12 +47,11 @@ class CoverageManagerTest extends UnitSpec { covMan.getMinCoverage(i1) shouldEqual 1 } - "CoverageManager.getMinCoverage" should "remain Short.MaxValue when out of range" in { + it should "remain Short.MaxValue when out of range" in { covMan.getMinCoverage(i4) shouldEqual Short.MaxValue } - - "CoverageManager" should "get get to 2 when adding coverage twice" in { + it should "get to 2 when adding coverage twice" in { covMan.resetCoverage() covMan.incrementCoverage(i1) covMan.incrementCoverage(i1) @@ -58,5 +60,117 @@ class CoverageManagerTest extends UnitSpec { covMan.getMinCoverage(i1) shouldEqual 2 } + it should "increment its coverage by 1 correctly from a template with overlapping reads" in { + val builder = new SamBuilder(sort = Some(SamOrder.Queryname)) + builder.header.setSequenceDictionary(dict) + builder.addPair(name = "p1", contig = chr1Index, start1 = 100, start2 = 150) + val minMapQ: PhredScore = 30.toByte + val templates: Seq[Template] = Bams.templateIterator(builder.toSource).toSeq + + covMan.resetCoverage() + templates.foreach(t => covMan.incrementCoverage(t, minMapQ)) + + covMan.getMinCoverage(new Interval("chr1", 150, 200)) shouldEqual 1 + covMan.getMinCoverage(template = templates.head, minMapQ) shouldEqual 1 + } + + it should "Correctly add reads from an easy template" in { + val builder = new SamBuilder(sort = Some(SamOrder.Queryname)) + builder.header.setSequenceDictionary(dict) + builder.addPair(name = "p1", contig = chr1Index, start1 = 100, start2 = 150) + val minMapQ: PhredScore = 30.toByte + val templates: Seq[Template] = Bams.templateIterator(builder.toSource).toSeq + + covMan.resetCoverage() + + // here we should *not* add anything to the coverage, since the coverage target is zero + covMan.processTemplates(templates.iterator, minMapQ, 0).isEmpty shouldBe true + covMan.getMinCoverage(new Interval("chr1", 150, 200)) shouldEqual 0 + covMan.getMinCoverage(template = templates.head, minMapQ) shouldEqual 0 + + // here we *should* add to the coverage, since the coverage target is one + covMan.processTemplates(templates.iterator, minMapQ, 1).isEmpty shouldBe false + covMan.getMinCoverage(new Interval("chr1", 150, 200)) shouldEqual 1 + covMan.getMinCoverage(template = templates.head, minMapQ) shouldEqual 1 + } + + it should "Correctly only consider coverage from the reads of high mapq" in { + val builder = new SamBuilder(sort = Some(SamOrder.Queryname)) + builder.header.setSequenceDictionary(dict) + builder.addPair(name = "p1", contig = chr1Index, start1 = 100, start2 = 150, mapq1 = PhredScore.MinValue,mapq2 = PhredScore.MaxValue) + val minMapQ: PhredScore = 30.toByte + val templates: Seq[Template] = Bams.templateIterator(builder.toSource).toSeq + + covMan.resetCoverage() + + // here we should *not* add anything to the coverage, since the coverage target is zero + covMan.processTemplates(templates.iterator, minMapQ, 0).isEmpty shouldBe true + covMan.getMinCoverage(new Interval("chr1", 150, 200)) shouldEqual 0 + covMan.getMinCoverage(template = templates.head, minMapQ) shouldEqual 0 + + // here we *should* add to the coverage, since the coverage target is one + covMan.processTemplates(templates.iterator, minMapQ, 1).isEmpty shouldBe false + covMan.getMinCoverage(new Interval("chr1", 100, 149)) shouldEqual 0 + covMan.getMinCoverage(new Interval("chr1", 150, 249)) shouldEqual 1 + covMan.getMinCoverage(new Interval("chr1", 0, 250)) shouldEqual 0 + covMan.getMinCoverage(template = templates.head, minMapQ) shouldEqual 1 + + } + + it should "Correctly not add reads that should be filtered " in { + val builder = new SamBuilder(sort = Some(SamOrder.Queryname)) + builder.header.setSequenceDictionary(dict) + builder.addPair(name = "low_mapq", contig = chr1Index, start1 = 100, start2 = 150, mapq1 = 10, mapq2 = 10) + + builder.addPair(name = "secondary", contig = chr1Index, start1 = 100, start2 = 150) + .iterator.foreach(r => r.secondary = true) + + builder.addPair(name = "duplicates", contig = chr1Index, start1 = 100, start2 = 150) + .iterator.foreach(r => r.duplicate = true) + + builder.addPair(name = "non_pf", contig = chr1Index, start1 = 100, start2 = 150) + .iterator.foreach(r => r.pf = false) + + builder.addPair(name = "non-overlapping1", contig = chr1Index, start1 = 400, start2 = 550) + .iterator.foreach(r => r.pf = false) + + builder.addPair(name = "non-overlapping2", contig = chr2Index, start1 = 100, start2 = 150) + .iterator.foreach(r => r.pf = false) + + val minMapQ: PhredScore = 30.toByte + val templates: Seq[Template] = Bams.templateIterator(builder.toSource).toSeq + + covMan.resetCoverage() + + // here we *should not* add to the coverage, since non of the reads pass filteres + covMan.processTemplates(templates.iterator, minMapQ, 1).isEmpty shouldBe true + covMan.getMinCoverage(new Interval("chr1", 150, 200)) shouldEqual 0.toShort + // since there's no region of interest as all the reads get filtered out... + covMan.getMinCoverage(template = templates.head, minMapQ) shouldEqual Short.MaxValue + } + + + it should "Correctly not add reads that should be filtered but include good read from same template" in { + val builder = new SamBuilder(sort = Some(SamOrder.Queryname)) + builder.header.setSequenceDictionary(dict) + + builder.addPair(name = "secondary", contig = chr1Index, start1 = 100, start2 = 150, mapq1 = 40, mapq2 = 40) + + builder.addPair(name = "secondary", contig = chr2Index, start1 = 100, start2 = 150) + .iterator.foreach(r => r.secondary = true) + + val minMapQ: PhredScore = 30.toByte + val templates: Seq[Template] = Bams.templateIterator(builder.toSource).toSeq + + covMan.resetCoverage() + + // here we *should not* add to the coverage, since non of the reads pass filters + val reads = covMan.processTemplates(templates.iterator, minMapQ, 1) + + reads.isEmpty shouldBe false + reads.size shouldBe 4 + covMan.getMinCoverage(new Interval("chr1", 150, 200)) shouldEqual 1.toShort + covMan.getMinCoverage(template = templates.head, minMapQ) shouldEqual 1 + } } diff --git a/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/LocusTrackTest.scala b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/LocusTrackTest.scala new file mode 100644 index 000000000..cb77a2128 --- /dev/null +++ b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/LocusTrackTest.scala @@ -0,0 +1,68 @@ +package com.fulcrumgenomics.personal.yfarjoun + +import com.fulcrumgenomics.personal.yfarjoun.LocusTrack.Empty +import com.fulcrumgenomics.testing.UnitSpec +import htsjdk.samtools.util.Interval + +class LocusTrackTest extends UnitSpec { + val i1: Interval = new Interval("chr1", 100, 200) + val i2: Interval = new Interval("chr1", 150, 300) + val i3: Interval = new Interval("chr1", 300, 400) + + val i4: Interval = new Interval("chr2", 300, 400) + + val lt1 = new LocusTrack(i1) + val lt2 = new LocusTrack(i2) + val lt3 = new LocusTrack(i3) + val lt4 = new LocusTrack(i4) + val ltEmpty = new LocusTrack(None) + + "LocusTrack" should "Start off with a seq[Short] of the correct length" in { + lt1.track should have length i1.length() + lt2.track should have length i2.length() + lt3.track should have length i3.length() + lt4.track should have length i4.length() + } + + it should "start off containing only zeros" in { + lt1.track.count(_ == 0) shouldEqual i1.length() + lt2.track.count(_ == 0) shouldEqual i2.length() + lt3.track.count(_ == 0) shouldEqual i3.length() + lt4.track.count(_ == 0) shouldEqual i4.length() + } + + it should "return the emtpy locusTrack when its empty" in{ + ltEmpty.sliceToLocus(i1) shouldBe Empty + lt3.sliceToLocus(i1) shouldBe Empty + lt4.sliceToLocus(i1) shouldBe Empty + } + + it should "get incremented when incrementing a slice" in { + val lt2_slice_i1 = lt2.sliceToLocus(i1) + // this increments the overlap of i1 and i2, so chr1:50-100 + lt2_slice_i1.track.forall(_==0) shouldBe true + lt2_slice_i1.track.indices.foreach(i => lt2_slice_i1.track(i) = (lt2_slice_i1.track(i) + 1).toShort) + lt2_slice_i1.track.forall(_==1) shouldBe true + + val map: Map[Short, Int] = lt2.track + .indices + .groupBy(i => lt2.track(i)) + .map { case (k: Short, v: Seq[Short]) => (k, v.length) } + // 150->200 should be incremented, and 201->300 not. + map shouldEqual Map(0 -> 100, 1 -> 51) + } + + + it should "not fall over when there's no overlap" in { + val lt3_slice_i1 = lt3.sliceToLocus(i1) + // this increments the overlap of i1 and i2, so chr1:50-100 + lt3_slice_i1 shouldBe LocusTrack.Empty + lt3_slice_i1.track.indices + .foreach(i => lt3_slice_i1.track(i) = (lt3_slice_i1.track(i) + 1).toShort) + lt3_slice_i1.track.forall(_==1) shouldBe true + + val map: Map[Short, Int] = lt3.track.indices.groupBy(i => lt3.track(i)).map { case (k: Short, v: Seq[Short]) => (k, v.length) } + // 150->200 should be incremented, and 201->300 not. + map shouldEqual Map(0 -> 101) + } +} From 5f8bdd6e0d2d6c59ef12851311ce0d2fb659e2ee Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Thu, 29 Dec 2022 01:33:27 -0500 Subject: [PATCH 13/16] - unit tests are pretty through - still need to test the CLP directly. --- .../scala/com/fulcrumgenomics/bam/Bams.scala | 18 ++-- .../personal/yfarjoun/CoverageManager.scala | 87 ++++++++++++++----- .../personal/yfarjoun/LocusTrack.scala | 36 +++++++- .../personal/yfarjoun/NormalizeCoverage.scala | 72 +++++++++------ .../com/fulcrumgenomics/bam/BamsTest.scala | 61 +++++++++++++ .../yfarjoun/CoverageManagerTest.scala | 80 +++++++++-------- 6 files changed, 252 insertions(+), 102 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/bam/Bams.scala b/src/main/scala/com/fulcrumgenomics/bam/Bams.scala index 339933a63..ab7caba2c 100644 --- a/src/main/scala/com/fulcrumgenomics/bam/Bams.scala +++ b/src/main/scala/com/fulcrumgenomics/bam/Bams.scala @@ -377,21 +377,17 @@ object Bams extends LazyLogging { * @param tmpDir a temp directory to use for temporary sorting files if sorting is needed * @return an Iterator of queryname sorted Template objects */ - def queryGroupedRandomIterator(iterator: Iterator[SamRecord], - header: SAMFileHeader, - maxInMemory: Int, - tmpDir:DirPath=Io.tmpDir): SelfClosingIterator[Template] = { + def templateRandomIterator(iterator: Iterator[SamRecord], + header: SAMFileHeader, + maxInMemory: Int, + tmpDir:DirPath=Io.tmpDir): SelfClosingIterator[Template] = { - val randomSeed=42 - val sorter = RandomizeBam.Randomize(iterator, header, randomSeed, maxInMemory, queryGroup = true, tmpDir) - val sortedIterator = sorter.iterator + val randomSeed = 42 + val sortedIterator = RandomizeBam.Randomize(iterator, header, randomSeed, maxInMemory, queryGroup = true, tmpDir).iterator - val sortedHead = header.clone() - sortedHead.setGroupOrder(GroupOrder.query) - sortedHead.setSortOrder(SortOrder.unsorted) + val sortedHead = SamOrder.RandomQuery.applyTo(header.clone()) Bams.templateIterator(sortedIterator, sortedHead, maxInMemory, tmpDir) - } /** Returns an iterator over the records in the given iterator such that the order of the records returned is diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManager.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManager.scala index 5ed134064..dbe33d51b 100644 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManager.scala +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManager.scala @@ -12,40 +12,67 @@ import scala.jdk.CollectionConverters.{IteratorHasAsJava, IteratorHasAsScala} object CoverageManager { + /** + * a Filter for reads determining if they count towards coverage. + * + * This filter examins the reads flags and mapping quality. It will pass (i.e. return true) + * primary and supplementary (non-secondary) non-duplicate reads that are mapped (with mapping quality >= input minMapQ,) + * and pass PF. + * + * @param record Input record to filter + * @param minMapQ minimum mapping quality for passing reads + * @return true if record is mapped with mapQ>=minMapQ, non-duplicate, pf, and non-secondary. + */ def readFilterForCoverage(record: SamRecord, minMapQ: PhredScore): Boolean = { - !record.secondary && // I'm assuming here that secondary reads to not count towards coverage, but supplementary do + !record.secondary && record.mapped && !record.duplicate && record.pf && record.mapq >= minMapQ } + /** + * Method that takes in a SamRecord and returns an Interval with the same position. + * will return an empty Interval on contig "" if read is unmapped + * + * @param samRecord SamRecord to convert to an Interval + * @return Interval covering the same position as the alignment of the read + */ def convertSamToInterval(samRecord: SamRecord): Interval = { - new Interval(samRecord.refName, samRecord.start, samRecord.end) + if (samRecord.mapped) { + new Interval(samRecord.refName, samRecord.start, samRecord.end) + } else { + new Interval("", 1, 0) + } } /** - * Takes an iterator of Intervals and reduces it to a non-overlapping Seq of the same. + * Takes an iterator of Intervals and reduces it to a *non-overlapping* Seq of the same. * - * Result will not have overlapping Intervals, but will cover the same original territory + * Result will cover the same original territory, and are guarranteed to not have overlapping Intervals. */ def squish(ls: IterableOnce[Interval]): IterableOnce[Interval] = { new IntervalList.IntervalMergerIterator(ls.iterator.asJava, true, false, false) }.asScala - } /** + * Class for managing coverage over an interval list. + * + * Holds an IntervalList of interest, a target coverage value and a minimum mapping quality for reads to be considered + * as providing coverage. * */ -class CoverageManager(val intervals: IntervalList) { +class CoverageManager(val intervals: IntervalList, val minMapQ:PhredScore, val coverageTarget:Int) { private val coverage: OverlapDetector[LocusTrack] = new OverlapDetector[LocusTrack](0, 0) IntervalList.getUniqueIntervals(intervals, false) .foreach(i => coverage.addLhs(new LocusTrack(i), i)) - def getCoverageTracks(locus: Interval): Iterator[LocusTrack] = { + + + private def getCoverageTracks(locus: Interval): Iterator[LocusTrack] = { coverage.getOverlaps(locus).map(lt => lt.sliceToLocus(locus)) } @@ -55,16 +82,17 @@ class CoverageManager(val intervals: IntervalList) { * * returns Short.MaxValue if template doesn't overlap with requested intervals */ - def getMinCoverage(template: Template, minMapQ: PhredScore): Short = { + private[yfarjoun] def getMinTemplateCoverage(template: Template): Short = { template.allReads.filter(readFilterForCoverage(_, minMapQ)).map(read => { getMinCoverage(convertSamToInterval(read)) }).minOption.getOrElse(Short.MaxValue) } /** - * returns the minimum value in the region of interest and in the locus + * returns the minimum value in the region of interest and in the locus provided * - * returns Short.MaxValue if template doesn't overlap with requested intervals + * returns Short.MaxValue if template doesn't overlap with requested intervals (so that it will + * register as "not needed" to hit coverage no matter what the target is. */ def getMinCoverage(locus: Interval): Short = { getCoverageTracks(locus) @@ -73,7 +101,8 @@ class CoverageManager(val intervals: IntervalList) { getOrElse(Short.MaxValue) } - def incrementCoverage(interval: Interval): Unit = { + + private[yfarjoun] def incrementCoverage(interval: Interval): Unit = { getCoverageTracks(interval).foreach(locusTrack => locusTrack.track.indices .foreach(j => locusTrack.track(j) = Math.min(Short.MaxValue, locusTrack.track(j) + 1).toShort)) @@ -83,32 +112,44 @@ class CoverageManager(val intervals: IntervalList) { * Increments the coverage map over the range of the relevant reads of the template. * Will not increment beyond Short.MaxValue to avoid overflow */ - def incrementCoverage(template: Template, minMapQ: PhredScore): Unit = { + private[yfarjoun] def incrementCoverage(template: Template): Unit = { CoverageManager .squish(template.allReads.filter(readFilterForCoverage(_, minMapQ)).map(convertSamToInterval)) .iterator.foreach(read => incrementCoverage(read)) } - /** mostly for testing, but might come in handy */ - def resetCoverage(): Unit = { + /** Mostly for testing */ + private[yfarjoun] def resetCoverage(): Unit = { coverage.getAll.foreach(locusTrack => locusTrack.track.indices .foreach(j => locusTrack.track.update(j, 0.toShort))) } - def processTemplates(templateIterator: Iterator[Template], minMapQ: PhredScore, coverageTarget: Short): Iterator[SamRecord] = { + /** + * Filters, an Iterator[Template] while recording the resulting coverage + * **with side-effects** as follows: + * + * 1. Only templates that have reads that count towards coverage, AND are needed in order to hit the target coverage + * pass the filters. + * 2. coverage over the overlapping region is incremented (only by 1 if overlapping a read which counts towards coverage) + * 3. templates are returned in their entirety, or not at all. no partial Templates are returned. + * 4. Any template which is returned contains at least one record whose coverage was used towards the target. + * 5. returns a Seq rather than an Iterator so that the side-effects will happen for sure. + * + * @param templateIterator input Template Iterator + * @return filtered and processed templates. + */ + def processTemplates(templateIterator: Iterator[Template]): Seq[Template] = { templateIterator - // only consider templates that contain reads which cover regions + // Only consider templates that contain reads which cover regions // that need coverage, i.e. that at least one base that they cover is less // than the target coverage - .filter(t => getMinCoverage(t, minMapQ) < coverageTarget) + .filter(t => getMinTemplateCoverage(t) < coverageTarget) //increment coverages and emit reads from template - .flatMap(t => { - incrementCoverage(t, minMapQ) - //output - t.allReads.iterator - }) + .map(t => { + incrementCoverage(t) + t + }).toSeq } - } diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/LocusTrack.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/LocusTrack.scala index 7d7985eaf..e81a33812 100644 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/LocusTrack.scala +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/LocusTrack.scala @@ -1,24 +1,52 @@ package com.fulcrumgenomics.personal.yfarjoun import htsjdk.samtools.util.{Interval, Locatable} - import scala.collection.mutable object LocusTrack { val Empty = new LocusTrack(None) + + /** + * A small class which implements a "connected slice", i.e. a slice of a mutable.Seq which, when modified + * simply modifies its "parent" Seq. Unlike Seq.slice and Array.slice, copies of data are not made. + * @param mySeq original Seq[A] of which this is a slice + * @param from index from which slice starts (0-based, inclusive.) Must be >=0 + * @param to index to which slice continues (0-based, exclusive). Must be >= from + * @tparam A Type of Seq + */ class ConnectedSlice[A](val mySeq: mutable.Seq[A], val from: Int, val to: Int) extends mutable.Seq[A] { assert(from >= 0) assert(to >= from) assert(to <= mySeq.size) - override def update(idx: Int, elem: A): Unit = mySeq.update(idx + from, elem) + /** + * Updates element idx in slice to value elem. Will (only) update appropriate value in origSeq + * @param idx index of element in the slice to update + * @param elem new value of element + */ + @throws[IndexOutOfBoundsException] + override def update(idx: Int, elem: A): Unit = { + if (idx < 0 || idx >= length) { + throw new IndexOutOfBoundsException(idx) + } + mySeq.update(idx + from, elem) + } - override def apply(i: Int): A = mySeq.apply(i + from) + /** Get the element at the specified index. */ + @throws[IndexOutOfBoundsException] + override def apply(i: Int): A = { + if (i < 0 || i >= length) { + throw new IndexOutOfBoundsException(i) + } + mySeq.apply(i + from) + } - override def length: Int = to - from + override val length: Int = to - from + /** Provides an iterator over the elements between from and to in mySeq. + * */ override def iterator: Iterator[A] = mySeq.slice(from, to).iterator } } diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala index ea57b6dac..27a8b918e 100644 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala @@ -26,8 +26,8 @@ package com.fulcrumgenomics.personal.yfarjoun import com.fulcrumgenomics.FgBioDef._ -import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord, SamSource, SamWriter} -import com.fulcrumgenomics.bam.{Bams, Template} +import com.fulcrumgenomics.bam.Bams +import com.fulcrumgenomics.bam.api.{SamOrder, SamSource, SamWriter} import com.fulcrumgenomics.cmdline.{ClpGroups, FgBioTool} import com.fulcrumgenomics.commons.util.LazyLogging import com.fulcrumgenomics.fasta.SequenceDictionary @@ -35,9 +35,7 @@ import com.fulcrumgenomics.personal.yfarjoun.NormalizeCoverage._ import com.fulcrumgenomics.sopt._ import com.fulcrumgenomics.util.Io import com.fulcrumgenomics.util.NumericTypes.PhredScore -import htsjdk.samtools.SAMFileHeader -import htsjdk.samtools.SAMFileHeader.{GroupOrder, SortOrder} -import htsjdk.samtools.util.{Interval, IntervalList} +import htsjdk.samtools.util.{Interval, IntervalList, SequenceUtil} import scala.jdk.CollectionConverters._ @@ -52,17 +50,29 @@ object NormalizeCoverage { } @clp(description = - """ -Normalizes coverage of an input bam to be at a given coverage for every base in the reference/input interval list. - """, +""" + |Normalizes coverage of an input bam to be at a given coverage for every base in the reference/input interval list. + | + |Only non-secondary, non-duplicate, mapped (with mapping quality >= mq argument), pf reads are considered. A set of reads + |from the same queryname will increment the depth by 1 if it covers the requested region. Templates (the complete collection + |of reads with the same queryname) will be emitted to the output file if they were found to be needed to provide coverage + |for at least one base in the region of interest (or the entire reference specified in the input bam, if no such interval + |is provided.) + | + |The order at which the templates are considered is random (based on the query name) but deterministic. + | + |Reads are assumed to provide coverage from the "alignment start" to the "alignment end" positions. No special + |consideration is provided for deletions or gaps in the alignment. +""", group = ClpGroups.SamOrBam) class NormalizeCoverage( @arg(flag = 'i', doc = "The input SAM or BAM file.") val input: PathToBam, - @arg(flag = 'o', doc = "Output BAM file to write.") val output: PathToBam, - @arg(flag = 'l', doc = "The interval list over which to normalize coverage. " + - "If provided, templates whose primary reads do not overlap with this region will be dropped entirely.") val targetIntervals: Option[PathToIntervals] = None, - @arg(name = "mq", doc = "MAPQ lower bound for considering read-template.") val minMQ: PhredScore = DefaultMinMapQ, - @arg(flag = 'c', doc = "Target coverage.") val coverageTarget: Short, + @arg(flag = 'o', doc = "Output BAM file to write the templates that are needed to bring " + + "the coverage to the target value.") val output: PathToBam, + @arg(flag = 'l', doc = "The interval list over which to normalize coverage. If not provided program will " + + "assume that coverage over entire reference is required.") val targetIntervals: Option[PathToIntervals] = None, + @arg(name = "mq", doc = "Mapping Quality lower bound for considering a read.") val minMQ: PhredScore = DefaultMinMapQ, + @arg(flag = 'c', doc = "Target coverage. ") val coverageTarget: Short, @arg(flag = 's', doc = "Order in which to emit the records.") val sortOrder: SamOrder = SamOrder.Coordinate, @arg(flag = 'm', doc = "Max records in RAM.") val maxRecordsInRam: Int = SamWriter.DefaultMaxRecordsInRam @@ -70,39 +80,47 @@ class NormalizeCoverage( //sets up the overlap detector and the coverage tracker - def createCoverageManager(bam: PathToBam, intervalsMaybe: Option[PathToIntervals]): CoverageManager = { - val intervals: IntervalList = intervalsMaybe match { + private def createCoverageManager(bam: PathToBam, intervalsMaybe: Option[PathToIntervals]): CoverageManager = { + val dict = SequenceDictionary.extract(bam) + val intervals = intervalsMaybe match { + // if no intervals are provided, make one from the embedded dictionary case None => - val dict = SequenceDictionary.extract(bam) - val il: IntervalList = new IntervalList(dict.toSam) + val il = new IntervalList(dict.toSam) il.addall(getIntervalsFromDictionary(dict).asJava) il - case Some(intervals) => IntervalList.fromPath(intervals) + case Some(intervals) => + val il = IntervalList.fromPath(intervals) + // make sure that input il and bam have compatible dictionaries. + SequenceUtil.assertSequenceDictionariesEqual(dict.toSam, il.getHeader.getSequenceDictionary,true) + il } - new CoverageManager(intervals) + new CoverageManager(intervals, minMapQ = minMQ, coverageTarget = coverageTarget) } - override def execute(): Unit = { checkArguments() val coverageManager = createCoverageManager(input, targetIntervals) val in = SamSource(input) - val header: SAMFileHeader = in.header.clone() - header.setGroupOrder(GroupOrder.query) - header.setSortOrder(SortOrder.unsorted) - val out = SamWriter(output, header, sort = Some(sortOrder), maxRecordsInRam = maxRecordsInRam) + // create an output writer with the correct output order + val out = SamWriter(output, + in.header.clone(), + sort = Some(sortOrder), + maxRecordsInRam = maxRecordsInRam) - val templateIterator: Iterator[Template] = Bams.queryGroupedRandomIterator(in.iterator, + // create a random-ordered template iterator + val templateIterator = Bams.templateRandomIterator(in.iterator, in.header, maxRecordsInRam) + // find the templates required for coverage and add them to the output writer coverageManager - .processTemplates(templateIterator, minMQ, coverageTarget) - .foreach(out.write) + .processTemplates(templateIterator) + .foreach(t => t.allReads.foreach(out.write)) + // close streams out.close() in.close() } diff --git a/src/test/scala/com/fulcrumgenomics/bam/BamsTest.scala b/src/test/scala/com/fulcrumgenomics/bam/BamsTest.scala index b7c300097..40bdb4d29 100644 --- a/src/test/scala/com/fulcrumgenomics/bam/BamsTest.scala +++ b/src/test/scala/com/fulcrumgenomics/bam/BamsTest.scala @@ -218,6 +218,67 @@ class BamsTest extends UnitSpec { } } + "Bams.templateRandomIterator" should "return template objects in a random order (by queryname)" in { + val builder = new SamBuilder(sort=Some(SamOrder.Coordinate)) + builder.addPair(name="p1", start1=100, start2=300) + builder.addFrag(name="f1", start=100) + builder.addPair(name="p2", start1=500, start2=200) + builder.addPair(name="p0", start1=700, start2=999) + + val templates = Bams.templateRandomIterator(builder.iterator, builder.header,10).toSeq + templates should have size 4 + templates.map(_.name) shouldBe Seq("f1", "p0", "p1", "p2") + + templates.foreach {t => + (t.r1Supplementals ++ t.r1Secondaries ++ t.r2Supplementals ++ t.r2Secondaries).isEmpty shouldBe true + if (t.name startsWith "f") { + t.r1.exists(r => !r.paired) shouldBe true + t.r2.isEmpty shouldBe true + } + else { + t.r1.exists(r => r.firstOfPair) shouldBe true + t.r2.exists(r => r.secondOfPair) shouldBe true + } + } + } + + it should "only return templates in a random sorted order" in { + val builder1 = new SamBuilder(sort = Some(SamOrder.Unknown)) + builder1.addPair(name = "A9") + builder1.addPair(name = "A88") + val iterator1 = Bams.templateRandomIterator(builder1.iterator, builder1.header, maxInMemory = 10, Io.tmpDir) + iterator1.toList.map(_.name) should contain theSameElementsInOrderAs Seq("A88", "A9") + + val builder2 = new SamBuilder(sort = Some(SamOrder.Unknown)) + builder2.addPair(name = "A88") + builder2.addPair(name = "A9") + val iterator2 = Bams.templateRandomIterator(builder2.iterator, builder2.header, maxInMemory = 10, Io.tmpDir) + iterator2.toList.map(_.name) should contain theSameElementsInOrderAs Seq("A88", "A9") + } + + it should "handle reads with secondary and supplementary records" in { + val builder = new SamBuilder(sort=Some(SamOrder.Coordinate)) + Range.inclusive(1, 5).foreach { i=> + builder.addPair(s"p$i", start1=i*100, start2=i*100 + 500) + builder.addPair(s"p$i", start1=i*1000, start2=i*1000 + 500).foreach(_.secondary = true) + builder.addPair(s"p$i", start1=i*10, start2=i*10 + 500).foreach(_.supplementary = true) + } + + val templates = Bams.templateRandomIterator(builder.iterator,builder.header,10).toSeq + templates.map(_.name) shouldBe Seq("p1", "p2", "p3", "p4", "p5") + templates.foreach { t => + t.r1.exists(r => r.firstOfPair && !r.secondary & !r.supplementary) shouldBe true + t.r2.exists(r => r.secondOfPair && !r.secondary & !r.supplementary) shouldBe true + Seq(t.r1Secondaries, t.r2Secondaries, t.r1Supplementals, t.r2Supplementals).foreach(rs => rs.size shouldBe 1) + + t.r1Secondaries.foreach (r => (r.firstOfPair && r.secondary) shouldBe true) + t.r2Secondaries.foreach (r => (r.secondOfPair && r.secondary) shouldBe true) + t.r1Supplementals.foreach(r => (r.firstOfPair && r.supplementary) shouldBe true) + t.r2Supplementals.foreach(r => (r.secondOfPair && r.supplementary) shouldBe true) + } + } + + "Bams.regenerateNmUqMdTags" should "null out fields on unmapped reads" in { val builder = new SamBuilder(sort=Some(SamOrder.Coordinate)) val rec = builder.addFrag(unmapped=true).get diff --git a/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManagerTest.scala b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManagerTest.scala index 0971c90b0..e3813ae04 100644 --- a/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManagerTest.scala +++ b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManagerTest.scala @@ -27,7 +27,8 @@ class CoverageManagerTest extends UnitSpec { val il: IntervalList = new IntervalList(dict) il.addall(List(i1, i2, i3).asJava) - val covMan: CoverageManager = new CoverageManager(il) + val covMan: CoverageManager = new CoverageManager(il, minMapQ = 30.toByte, 10) + val covManZero: CoverageManager = new CoverageManager(il, minMapQ = 10.toByte, 0) "CoverageManager.getMinCoverage" should "start its life with coverage = zero" in { covMan.getMinCoverage(i1) shouldEqual 0 @@ -64,56 +65,62 @@ class CoverageManagerTest extends UnitSpec { val builder = new SamBuilder(sort = Some(SamOrder.Queryname)) builder.header.setSequenceDictionary(dict) builder.addPair(name = "p1", contig = chr1Index, start1 = 100, start2 = 150) - val minMapQ: PhredScore = 30.toByte val templates: Seq[Template] = Bams.templateIterator(builder.toSource).toSeq covMan.resetCoverage() - templates.foreach(t => covMan.incrementCoverage(t, minMapQ)) + templates.foreach(t => covMan.incrementCoverage(t)) covMan.getMinCoverage(new Interval("chr1", 150, 200)) shouldEqual 1 - covMan.getMinCoverage(template = templates.head, minMapQ) shouldEqual 1 + covMan.getMinTemplateCoverage(template = templates.head) shouldEqual 1 } it should "Correctly add reads from an easy template" in { val builder = new SamBuilder(sort = Some(SamOrder.Queryname)) builder.header.setSequenceDictionary(dict) builder.addPair(name = "p1", contig = chr1Index, start1 = 100, start2 = 150) - val minMapQ: PhredScore = 30.toByte val templates: Seq[Template] = Bams.templateIterator(builder.toSource).toSeq - covMan.resetCoverage() - // here we should *not* add anything to the coverage, since the coverage target is zero - covMan.processTemplates(templates.iterator, minMapQ, 0).isEmpty shouldBe true - covMan.getMinCoverage(new Interval("chr1", 150, 200)) shouldEqual 0 - covMan.getMinCoverage(template = templates.head, minMapQ) shouldEqual 0 + covManZero.resetCoverage() - // here we *should* add to the coverage, since the coverage target is one - covMan.processTemplates(templates.iterator, minMapQ, 1).isEmpty shouldBe false + covManZero.processTemplates(templates.iterator).isEmpty shouldBe true + covManZero.getMinTemplateCoverage(template = templates.head) shouldEqual 0 + covManZero.getMinCoverage(new Interval("chr1", 150, 200)) shouldEqual 0 + + // here we *should* add to the coverage, since the coverage target is 10 + covMan.resetCoverage() + + covMan.processTemplates(templates.iterator).isEmpty shouldBe false covMan.getMinCoverage(new Interval("chr1", 150, 200)) shouldEqual 1 - covMan.getMinCoverage(template = templates.head, minMapQ) shouldEqual 1 + covMan.getMinTemplateCoverage(template = templates.head) shouldEqual 1 } - it should "Correctly only consider coverage from the reads of high mapq" in { + it should "Correctly only consider coverage from the reads of high mapq (target zero)" in { val builder = new SamBuilder(sort = Some(SamOrder.Queryname)) builder.header.setSequenceDictionary(dict) - builder.addPair(name = "p1", contig = chr1Index, start1 = 100, start2 = 150, mapq1 = PhredScore.MinValue,mapq2 = PhredScore.MaxValue) - val minMapQ: PhredScore = 30.toByte - val templates: Seq[Template] = Bams.templateIterator(builder.toSource).toSeq + builder.addPair(name = "p1", contig = chr1Index, start1 = 100, start2 = 150, mapq1 = PhredScore.MinValue, mapq2 = PhredScore.MaxValue) - covMan.resetCoverage() + // Here we should *not* add anything to the coverage, since the coverage target is zero. + covManZero.resetCoverage() + val templates: Seq[Template] = Bams.templateIterator(builder.toSource).toSeq + covManZero.processTemplates(templates.iterator).isEmpty shouldBe true + covManZero.getMinCoverage(new Interval("chr1", 150, 200)) shouldEqual 0 + covManZero.getMinTemplateCoverage(template = templates.head) shouldEqual 0 + } - // here we should *not* add anything to the coverage, since the coverage target is zero - covMan.processTemplates(templates.iterator, minMapQ, 0).isEmpty shouldBe true - covMan.getMinCoverage(new Interval("chr1", 150, 200)) shouldEqual 0 - covMan.getMinCoverage(template = templates.head, minMapQ) shouldEqual 0 + it should "Correctly only consider coverage from the reads of high mapq (target 10)" in { + val builder = new SamBuilder(sort = Some(SamOrder.Queryname)) + builder.header.setSequenceDictionary(dict) + builder.addPair(name = "p1", contig = chr1Index, start1 = 100, start2 = 150, mapq1 = PhredScore.MinValue, mapq2 = PhredScore.MaxValue) - // here we *should* add to the coverage, since the coverage target is one - covMan.processTemplates(templates.iterator, minMapQ, 1).isEmpty shouldBe false + // Here we *should* add to the coverage, since the coverage target is 10. + covMan.resetCoverage() + val templates: Seq[Template] = Bams.templateIterator(builder.toSource).toSeq + covMan.processTemplates(templates.iterator).isEmpty shouldBe false covMan.getMinCoverage(new Interval("chr1", 100, 149)) shouldEqual 0 - covMan.getMinCoverage(new Interval("chr1", 150, 249)) shouldEqual 1 + covMan.getMinCoverage(new Interval("chr1", 150, 150)) shouldEqual 1 + covMan.getMinTemplateCoverage(template = templates.head) shouldEqual 1 covMan.getMinCoverage(new Interval("chr1", 0, 250)) shouldEqual 0 - covMan.getMinCoverage(template = templates.head, minMapQ) shouldEqual 1 } @@ -138,16 +145,15 @@ class CoverageManagerTest extends UnitSpec { builder.addPair(name = "non-overlapping2", contig = chr2Index, start1 = 100, start2 = 150) .iterator.foreach(r => r.pf = false) - val minMapQ: PhredScore = 30.toByte val templates: Seq[Template] = Bams.templateIterator(builder.toSource).toSeq covMan.resetCoverage() // here we *should not* add to the coverage, since non of the reads pass filteres - covMan.processTemplates(templates.iterator, minMapQ, 1).isEmpty shouldBe true + covMan.processTemplates(templates.iterator).isEmpty shouldBe true covMan.getMinCoverage(new Interval("chr1", 150, 200)) shouldEqual 0.toShort // since there's no region of interest as all the reads get filtered out... - covMan.getMinCoverage(template = templates.head, minMapQ) shouldEqual Short.MaxValue + covMan.getMinTemplateCoverage(template = templates.head) shouldEqual Short.MaxValue } @@ -155,22 +161,22 @@ class CoverageManagerTest extends UnitSpec { val builder = new SamBuilder(sort = Some(SamOrder.Queryname)) builder.header.setSequenceDictionary(dict) - builder.addPair(name = "secondary", contig = chr1Index, start1 = 100, start2 = 150, mapq1 = 40, mapq2 = 40) + builder.addPair(name = "secondary", contig = chr1Index, start1 = 300, start2 = 400) builder.addPair(name = "secondary", contig = chr2Index, start1 = 100, start2 = 150) .iterator.foreach(r => r.secondary = true) - val minMapQ: PhredScore = 30.toByte val templates: Seq[Template] = Bams.templateIterator(builder.toSource).toSeq covMan.resetCoverage() - // here we *should not* add to the coverage, since non of the reads pass filters - val reads = covMan.processTemplates(templates.iterator, minMapQ, 1) + // here we *should not* add to the coverage, since none of the reads pass filters + val filteredTemplates = covMan.processTemplates(templates.iterator) - reads.isEmpty shouldBe false - reads.size shouldBe 4 - covMan.getMinCoverage(new Interval("chr1", 150, 200)) shouldEqual 1.toShort - covMan.getMinCoverage(template = templates.head, minMapQ) shouldEqual 1 + filteredTemplates.isEmpty shouldBe false + filteredTemplates.size shouldBe 1 + covMan.getMinCoverage(new Interval("chr1", 100, 200)) shouldEqual 0 + covMan.getMinCoverage(new Interval("chr1", 300, 400)) shouldEqual 1 + covMan.getMinTemplateCoverage(template = templates.head) shouldEqual 1 } } From c58668f5d34b9c3c7a5484f5fa89d84b9a070962 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Thu, 29 Dec 2022 11:34:58 -0500 Subject: [PATCH 14/16] - improved performance by not calculating the minimum over the region. --- .../personal/yfarjoun/CoverageManager.scala | 81 +++++++++++-------- .../personal/yfarjoun/NormalizeCoverage.scala | 21 ++--- 2 files changed, 53 insertions(+), 49 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManager.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManager.scala index dbe33d51b..0865da3d1 100644 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManager.scala +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManager.scala @@ -1,3 +1,28 @@ +/* + * The MIT License + * + * Copyright (c) 2022 Fulcrum Genomics + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ + package com.fulcrumgenomics.personal.yfarjoun import com.fulcrumgenomics.FgBioDef.javaIterableToIterator @@ -5,13 +30,12 @@ import com.fulcrumgenomics.bam.Template import com.fulcrumgenomics.bam.api.SamRecord import com.fulcrumgenomics.personal.yfarjoun.CoverageManager.{convertSamToInterval, readFilterForCoverage} import com.fulcrumgenomics.util.NumericTypes.PhredScore -import htsjdk.samtools.util.{Interval, IntervalList, OverlapDetector} +import htsjdk.samtools.util.{Interval, IntervalList, Locatable, OverlapDetector} import scala.jdk.CollectionConverters.{IteratorHasAsJava, IteratorHasAsScala} object CoverageManager { - /** * a Filter for reads determining if they count towards coverage. * @@ -33,23 +57,20 @@ object CoverageManager { /** * Method that takes in a SamRecord and returns an Interval with the same position. - * will return an empty Interval on contig "" if read is unmapped + * requires that the read is mapped * * @param samRecord SamRecord to convert to an Interval * @return Interval covering the same position as the alignment of the read */ def convertSamToInterval(samRecord: SamRecord): Interval = { - if (samRecord.mapped) { - new Interval(samRecord.refName, samRecord.start, samRecord.end) - } else { - new Interval("", 1, 0) - } + assert(samRecord.mapped) + new Interval(samRecord.refName, samRecord.start, samRecord.end) } /** * Takes an iterator of Intervals and reduces it to a *non-overlapping* Seq of the same. * - * Result will cover the same original territory, and are guarranteed to not have overlapping Intervals. + * Result will cover the same original territory, and are guaranteed to not have overlapping Intervals. */ def squish(ls: IterableOnce[Interval]): IterableOnce[Interval] = { new IntervalList.IntervalMergerIterator(ls.iterator.asJava, true, false, false) @@ -63,45 +84,35 @@ object CoverageManager { * as providing coverage. * */ -class CoverageManager(val intervals: IntervalList, val minMapQ:PhredScore, val coverageTarget:Int) { +class CoverageManager(val intervals: IndexedSeq[Interval], val minMapQ: PhredScore, val coverageTarget: Int) { private val coverage: OverlapDetector[LocusTrack] = new OverlapDetector[LocusTrack](0, 0) - IntervalList.getUniqueIntervals(intervals, false) - .foreach(i => coverage.addLhs(new LocusTrack(i), i)) - + intervals.foreach(i => coverage.addLhs(new LocusTrack(i), i)) - - private def getCoverageTracks(locus: Interval): Iterator[LocusTrack] = { + private def getCoverageTracks(locus: Locatable): Iterator[LocusTrack] = { coverage.getOverlaps(locus).map(lt => lt.sliceToLocus(locus)) } /** - * provides the minimum value in the coverage map in the region that is both - * a region of interest, and covered by one of coverage-worthy reads in the template. - * - * returns Short.MaxValue if template doesn't overlap with requested intervals + * Determines if a Template contains a read that provides coverage over a region that needs it. */ - private[yfarjoun] def getMinTemplateCoverage(template: Template): Short = { - template.allReads.filter(readFilterForCoverage(_, minMapQ)).map(read => { - getMinCoverage(convertSamToInterval(read)) - }).minOption.getOrElse(Short.MaxValue) + private[yfarjoun] def needsCoverage(template: Template): Boolean = { + template.allReads.filter(readFilterForCoverage(_, minMapQ)) + .exists(r => needsCoverage(r.asSam)) } /** - * returns the minimum value in the region of interest and in the locus provided - * - * returns Short.MaxValue if template doesn't overlap with requested intervals (so that it will - * register as "not needed" to hit coverage no matter what the target is. + * Determines if a locus contains overlaps a region that needs coverage. */ - def getMinCoverage(locus: Interval): Short = { - getCoverageTracks(locus) - .map(lt => lt.track.min) - .minOption. - getOrElse(Short.MaxValue) + def needsCoverage(locus: Locatable): Boolean = { + getCoverageTracks(locus).exists(lt => lt.track.exists(_ < coverageTarget)) } - + /** + * Increments the coverage map over an interval. + * Will not increment beyond Short.MaxValue to avoid overflow + */ private[yfarjoun] def incrementCoverage(interval: Interval): Unit = { getCoverageTracks(interval).foreach(locusTrack => locusTrack.track.indices @@ -118,7 +129,7 @@ class CoverageManager(val intervals: IntervalList, val minMapQ:PhredScore, val c .iterator.foreach(read => incrementCoverage(read)) } - /** Mostly for testing */ + /** for testing */ private[yfarjoun] def resetCoverage(): Unit = { coverage.getAll.foreach(locusTrack => locusTrack.track.indices @@ -144,7 +155,7 @@ class CoverageManager(val intervals: IntervalList, val minMapQ:PhredScore, val c // Only consider templates that contain reads which cover regions // that need coverage, i.e. that at least one base that they cover is less // than the target coverage - .filter(t => getMinTemplateCoverage(t) < coverageTarget) + .filter(t => needsCoverage(t)) //increment coverages and emit reads from template .map(t => { incrementCoverage(t) diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala index 27a8b918e..e643a2fc5 100644 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala @@ -37,6 +37,7 @@ import com.fulcrumgenomics.util.Io import com.fulcrumgenomics.util.NumericTypes.PhredScore import htsjdk.samtools.util.{Interval, IntervalList, SequenceUtil} +import java.util import scala.jdk.CollectionConverters._ object NormalizeCoverage { @@ -82,17 +83,10 @@ class NormalizeCoverage( //sets up the overlap detector and the coverage tracker private def createCoverageManager(bam: PathToBam, intervalsMaybe: Option[PathToIntervals]): CoverageManager = { val dict = SequenceDictionary.extract(bam) - val intervals = intervalsMaybe match { + val intervals:IndexedSeq[Interval] = intervalsMaybe match { // if no intervals are provided, make one from the embedded dictionary - case None => - val il = new IntervalList(dict.toSam) - il.addall(getIntervalsFromDictionary(dict).asJava) - il - case Some(intervals) => - val il = IntervalList.fromPath(intervals) - // make sure that input il and bam have compatible dictionaries. - SequenceUtil.assertSequenceDictionariesEqual(dict.toSam, il.getHeader.getSequenceDictionary,true) - il + case None =>getIntervalsFromDictionary(dict).toIndexedSeq + case Some(intervals) => IntervalList.fromPath(intervals).uniqued().getIntervals.toIndexedSeq } new CoverageManager(intervals, minMapQ = minMQ, coverageTarget = coverageTarget) } @@ -101,7 +95,6 @@ class NormalizeCoverage( checkArguments() val coverageManager = createCoverageManager(input, targetIntervals) - val in = SamSource(input) // create an output writer with the correct output order @@ -118,11 +111,11 @@ class NormalizeCoverage( // find the templates required for coverage and add them to the output writer coverageManager .processTemplates(templateIterator) - .foreach(t => t.allReads.foreach(out.write)) + .foreach(t => out++=t.allReads) // close streams - out.close() - in.close() + out.safelyClose() + in.safelyClose() } From 204f76b04eaaeb1f03ef0da56c19205afdef2568 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Thu, 29 Dec 2022 21:36:48 -0500 Subject: [PATCH 15/16] - removed wordy functions fixed tests --- .../personal/yfarjoun/CoverageManager.scala | 51 +++------- .../personal/yfarjoun/LocusTrack.scala | 37 +++++-- .../personal/yfarjoun/NormalizeCoverage.scala | 10 +- .../yfarjoun/CoverageManagerTest.scala | 99 +++++++++---------- 4 files changed, 91 insertions(+), 106 deletions(-) diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManager.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManager.scala index 0865da3d1..acb84442b 100644 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManager.scala +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManager.scala @@ -28,8 +28,9 @@ package com.fulcrumgenomics.personal.yfarjoun import com.fulcrumgenomics.FgBioDef.javaIterableToIterator import com.fulcrumgenomics.bam.Template import com.fulcrumgenomics.bam.api.SamRecord -import com.fulcrumgenomics.personal.yfarjoun.CoverageManager.{convertSamToInterval, readFilterForCoverage} +import com.fulcrumgenomics.personal.yfarjoun.CoverageManager.readFilterForCoverage import com.fulcrumgenomics.util.NumericTypes.PhredScore +import htsjdk.samtools.SAMRecord import htsjdk.samtools.util.{Interval, IntervalList, Locatable, OverlapDetector} import scala.jdk.CollectionConverters.{IteratorHasAsJava, IteratorHasAsScala} @@ -55,18 +56,6 @@ object CoverageManager { record.mapq >= minMapQ } - /** - * Method that takes in a SamRecord and returns an Interval with the same position. - * requires that the read is mapped - * - * @param samRecord SamRecord to convert to an Interval - * @return Interval covering the same position as the alignment of the read - */ - def convertSamToInterval(samRecord: SamRecord): Interval = { - assert(samRecord.mapped) - new Interval(samRecord.refName, samRecord.start, samRecord.end) - } - /** * Takes an iterator of Intervals and reduces it to a *non-overlapping* Seq of the same. * @@ -84,24 +73,16 @@ object CoverageManager { * as providing coverage. * */ -class CoverageManager(val intervals: IndexedSeq[Interval], val minMapQ: PhredScore, val coverageTarget: Int) { +class CoverageManager(val intervals: IntervalList, val minMapQ: PhredScore, val coverageTarget: Int) { private val coverage: OverlapDetector[LocusTrack] = new OverlapDetector[LocusTrack](0, 0) - intervals.foreach(i => coverage.addLhs(new LocusTrack(i), i)) + intervals.uniqued().foreach(i => coverage.addLhs(new LocusTrack(i), i)) private def getCoverageTracks(locus: Locatable): Iterator[LocusTrack] = { coverage.getOverlaps(locus).map(lt => lt.sliceToLocus(locus)) } - /** - * Determines if a Template contains a read that provides coverage over a region that needs it. - */ - private[yfarjoun] def needsCoverage(template: Template): Boolean = { - template.allReads.filter(readFilterForCoverage(_, minMapQ)) - .exists(r => needsCoverage(r.asSam)) - } - /** * Determines if a locus contains overlaps a region that needs coverage. */ @@ -113,22 +94,12 @@ class CoverageManager(val intervals: IndexedSeq[Interval], val minMapQ: PhredSco * Increments the coverage map over an interval. * Will not increment beyond Short.MaxValue to avoid overflow */ - private[yfarjoun] def incrementCoverage(interval: Interval): Unit = { - getCoverageTracks(interval).foreach(locusTrack => + private[yfarjoun] def incrementCoverage(locatable: Locatable): Unit = { + getCoverageTracks(locatable).foreach(locusTrack => locusTrack.track.indices .foreach(j => locusTrack.track(j) = Math.min(Short.MaxValue, locusTrack.track(j) + 1).toShort)) } - /** - * Increments the coverage map over the range of the relevant reads of the template. - * Will not increment beyond Short.MaxValue to avoid overflow - */ - private[yfarjoun] def incrementCoverage(template: Template): Unit = { - CoverageManager - .squish(template.allReads.filter(readFilterForCoverage(_, minMapQ)).map(convertSamToInterval)) - .iterator.foreach(read => incrementCoverage(read)) - } - /** for testing */ private[yfarjoun] def resetCoverage(): Unit = { coverage.getAll.foreach(locusTrack => @@ -155,11 +126,11 @@ class CoverageManager(val intervals: IndexedSeq[Interval], val minMapQ: PhredSco // Only consider templates that contain reads which cover regions // that need coverage, i.e. that at least one base that they cover is less // than the target coverage - .filter(t => needsCoverage(t)) - //increment coverages and emit reads from template - .map(t => { - incrementCoverage(t) - t + .filter(t => { + val coverageReads: Seq[SAMRecord] = t.allReads.filter(readFilterForCoverage(_, minMapQ)).map(_.asSam).toSeq + val templateNeeded = coverageReads.exists(needsCoverage(_)) + if (templateNeeded) coverageReads.foreach(incrementCoverage(_)) + templateNeeded }).toSeq } } diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/LocusTrack.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/LocusTrack.scala index e81a33812..bbf8955b2 100644 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/LocusTrack.scala +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/LocusTrack.scala @@ -1,3 +1,27 @@ +/* + * The MIT License + * + * Copyright (c) 2022 Fulcrum Genomics + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + */ package com.fulcrumgenomics.personal.yfarjoun import htsjdk.samtools.util.{Interval, Locatable} @@ -6,7 +30,6 @@ import scala.collection.mutable object LocusTrack { val Empty = new LocusTrack(None) - /** * A small class which implements a "connected slice", i.e. a slice of a mutable.Seq which, when modified * simply modifies its "parent" Seq. Unlike Seq.slice and Array.slice, copies of data are not made. @@ -16,7 +39,6 @@ object LocusTrack { * @tparam A Type of Seq */ class ConnectedSlice[A](val mySeq: mutable.Seq[A], val from: Int, val to: Int) extends mutable.Seq[A] { - assert(from >= 0) assert(to >= from) assert(to <= mySeq.size) @@ -63,14 +85,13 @@ class LocusTrack(val locus: Option[Interval], val track: mutable.Seq[Short]) { this(Some(l)) } - // return a slice of the track which matches a region overlapping a - // given locus + /** + * return a slice of the track which matches a region overlapping a given locus + */ def sliceToLocus(l: Locatable): LocusTrack = { locus match { - case None => - LocusTrack.Empty - case Some(loc) if !loc.contigsMatch(l) => - LocusTrack.Empty + case None => LocusTrack.Empty + case Some(loc) if !loc.contigsMatch(l) => LocusTrack.Empty case Some(loc) => val start: Int = Math.max(loc.getStart, l.getStart) val end: Int = Math.min(loc.getEnd, l.getEnd) diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala index e643a2fc5..5cf006630 100644 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala @@ -44,7 +44,6 @@ object NormalizeCoverage { /** Various default values for the Coverage Normalizer. */ val DefaultMinMapQ: PhredScore = 30.toByte - def getIntervalsFromDictionary(dict: SequenceDictionary): List[Interval] = { dict.iterator.map(seq => new Interval(seq.name, 1, seq.length)).toList } @@ -83,10 +82,12 @@ class NormalizeCoverage( //sets up the overlap detector and the coverage tracker private def createCoverageManager(bam: PathToBam, intervalsMaybe: Option[PathToIntervals]): CoverageManager = { val dict = SequenceDictionary.extract(bam) - val intervals:IndexedSeq[Interval] = intervalsMaybe match { + val intervals:IntervalList = intervalsMaybe match { // if no intervals are provided, make one from the embedded dictionary - case None =>getIntervalsFromDictionary(dict).toIndexedSeq - case Some(intervals) => IntervalList.fromPath(intervals).uniqued().getIntervals.toIndexedSeq + case None => val il=new IntervalList(dict.toSam) + il.addall(getIntervalsFromDictionary(dict).asJava) + il + case Some(intervalsListFile) => IntervalList.fromPath(intervalsListFile) } new CoverageManager(intervals, minMapQ = minMQ, coverageTarget = coverageTarget) } @@ -118,7 +119,6 @@ class NormalizeCoverage( in.safelyClose() } - private def checkArguments(): Unit = { Io.assertReadable(input) Io.assertCanWriteFile(output) diff --git a/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManagerTest.scala b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManagerTest.scala index e3813ae04..9d305b0d5 100644 --- a/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManagerTest.scala +++ b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManagerTest.scala @@ -1,5 +1,6 @@ package com.fulcrumgenomics.personal.yfarjoun +import com.fulcrumgenomics.FgBioDef._ import com.fulcrumgenomics.bam.api.SamOrder import com.fulcrumgenomics.bam.{Bams, Template} import com.fulcrumgenomics.fasta.Converters.ToSAMSequenceDictionary @@ -27,38 +28,41 @@ class CoverageManagerTest extends UnitSpec { val il: IntervalList = new IntervalList(dict) il.addall(List(i1, i2, i3).asJava) - val covMan: CoverageManager = new CoverageManager(il, minMapQ = 30.toByte, 10) + val covManTwo: CoverageManager = new CoverageManager(il, minMapQ = 30.toByte, 2) + val covManOne: CoverageManager = new CoverageManager(il, minMapQ = 30.toByte, 1) val covManZero: CoverageManager = new CoverageManager(il, minMapQ = 10.toByte, 0) - "CoverageManager.getMinCoverage" should "start its life with coverage = zero" in { - covMan.getMinCoverage(i1) shouldEqual 0 - covMan.getMinCoverage(i2) shouldEqual 0 - covMan.getMinCoverage(i3) shouldEqual 0 + "CoverageManager.needsCoverage" should "start being true" in { + covManOne.needsCoverage(i1) shouldBe true + covManOne.needsCoverage(i2) shouldBe true + covManOne.needsCoverage(i3) shouldBe true } - it should "return Short.MaxValue when out of range" in { - covMan.getMinCoverage(i4) shouldEqual Short.MaxValue + it should "return false when out of range" in { + covManOne.needsCoverage(i4) shouldBe false } - it should "remain coverage zero when only partially covered" in { - covMan.resetCoverage() - covMan.incrementCoverage(i1) - covMan.getMinCoverage(i2) shouldEqual 0 - covMan.getMinCoverage(i3) shouldEqual 0 - covMan.getMinCoverage(i1) shouldEqual 1 + it should "remain true when partially covered" in { + covManOne.resetCoverage() + covManOne.incrementCoverage(i1) + covManOne.needsCoverage(i2) shouldBe true + covManOne.needsCoverage(i3) shouldBe true + covManOne.needsCoverage(i1) shouldBe false } - it should "remain Short.MaxValue when out of range" in { - covMan.getMinCoverage(i4) shouldEqual Short.MaxValue + it should "remain false when out of range" in { + covManOne.resetCoverage() + covManOne.incrementCoverage(i1) + covManOne.needsCoverage(i4) shouldBe false } it should "get to 2 when adding coverage twice" in { - covMan.resetCoverage() - covMan.incrementCoverage(i1) - covMan.incrementCoverage(i1) - covMan.getMinCoverage(i2) shouldEqual 0 - covMan.getMinCoverage(i3) shouldEqual 0 - covMan.getMinCoverage(i1) shouldEqual 2 + covManTwo.resetCoverage() + covManOne.incrementCoverage(i1) + covManOne.incrementCoverage(i1) + covManOne.needsCoverage(i2) shouldBe true + covManOne.needsCoverage(i3) shouldBe true + covManOne.needsCoverage(i1) shouldBe false } it should "increment its coverage by 1 correctly from a template with overlapping reads" in { @@ -67,11 +71,10 @@ class CoverageManagerTest extends UnitSpec { builder.addPair(name = "p1", contig = chr1Index, start1 = 100, start2 = 150) val templates: Seq[Template] = Bams.templateIterator(builder.toSource).toSeq - covMan.resetCoverage() - templates.foreach(t => covMan.incrementCoverage(t)) + covManTwo.resetCoverage() + covManTwo.processTemplates(templates.iterator).isEmpty shouldBe false - covMan.getMinCoverage(new Interval("chr1", 150, 200)) shouldEqual 1 - covMan.getMinTemplateCoverage(template = templates.head) shouldEqual 1 + covManTwo.needsCoverage(new Interval("chr1", 150, 200)) shouldBe true } it should "Correctly add reads from an easy template" in { @@ -84,15 +87,13 @@ class CoverageManagerTest extends UnitSpec { covManZero.resetCoverage() covManZero.processTemplates(templates.iterator).isEmpty shouldBe true - covManZero.getMinTemplateCoverage(template = templates.head) shouldEqual 0 - covManZero.getMinCoverage(new Interval("chr1", 150, 200)) shouldEqual 0 + covManZero.needsCoverage(new Interval("chr1", 150, 200)) shouldBe false // here we *should* add to the coverage, since the coverage target is 10 - covMan.resetCoverage() + covManOne.resetCoverage() - covMan.processTemplates(templates.iterator).isEmpty shouldBe false - covMan.getMinCoverage(new Interval("chr1", 150, 200)) shouldEqual 1 - covMan.getMinTemplateCoverage(template = templates.head) shouldEqual 1 + covManOne.processTemplates(templates.iterator).isEmpty shouldBe false + covManOne.needsCoverage(new Interval("chr1", 150, 200)) shouldEqual false } it should "Correctly only consider coverage from the reads of high mapq (target zero)" in { @@ -104,8 +105,7 @@ class CoverageManagerTest extends UnitSpec { covManZero.resetCoverage() val templates: Seq[Template] = Bams.templateIterator(builder.toSource).toSeq covManZero.processTemplates(templates.iterator).isEmpty shouldBe true - covManZero.getMinCoverage(new Interval("chr1", 150, 200)) shouldEqual 0 - covManZero.getMinTemplateCoverage(template = templates.head) shouldEqual 0 + covManZero.needsCoverage(new Interval("chr1", 150, 200)) shouldBe false } it should "Correctly only consider coverage from the reads of high mapq (target 10)" in { @@ -114,14 +114,12 @@ class CoverageManagerTest extends UnitSpec { builder.addPair(name = "p1", contig = chr1Index, start1 = 100, start2 = 150, mapq1 = PhredScore.MinValue, mapq2 = PhredScore.MaxValue) // Here we *should* add to the coverage, since the coverage target is 10. - covMan.resetCoverage() + covManOne.resetCoverage() val templates: Seq[Template] = Bams.templateIterator(builder.toSource).toSeq - covMan.processTemplates(templates.iterator).isEmpty shouldBe false - covMan.getMinCoverage(new Interval("chr1", 100, 149)) shouldEqual 0 - covMan.getMinCoverage(new Interval("chr1", 150, 150)) shouldEqual 1 - covMan.getMinTemplateCoverage(template = templates.head) shouldEqual 1 - covMan.getMinCoverage(new Interval("chr1", 0, 250)) shouldEqual 0 - + covManOne.processTemplates(templates.iterator).isEmpty shouldBe false + covManOne.needsCoverage(new Interval("chr1", 100, 149)) shouldBe true + covManOne.needsCoverage(new Interval("chr1", 150, 150)) shouldBe false + covManOne.needsCoverage(new Interval("chr1", 0, 250)) shouldBe true } it should "Correctly not add reads that should be filtered " in { @@ -147,36 +145,31 @@ class CoverageManagerTest extends UnitSpec { val templates: Seq[Template] = Bams.templateIterator(builder.toSource).toSeq - covMan.resetCoverage() + covManOne.resetCoverage() - // here we *should not* add to the coverage, since non of the reads pass filteres - covMan.processTemplates(templates.iterator).isEmpty shouldBe true - covMan.getMinCoverage(new Interval("chr1", 150, 200)) shouldEqual 0.toShort - // since there's no region of interest as all the reads get filtered out... - covMan.getMinTemplateCoverage(template = templates.head) shouldEqual Short.MaxValue + // here we *should not* add to the coverage, since non of the reads pass filters + covManOne.processTemplates(templates.iterator).isEmpty shouldBe true + covManOne.needsCoverage(new Interval("chr1", 150, 200)) shouldBe true } - it should "Correctly not add reads that should be filtered but include good read from same template" in { val builder = new SamBuilder(sort = Some(SamOrder.Queryname)) builder.header.setSequenceDictionary(dict) builder.addPair(name = "secondary", contig = chr1Index, start1 = 300, start2 = 400) - builder.addPair(name = "secondary", contig = chr2Index, start1 = 100, start2 = 150) .iterator.foreach(r => r.secondary = true) val templates: Seq[Template] = Bams.templateIterator(builder.toSource).toSeq - covMan.resetCoverage() + covManOne.resetCoverage() // here we *should not* add to the coverage, since none of the reads pass filters - val filteredTemplates = covMan.processTemplates(templates.iterator) + val filteredTemplates = covManOne.processTemplates(templates.iterator) filteredTemplates.isEmpty shouldBe false filteredTemplates.size shouldBe 1 - covMan.getMinCoverage(new Interval("chr1", 100, 200)) shouldEqual 0 - covMan.getMinCoverage(new Interval("chr1", 300, 400)) shouldEqual 1 - covMan.getMinTemplateCoverage(template = templates.head) shouldEqual 1 + covManOne.needsCoverage(new Interval("chr1", 100, 200)) shouldBe true + covManOne.needsCoverage(new Interval("chr1", 300, 400)) shouldBe false } } From 7679e93abac8a37e9f14fb89b0be6365dd2d8526 Mon Sep 17 00:00:00 2001 From: Yossi Farjoun Date: Sat, 31 Dec 2022 01:22:34 -0500 Subject: [PATCH 16/16] - completed tests. - cleanup --- .../personal/yfarjoun/NormalizeCoverage.scala | 27 ++-- .../yfarjoun/CoverageManagerTest.scala | 1 - .../NormalizeCoverageOptionsTest.scala | 65 -------- .../yfarjoun/NormalizeCoverageTest.scala | 144 ++++++++++++++++-- 4 files changed, 147 insertions(+), 90 deletions(-) delete mode 100644 src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageOptionsTest.scala diff --git a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala index 5cf006630..e01e8fd5f 100644 --- a/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala +++ b/src/main/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverage.scala @@ -33,11 +33,10 @@ import com.fulcrumgenomics.commons.util.LazyLogging import com.fulcrumgenomics.fasta.SequenceDictionary import com.fulcrumgenomics.personal.yfarjoun.NormalizeCoverage._ import com.fulcrumgenomics.sopt._ -import com.fulcrumgenomics.util.Io +import com.fulcrumgenomics.util.{Io, ProgressLogger} import com.fulcrumgenomics.util.NumericTypes.PhredScore import htsjdk.samtools.util.{Interval, IntervalList, SequenceUtil} -import java.util import scala.jdk.CollectionConverters._ object NormalizeCoverage { @@ -50,7 +49,7 @@ object NormalizeCoverage { } @clp(description = -""" + """ |Normalizes coverage of an input bam to be at a given coverage for every base in the reference/input interval list. | |Only non-secondary, non-duplicate, mapped (with mapping quality >= mq argument), pf reads are considered. A set of reads @@ -78,21 +77,23 @@ class NormalizeCoverage( ) extends FgBioTool with LazyLogging { - //sets up the overlap detector and the coverage tracker private def createCoverageManager(bam: PathToBam, intervalsMaybe: Option[PathToIntervals]): CoverageManager = { val dict = SequenceDictionary.extract(bam) - val intervals:IntervalList = intervalsMaybe match { + val intervals: IntervalList = intervalsMaybe match { // if no intervals are provided, make one from the embedded dictionary - case None => val il=new IntervalList(dict.toSam) + case None => val il = new IntervalList(dict.toSam) il.addall(getIntervalsFromDictionary(dict).asJava) il case Some(intervalsListFile) => IntervalList.fromPath(intervalsListFile) } + SequenceUtil.assertSequenceDictionariesEqual(dict.toSam, intervals.getHeader.getSequenceDictionary,true) new CoverageManager(intervals, minMapQ = minMQ, coverageTarget = coverageTarget) } override def execute(): Unit = { + val readTemplates = new ProgressLogger(logger, "templates", "read") + val writtenTemplates = new ProgressLogger(logger, "templates", "written") checkArguments() val coverageManager = createCoverageManager(input, targetIntervals) @@ -110,13 +111,21 @@ class NormalizeCoverage( maxRecordsInRam) // find the templates required for coverage and add them to the output writer - coverageManager - .processTemplates(templateIterator) - .foreach(t => out++=t.allReads) + val reportedTemplates = ProgressLogger + .ProgressLoggingIterator(templateIterator) + .progress(progressLogger = readTemplates) + + val processedTemplates = coverageManager.processTemplates(reportedTemplates) + + ProgressLogger.ProgressLoggingIterator(processedTemplates.iterator) + .progress(progressLogger = writtenTemplates) + .foreach(t => out ++= t.allReads) // close streams out.safelyClose() in.safelyClose() + + logger.info(f"Read ${readTemplates.getCount}%,d templates and wrote out ${writtenTemplates.getCount}%,d") } private def checkArguments(): Unit = { diff --git a/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManagerTest.scala b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManagerTest.scala index 9d305b0d5..a3ea9e612 100644 --- a/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManagerTest.scala +++ b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/CoverageManagerTest.scala @@ -1,6 +1,5 @@ package com.fulcrumgenomics.personal.yfarjoun -import com.fulcrumgenomics.FgBioDef._ import com.fulcrumgenomics.bam.api.SamOrder import com.fulcrumgenomics.bam.{Bams, Template} import com.fulcrumgenomics.fasta.Converters.ToSAMSequenceDictionary diff --git a/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageOptionsTest.scala b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageOptionsTest.scala deleted file mode 100644 index 178a85282..000000000 --- a/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageOptionsTest.scala +++ /dev/null @@ -1,65 +0,0 @@ -package com.fulcrumgenomics.personal.yfarjoun - -//import com.fulcrumgenomics.FgBioDef.PathToBam -import com.fulcrumgenomics.testing.UnitSpec -import htsjdk.samtools.util.Locatable -//import com.fulcrumgenomics.commons.CommonsDef.PathToBam -import htsjdk.samtools.util.Interval - -import org.scalatest._ -import matchers._ - -trait LocatableMatchers { - - class SameLocusAs(right: Locatable) extends Matcher[Locatable] { - - def apply(left: Locatable): MatchResult = { - MatchResult( - left.getContig.equals(right.getContig) && - left.getStart.equals(right.getStart) && - left.getEnd.equals(right.getEnd), - s"""Locatable $left is not the same as $right"""", - s"""Locatable $left is the same as $right"""" - ) - } - } - - def haveSameLocusAs(right: Locatable) = new SameLocusAs(right) -} - -object LocatableMatchers extends LocatableMatchers - -class NormalizeCoverageOptionsTest extends UnitSpec with LocatableMatchers { - -// val nc = new NormalizeCoverage(PathToBam("/dev/null"), Path(""), Path("")) - - val i1: Interval = new Interval("chr1", 100, 200) - val i2: Interval = new Interval("chr1", 100, 300) - val i3: Interval = new Interval("chr1", 300, 400) - - -// -// var subset_1_2: Seq[Locatable] = CoverageManager.subsetReadToLocus(Seq(i1), Seq(i2)) -// var subset_2_1: Seq[Locatable] = CoverageManager.subsetReadToLocus(Seq(i2), Seq(i1)) -// var subset_1_3: Seq[Locatable] = CoverageManager.subsetReadToLocus(Seq(i1), Seq(i3)) -// var subset_3_1: Seq[Locatable] = CoverageManager.subsetReadToLocus(Seq(i3), Seq(i1)) -// var subset_2_3: Seq[Locatable] = CoverageManager.subsetReadToLocus(Seq(i2), Seq(i3)) -// var subset_3_2: Seq[Locatable] = CoverageManager.subsetReadToLocus(Seq(i3), Seq(i2)) -// -// "subsetReadToLocus" should "correctly subset one locatable to another" in { -// subset_1_2 should have size 1 -// subset_1_2.head should haveSameLocusAs(i1) -// -// subset_2_1 should have size 1 -// subset_2_1.head should haveSameLocusAs(i1) -// -// subset_1_3 shouldBe empty -// subset_3_1 shouldBe empty -// -// subset_2_3 should have size 1 -// subset_2_3.head should haveSameLocusAs(new Interval("chr1",300,300)) -// -// subset_3_2 should have size 1 -// subset_3_2.head should haveSameLocusAs(new Interval("chr1",300,300)) -// } -} diff --git a/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageTest.scala b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageTest.scala index 27065360b..d6daf5a7e 100644 --- a/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageTest.scala +++ b/src/test/scala/com/fulcrumgenomics/personal/yfarjoun/NormalizeCoverageTest.scala @@ -24,33 +24,147 @@ package com.fulcrumgenomics.personal.yfarjoun -import com.fulcrumgenomics.coord.LocatableOrderingTest._ +import com.fulcrumgenomics.FgBioDef.{PathToBam, PathToIntervals} +import com.fulcrumgenomics.bam.api.{SamOrder, SamRecord, SamSource} +import com.fulcrumgenomics.fasta.Converters.ToSAMSequenceDictionary import com.fulcrumgenomics.fasta.{SequenceDictionary, SequenceMetadata} import com.fulcrumgenomics.testing.{SamBuilder, UnitSpec} -import htsjdk.samtools.util.{Interval, Locatable} +import com.fulcrumgenomics.util.{Io, SortOrder} +import htsjdk.samtools.util.{Interval, IntervalList} +import org.scalatest.matchers.must.Matchers.convertToAnyMustWrapper -/** Companion object for [[LocatableOrderingTest]]. */ -object LocatableOrderingTest { +import scala.jdk.CollectionConverters.SeqHasAsJava - /** The reference sequence name for chromosome 1. */ - private val Chr1: String = "chr1" +/** Unit test for [[NormalizeCoverage]]. */ +class NormalizeCoverageTest extends UnitSpec { - /** The reference sequence name for chromosome 2. */ - private val Chr2: String = "chr2" + /* test will have a region of interest consisting of initially overlapping regions - /** The sequence dictionary for the ordering. */ - private val Dict: SequenceDictionary = SequenceDictionary(SequenceMetadata(Chr1, length = 10_000), SequenceMetadata(Chr2, length = 100_000)) + chr1:100-250 + chr1:200-400 + chr1:410-450 - private val Builder:SamBuilder = new SamBuilder(sd=Some(Dict)) - Builder.addPair() -} + The templates will consist of reads that + - overlap each other (mates and supplementals) + - map to different contigs + - are unneeded for various reasons + - overlap multiple regions of interest + - target coverage = 2 (so that we can test that overlaps count as 1) + - minMq=30 +*/ -/** Unit tests for [[NormalizeCoverage]]. */ + private val dict = SequenceDictionary( + SequenceMetadata(name = "chr1", length = 10000), + SequenceMetadata(name = "chr2", length = 50000), + SequenceMetadata(name = "chr3", length = 10000) + ).asSam + val chr1Index: Int = dict.getSequenceIndex("chr1") + val chr2Index: Int = dict.getSequenceIndex("chr2") + val chr3Index: Int = dict.getSequenceIndex("chr3") -class NormalizeCoverageTest extends UnitSpec { + val i1: Interval = new Interval("chr1", 100, 250) + val i2: Interval = new Interval("chr1", 200, 400) + val i3: Interval = new Interval("chr1", 410, 450) + + val il: IntervalList = new IntervalList(dict) + il.addall(List(i1, i2, i3).asJava) + + val builder = new SamBuilder(sort = Some(SamOrder.Coordinate), readLength = 100) + var expectedReadCount: Int = 0 + + builder.header.setSequenceDictionary(dict) + + // template: chr1:1-100, chr1:200-299 (mapq0) (needed 1) + builder.addPair("needed_1", contig = chr1Index, start1 = 1, start2 = 200, mapq2 = 0) + expectedReadCount += 2 + + // template: chr1:1-99,chr1:451-550 (not needed 1) + builder.addPair("not_needed_1", contig = chr1Index, start1 = 1, start2 = 451, bases1 = "A" * 99, quals1 = "F" * 99, cigar1 = "99M") + + // template: chr1:150-249, unmapped (needed 2) + builder.addPair("needed_2", contig = chr1Index, start1 = 150, unmapped2 = true) + expectedReadCount += 2 + + // template: chr1:150-249 (mapq=10), unmapped (not needed) + builder.addPair("not_needed_2", contig = chr1Index, start1 = 150, mapq1 = 10, unmapped2 = true) + + // template: chr1:120-219, chr1:130-229 and supplementals in chr2 (needed) + builder.addPair("needed_3", contig = chr1Index, start1 = 130, start2 = 140) + val supplementalReads: Seq[SamRecord] = builder.addPair("needed_3", contig = chr3Index, start1 = 1200, start2 = 1300) + supplementalReads.foreach(r => r.supplementary = true) + expectedReadCount += 4 + + // template : chr1:120-219, chr1:130-229 duplicate (not needed 3) + val dupReads: Seq[SamRecord] = builder.addPair("not_needed_3", contig = chr1Index, start1 = 120, start2 = 130) + dupReads.foreach(r => r.duplicate = true) + // template: chr1:120-130, chr1:120-130 (mapq=10) (two but no more are needed for the overlap region) + for (i <- 1 to 3) { + val shortReadsA: Seq[SamRecord] = builder.addPair(s"needed_perhaps_$i", contig = chr1Index, start1 = 120, start2 = 120, mapq2 = 10) + shortReadsA.foreach(r => { + r.bases = "A" * 11 + r.quals = "F" * 11 + r.cigar = "11M" + }) + } + expectedReadCount += 4 + // template: chr1:120-130, chr1:120-130 secondary (not needed 4) + val secondaryReads: Seq[SamRecord] = builder.addPair("not_needed_4", contig = chr1Index, start1 = 120, start2 = 120) + secondaryReads.foreach(r => r.secondary = true) + // template: chr1:380-479, chr2:350-449 (needed 4) + builder.addPair("needed_4", contig = chr1Index, contig2 = Some(chr2Index), start1 = 380, start2 = 350) + expectedReadCount += 2 + // template: chr1:380-479, chr2:350-449 non-pf (not needed 5) + val nonPfReads: Seq[SamRecord] = builder.addPair("not_needed_5", contig = chr1Index, contig2 = Some(chr2Index), start1 = 120, start2 = 120) + nonPfReads.foreach(r => r.pf = false) + + // template: chr2:100-199, chr3:400-499 (not needed 6) + builder.addPair("not_needed_6", contig = chr2Index, contig2 = Some(chr3Index), start1 = 100, start2 = 400) + + // fragment 1: chr1:10-109 (needed) + builder.addFrag("needed_5", contig = chr1Index, start = 10) + expectedReadCount += 1 + + // fragment 2: chr1:451-550 (not needed + builder.addFrag("not_needed_7", contig = chr1Index, start = 451) + + val intervalListFile: PathToIntervals = Io.makeTempFile("NormalizeCoverageTest", ".list") + intervalListFile.toFile.deleteOnExit() + il.write(intervalListFile) + + val bamFileIn: PathToBam = Io.makeTempFile("NormalizeCoverageTest", "_In.bam") + bamFileIn.toFile.deleteOnExit() + builder.write(bamFileIn) + + val bamFileOut: PathToBam = Io.makeTempFile("NormalizeCoverageTest", "_Out.bam") + bamFileOut.toFile.deleteOnExit() + + val normalizeCoverage = new NormalizeCoverage(bamFileIn, bamFileOut, Some(intervalListFile), coverageTarget = 2, sortOrder = SamOrder.Coordinate) + + normalizeCoverage.execute() + + "normalizeCoverage" should "correctly include reads in a simple test" in { + val reads = SamSource(bamFileOut) + + var optionalCount: Int = 0 + var readCount: Int = 0 + reads.foreach(r => { + readCount += 1 + r.name must startWith("needed") + r.name mustNot startWith("not_needed") + if (r.name.startsWith("needed_perhaps")) { + optionalCount += 1 + } + }) + optionalCount mustEqual 4 // each optional template has 2 overlapping reads, and we need two templates, so 4 reads + expectedReadCount must be > 10 // just to be clear that this isn't empty... + readCount mustEqual expectedReadCount + reads.close() + } } + +