bigdatagenomics · jstjohn · Apr 24, 2015 · Apr 24, 2015 · Apr 24, 2015 · May 13, 2015
diff --git a/.gitignore b/.gitignore
@@ -6,6 +6,7 @@
 *.vcf*
 target
 *.log
+*.jar
 
 # Intellij
 .idea/

diff --git a/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/Avocado.scala b/avocado-cli/src/main/scala/org/bdgenomics/avocado/cli/Avocado.scala
@@ -72,6 +72,9 @@ class AvocadoArgs extends Args4jBase with ParquetArgs {
   @Argument(metaVar = "CONFIG", required = true, usage = "avocado configuration file", index = 3)
   var configFile: String = _
 
+  @Argument(metaVar = "NORMAL", required = false, usage = "ADAM normal data", index = 4)
+  var normalInput: String = _
+
   @option(name = "-debug", usage = "If set, prints a higher level of debug output.")
   var debug = false
 
@@ -204,10 +207,18 @@ class Avocado(protected val args: AvocadoArgs) extends BDGSparkCommand[AvocadoAr
 
     log.info("Loading reads in from " + args.readInput)
     // load in reads from ADAM file
-    val reads: RDD[AlignmentRecord] = LoadReads.time {
+    var reads: RDD[AlignmentRecord] = LoadReads.time {
       Input(sc, args.readInput, reference, config)
     }
 
+    // load in reads from normal ADAM file if there is one
+    var normal: RDD[AlignmentRecord] = null
+
+    if (args.normalInput != null) {
+      normal = Input(sc, args.normalInput, reference, config)
+      reads = sc.union(reads, normal)
+    }
+
     // create stats/config item
     val stats = new AvocadoConfigAndStats(sc, args.debug, reads, reference)
 
@@ -233,4 +244,4 @@ class Avocado(protected val args: AvocadoArgs) extends BDGSparkCommand[AvocadoAr
         args.disableDictionaryEncoding)
     }
   }
-}
+}
diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/algorithms/mutect/LikelihoodModel.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/algorithms/mutect/LikelihoodModel.scala
@@ -0,0 +1,94 @@
+/*
+ * Licensed to Big Data Genomics (BDG) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The BDG licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.bdgenomics.avocado.algorithms.mutect
+
+import org.bdgenomics.avocado.models.AlleleObservation
+import org.bdgenomics.adam.util.PhredUtils.phredToErrorProbability
+import scala.math._
+
+/** Scores allele observations at a site; `f` is an optional allele fraction that a model may ignore. */
+trait LikelihoodModel extends Serializable {
+  def logLikelihood(ref: String, alt: String,
+                    obs: Iterable[AlleleObservation],
+                    f: Option[Double]): Double
+}
+
+case class LogOdds(m1: LikelihoodModel, m2: LikelihoodModel) {
+  /** Log-likelihood of `obs` under `m1` minus its log-likelihood under `m2`. */
+  def logOdds(ref: String, alt: String, obs: Iterable[AlleleObservation], f: Option[Double]): Double = {
+    val underM1 = m1.logLikelihood(ref, alt, obs, f)
+    underM1 - m2.logLikelihood(ref, alt, obs, f)
+  }
+}
+
+/** Log odds of the M_{m, f} model against the M0 (f = 0) model. */
+object MutectLogOdds extends LogOdds(MfmModel, M0Model)
+
+/**
+ * Log odds of the M0 (f = 0) model against the heterozygous (f = 0.5) model;
+ * use for the log odds that a normal is not a heterozygous site.
+ */
+object MutectSomaticLogOdds extends LogOdds(M0Model, MHModel)
+
+/**
+ * M_0 -- M_{m, f} evaluated at f = 0; the caller-supplied `f` is ignored.
+ */
+object M0Model extends LikelihoodModel {
+  def logLikelihood(ref: String, alt: String, obs: Iterable[AlleleObservation], f: Option[Double]): Double = {
+    MfmModel.logLikelihood(ref, alt, obs, Some(0.0))
+  }
+}
+
+/**
+ * M_{m, 0.5} -- log-likelihood of a heterozygous site: M_{m, f} evaluated
+ * at f = 0.5. The caller-supplied `f` is ignored.
+ */
+object MHModel extends LikelihoodModel {
+  def logLikelihood(ref: String, alt: String,
+                    obs: Iterable[AlleleObservation],
+                    f: Option[Double]): Double = {
+    MfmModel.logLikelihood(ref, alt, obs, Some(0.5))
+  }
+}
+
+/**
+ * M_{m, f} -- site model in which the alt allele occurs at fraction f.
+ */
+object MfmModel extends LikelihoodModel {
+
+  /** Probability of one observation given ref `r`, alt `m` and alt fraction `f`. */
+  def P_bi(obs: AlleleObservation, r: String, m: String, f: Double): Double = {
+    val errorProb = phredToErrorProbability(obs.phred)
+    val observed = obs.allele
+
+    if (observed == r) f * (errorProb / 3.0) + (1.0 - f) * (1.0 - errorProb)
+    else if (observed == m) f * (1.0 - errorProb) + (1.0 - f) * (errorProb / 3.0)
+    else errorProb / 3.0
+  }
+
+  /** Sums log10 of P_bi over obs; a missing f is estimated as the share of obs carrying alt. */
+  def logLikelihood(ref: String, alt: String,
+                    obs: Iterable[AlleleObservation],
+                    f: Option[Double]): Double = {
+    val altFraction: Double = f.getOrElse(obs.count(_.allele == alt).toDouble / obs.size)
+    val perObservation = obs.map(ob => log10(P_bi(ob, ref, alt, altFraction)))
+    perObservation.sum
+  }
+}
+
diff --git a/avocado-core/src/main/scala/org/bdgenomics/avocado/discovery/ReadExplorer.scala b/avocado-core/src/main/scala/org/bdgenomics/avocado/discovery/ReadExplorer.scala
@@ -23,7 +23,8 @@ import org.apache.spark.Logging
 import org.apache.spark.rdd.RDD
 import org.bdgenomics.adam.models.ReferencePosition
 import org.bdgenomics.adam.rdd.ADAMContext._
-import org.bdgenomics.adam.rich.RichAlignmentRecord
+import org.bdgenomics.adam.rich.{ DecadentRead, RichAlignmentRecord }
+import org.bdgenomics.adam.util.MdTag
 import org.bdgenomics.avocado.Timers._
 import org.bdgenomics.avocado.models.{ AlleleObservation, Observation }
 import org.bdgenomics.avocado.stats.AvocadoConfigAndStats
@@ -43,10 +44,38 @@ class ReadExplorer(referenceObservations: RDD[Observation]) extends Explorer wit
 
   val companion: ExplorerCompanion = ReadExplorer
 
+  def mdTagToMismatchPositions(mdTag: MdTag, cigar: List[CigarElement]): Seq[Int] = { // MD-tag mismatch positions, shifted for indels
+    var idx = 0 // running sum of the lengths of all CIGAR elements visited so far
+    val insertions = cigar.map(c => {
+      (c, c.getLength)
+    }).map(kv => {
+      val r = (kv._1, idx) // pair each element with the offset at which it starts
+      idx += kv._2
+      r
+    }).flatMap(kv => {
+      val (ce, i) = kv
+      if (ce.getOperator == CigarOperator.I) {
+        (0 until ce.getLength).map(_ + i) // one offset per inserted base
+      } else {
+        Seq.empty
+      }
+    })
+
+    val deletions = mdTag.deletions
+    val oriPositions = mdTag.mismatches.keys
+    var mismatchPositions = oriPositions.zip(oriPositions) // (adjusted position, original position)
+    for (iPos <- insertions) { // NOTE(review): bumps mismatches at or before the inserted base, while deletions below adjust those after - confirm `<=`
+      mismatchPositions = mismatchPositions.map({ case (p, i) => if (i <= iPos) (p + 1, i) else (p, i) })
+    }
+    for ((dPos, _) <- deletions) { // every deletion position before a mismatch moves that mismatch back by one
+      mismatchPositions = mismatchPositions.map({ case (p, i) => if (i > dPos) (p - 1, i) else (p, i) })
+    }
+    mismatchPositions.map({ case (p, i) => p.toInt }).toSeq // keep only the adjusted positions
+  }
+
   def readToObservations(r: (AlignmentRecord, Long)): Seq[Observation] = ExploringRead.time {
     val (read, readId) = r
     val richRead: RichAlignmentRecord = RichAlignmentRecord(read)
-
     // get read start, contig, strand, sample, mapq, and sequence
     var pos: Long = read.getStart
     val contig: String = read.getContig.getContigName
@@ -64,14 +93,115 @@ class ReadExplorer(referenceObservations: RDD[Observation]) extends Explorer wit
     // get cigar, md tag, and phred scores for bases
     val cigar: List[CigarElement] = richRead.samtoolsCigar.getCigarElements
     val quals = richRead.qualityScores
-    val mdTag = richRead.mdTag
+    val mdString = read.getMismatchingPositions
+    val mismatchPositions: Option[Seq[Int]] = if (mdString != null && mdString != "")
+      Some(mdTagToMismatchPositions(MdTag(read.getMismatchingPositions,
+        if (cigar.head.getOperator == CigarOperator.S) cigar.head.getLength else 0,
+        richRead.samtoolsCigar), cigar))
+    else None
 
     // observations
     var observations = Seq[Observation]()
 
     // position in the current read
     var readPos = 0
 
+    // get the quality scores of the mismatching bases (summed below)
+    val qscores: Option[Seq[Int]] = mismatchPositions.map(l => {
+      l.map(p => {
+        quals(p)
+      })
+    })
+
+    val mismatchQScoreSum = qscores.map(_.sum)
+
+    // Read length including clipped bases: sums every CIGAR element except D, N and P
+    def unclippedLenFromCigar(cigar: Cigar): Int =
+      cigar.getCigarElements.map { element =>
+        val op = element.getOperator
+        val skipped = op == CigarOperator.D || op == CigarOperator.N || op == CigarOperator.P
+        if (skipped) 0 else element.getLength
+      }.sum
+
+    // Sum of alignedElementLength over every element of the CIGAR
+    def alignedLenFromCigar(cigar: Cigar): Int =
+      cigar.getCigarElements.foldLeft(0)((total, element) => total + alignedElementLength(element))
+
+    // Length this element contributes to the aligned read: 0 for D, N, P, H and S, else its length
+    def alignedElementLength(ce: CigarElement): Int = {
+      val op = ce.getOperator
+      val isGap = op == CigarOperator.D || op == CigarOperator.N || op == CigarOperator.P
+      if (isGap || op == CigarOperator.H || op == CigarOperator.S) 0 else ce.getLength
+    }
+
+    // Length of the element when it is a soft (S) or hard (H) clip; 0 for any
+    // other operator
+    def basesTrimmed(cigarElement: CigarElement): Int = {
+      val op = cigarElement.getOperator
+      val isClip = op == CigarOperator.S || op == CigarOperator.H
+      if (isClip) cigarElement.getLength else 0
+    }
+
+    // Set up variables to help with tracking the distance from indels, and
+    // the distance from the current allele to the first and last trimmed base
+    // within this read.
+    val readLen = unclippedLenFromCigar(richRead.samtoolsCigar)
+    val trimmedFromStart = basesTrimmed(cigar.head)
+    val trimmedFromEnd = basesTrimmed(cigar.last)
+    var softclippedBases = 0
+    val alignedLen = alignedLenFromCigar(richRead.samtoolsCigar)
+
+    val cigarLenOps = cigar.zipWithIndex.map({
+      case (ce: CigarElement, idx: Int) =>
+        (idx, (ce.getLength, ce.getOperator))
+    }).toMap
+
+    val alignedLenFromCigars = cigar.map(alignedElementLength)
+
+    // List of changes that are only insertions along with their lengths and pos (pos, (len, CigarOperator.I))
+    val insertions = cigarLenOps.filter({ case (idx, (len, op)) => op == CigarOperator.I })
+    // List of changes that are only deletions along with their lengths and pos (pos, (len, CigarOperator.D))
+    val deletions = cigarLenOps.filter({ case (idx, (len, op)) => op == CigarOperator.D })
+
+    /**
+     * Builds, for one indel, the signed distance from every aligned read base to that indel.
+     * @param idx position of the indel in the Cigar list
+     * @param len length of the insertion from the Cigar list (unused for deletions)
+     * @param del whether it is deletion or insertion
+     * @return one entry per aligned base: positive before the indel, 0 inside an insertion, negative after
+     */
+    def makeNaiveDistanceVec(idx: Int, len: Int, del: Boolean): Vector[Int] = {
+      // Aligned bases contributed by the cigar elements before / after the event at idx
+      val basesBefore = alignedLenFromCigars.take(idx).sum
+      val basesAfter = alignedLenFromCigars.drop(idx + 1).sum
+
+      // eg. for an insertion with basesBefore = 5, basesAfter = 6, len = 3 (14 aligned bases):
+      // (5, 4, 3, 2, 1, 0, 0, 0, -1, -2, -3, -4, -5, -6)
+      val upstream = basesBefore to 1 by -1
+      val inside = if (del) Vector.empty[Int] else Vector.fill(len)(0)
+      val downstream = (1 to basesAfter).map(-_)
+      val distances = (upstream ++ inside ++ downstream).toVector
+      assert(distances.length == alignedLen)
+      distances
+    }
+
+    val insertionDistVecs = insertions.map({ case (idx, (len, _)) => makeNaiveDistanceVec(idx, len, false) })
+    val deletionDistVecs = deletions.map({ case (idx, (len, _)) => makeNaiveDistanceVec(idx, len, true) })
+
+    val posToInsDist: Option[Vector[Int]] = if (insertionDistVecs.size > 0) Some(insertionDistVecs.transpose.map(l => l.minBy(Math.abs(_))).toVector) else None
+    val posToDelDist: Option[Vector[Int]] = if (deletionDistVecs.size > 0) Some(deletionDistVecs.transpose.map(l => l.minBy(Math.abs(_))).toVector) else None
+
+    // Tags of the read; None when `read.tags` is null or throws a NullPointerException
+    def getTags(read: RichAlignmentRecord): Option[Seq[org.bdgenomics.adam.models.Attribute]] =
+      try Option(read.tags)
+      catch {
+        case _: NullPointerException =>
+          None
+      }
+
+    val tags: Option[Seq[org.bdgenomics.adam.models.Attribute]] = getTags(richRead)
+    val mateRescue: Boolean = tags.getOrElse(Seq()).exists(a => a.tag == "XT" && a.value == "M")
+
     def processAlignmentMatch() {
       observations = AlleleObservation(ReferencePosition(contig, pos),
         1,
@@ -80,7 +210,15 @@ class ReadExplorer(referenceObservations: RDD[Observation]) extends Explorer wit
         mapq,
         negativeStrand,
         firstOfPair,
-        readPos,
+        readPos - softclippedBases,
+        alignedLen,
+        posToInsDist.flatMap((lst: Vector[Int]) => Some(lst(readPos - softclippedBases))),
+        posToDelDist.flatMap((lst: Vector[Int]) => Some(lst(readPos - softclippedBases))),
+        trimmedFromStart,
+        trimmedFromEnd,
+        readLen,
+        mismatchQScoreSum,
+        mateRescue,
         sample,
         readId) +: observations
       readPos += 1
@@ -100,13 +238,22 @@ class ReadExplorer(referenceObservations: RDD[Observation]) extends Explorer wit
           mapq,
           negativeStrand,
           firstOfPair,
-          readPos,
+          readPos - softclippedBases,
+          alignedLen,
+          posToInsDist.flatMap((lst: Vector[Int]) => Some(lst(readPos - softclippedBases))),
+          posToDelDist.flatMap((lst: Vector[Int]) => Some(lst(readPos - softclippedBases))),
+          trimmedFromStart,
+          trimmedFromEnd,
+          readLen,
+          mismatchQScoreSum,
+          mateRescue,
           sample,
           readId).asInstanceOf[Observation] +: observations
 
         // increment read pointers
         readPos += alleleLength
         pos += 1
+
       } else if (idx + 1 < cigar.length && cigar(idx + 1).getOperator == CigarOperator.D) {
         // the allele includes the matching base
         val alleleLength = 1 + cigar(idx + 1).getLength
@@ -119,7 +266,15 @@ class ReadExplorer(referenceObservations: RDD[Observation]) extends Explorer wit
           mapq,
           negativeStrand,
           firstOfPair,
-          readPos,
+          readPos - softclippedBases,
+          alignedLen,
+          posToInsDist.flatMap((lst: Vector[Int]) => Some(lst(readPos - softclippedBases))),
+          posToDelDist.flatMap((lst: Vector[Int]) => Some(lst(readPos - softclippedBases))),
+          trimmedFromStart,
+          trimmedFromEnd,
+          readLen,
+          mismatchQScoreSum,
+          mateRescue,
           sample,
           readId).asInstanceOf[Observation] +: observations
 
@@ -152,7 +307,9 @@ class ReadExplorer(referenceObservations: RDD[Observation]) extends Explorer wit
           // no op; handle inserts by looking ahead from match/mismatch operator
         }
         case CigarOperator.S => {
-          readPos += cigar(i).getLength
+          val clippedLen = cigar(i).getLength
+          readPos += clippedLen
+          softclippedBases += clippedLen
         }
         case CigarOperator.H =>
         case _ => {
@@ -177,4 +334,4 @@ class ReadExplorer(referenceObservations: RDD[Observation]) extends Explorer wit
         .flatMap(readToObservations)
     } ++ referenceObservations
   }
-}
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -6,6 +6,7 @@ @@
     *.vcf*
     target
     *.log
+    *.jar
     # Intellij
     .idea/
@@ Expand Down @@