Skip to content

Commit

Permalink
refactor how we iterate over the list
Browse files Browse the repository at this point in the history
  • Loading branch information
davidangb committed Nov 1, 2024
1 parent dc744ca commit 7857f88
Show file tree
Hide file tree
Showing 5 changed files with 108 additions and 54 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
package org.broadinstitute.dsde.firecloud.filematch.result

import java.nio.file.Path

/**
* Base trait for failed/successful file-matching results; every result exposes a `firstFile` path
*/
trait FileMatchResult {}
trait FileMatchResult {
/** The path this result is keyed on; every concrete result type must supply it. */
def firstFile: Path
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@ import java.nio.file.Path
/**
* MatchResult indicating that the file successfully hit a known pattern.
*/
case class SuccessfulMatchResult(firstFile: Path, secondFile: Path, id: String) extends FileMatchResult {}
case class SuccessfulMatchResult(firstFile: Path, secondFile: Path, id: String) extends FileMatchResult {

/**
* Builds a PartialMatchResult from this result's firstFile and id; secondFile is not carried over.
*/
def toPartial: PartialMatchResult = PartialMatchResult(firstFile, id)

}

@VisibleForTesting
object SuccessfulMatchResult {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import org.broadinstitute.dsde.firecloud.filematch.result.FileMatchResult

import java.nio.file.Path

trait FileMatchStrategy {
trait FileRecognitionStrategy {

def matchFirstFile(path: Path): FileMatchResult

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ object IlluminaPairedEndStrategy {
)
}

class IlluminaPairedEndStrategy extends FileMatchStrategy {
class IlluminaPairedEndStrategy extends FileRecognitionStrategy {
override def matchFirstFile(path: Path): FileMatchResult = {
val foundMatch = FILE_ENDINGS.find { case (key, _) => path.toString.endsWith(key) }

Expand Down
Original file line number Diff line number Diff line change
@@ -1,81 +1,127 @@
package org.broadinstitute.dsde.firecloud.utils

import com.typesafe.scalalogging.LazyLogging
import org.broadinstitute.dsde.firecloud.filematch.result.{
FailedMatchResult,
FileMatchResult,
PartialMatchResult,
SuccessfulMatchResult
}
import org.broadinstitute.dsde.firecloud.filematch.strategy.{FileMatchStrategy, IlluminaPairedEndStrategy}
import org.broadinstitute.dsde.firecloud.filematch.result
import org.broadinstitute.dsde.firecloud.filematch.result.{FailedMatchResult, FileMatchResult, SuccessfulMatchResult}
import org.broadinstitute.dsde.firecloud.filematch.strategy.{FileRecognitionStrategy, IlluminaPairedEndStrategy}

import java.nio.file.Path
import scala.annotation.tailrec

// *******************************************************************************************************************
// POC of file-matching for AJ-2025:
// Given a list of files, pair those files based on Illumina single end and paired end read patterns
// *******************************************************************************************************************

/**
* Given a list of files, pair those files based on their naming conventions.
* At the time of writing, this involves recognizing Illumina single end and paired end read patterns
* such as those defined at https://support.illumina.com/help/BaseSpace_Sequence_Hub_OLH_009008_2/Source/Informatics/BS/NamingConvention_FASTQ-files-swBS.htm
*
* In the future, we may support additional naming conventions
*/
class FileMatcher extends LazyLogging {

private val matchingStrategies: List[FileMatchStrategy] = List(new IlluminaPairedEndStrategy())
// the list of recognition strategies to use
private val matchingStrategies: List[FileRecognitionStrategy] = List(new IlluminaPairedEndStrategy())

/**
 * Pair up the supplied paths according to our known recognition strategies.
 * This is a thin public entry point; the work is delegated to performPairing.
 *
 * @param pathList the list of files to inspect
 * @return pairing results, exactly as produced by performPairing
 */
def pairPaths(pathList: List[Path]): List[FileMatchResult] = {
  performPairing(pathList)
}

/**
 * String-based convenience wrapper around pairPaths: each file name is converted
 * to a Path and the resulting list is handed to pairPaths.
 *
 * @param fileList the list of files to inspect, as Strings
 * @return pairing results, exactly as returned by pairPaths
 */
def pairFiles(fileList: List[String]): List[FileMatchResult] =
  pairPaths(fileList.map(name => new java.io.File(name).toPath))

def pairPaths(pathList: List[Path]): List[FileMatchResult] = {
// sort the incoming list for better performance (???)
val paths = pathList.sorted
/**
 * Implementation for file pairing. This executes in three steps:
 * 1. Use our known file recognition strategies to identify all "read 1" files in the file list
 * 2. Search for all "read 2" files in the file list which match the previously-identified "read 1"s
 * 3. Handle the remaining files which are not recognized as either "read 1" or "read 2"
 *
 * @param pathList the list of files to inspect
 * @return pairing results (successful, partial and failed), sorted by firstFile
 */
private def performPairing(pathList: List[Path]): List[FileMatchResult] = {
  // step 1: every path recognized as a "read 1"; each result names the secondFile it hopes to pair with
  val desiredPairings: List[SuccessfulMatchResult] = findFirstFiles(pathList)

  // the recognized firstFiles are accounted for, so drop them from the outstanding paths
  val remainingPaths: List[Path] = pathList.diff(desiredPairings.map(_.firstFile))

  // step 2: look for each desired secondFile among the outstanding paths. This yields a
  // SuccessfulMatchResult when the secondFile is present, or the result of toPartial when it is not
  val pairingResults: List[FileMatchResult] = findSecondFiles(remainingPaths, desiredPairings)

  // step 3: whatever is left after removing the paired "read 2" files was not recognized at all
  val pairedSecondFiles: List[Path] = pairingResults.collect { case paired: SuccessfulMatchResult =>
    paired.secondFile
  }
  val unrecognizedResults: List[FailedMatchResult] =
    remainingPaths.diff(pairedSecondFiles).map(path => FailedMatchResult(path))

  // return results, sorted by firstFile
  (pairingResults ++ unrecognizedResults).sortBy(_.firstFile)
}

@tailrec
private def pairNextPath(remainingPathList: List[Path], pairsFound: List[FileMatchResult]): List[FileMatchResult] =
remainingPathList match {
case Nil =>
// no files left to match. Just return what we have found so far.
pairsFound
case nextFile :: remaining =>
// try to match the nextFile to our known recognition strategies
tryMatchStrategies(nextFile, remaining) match {
case success: SuccessfulMatchResult =>
// we found a pair for the current file, remove the pair from the remaining file list
pairNextPath(remaining.filterNot(_.equals(success.secondFile)), pairsFound.appended(success))
case failure => pairNextPath(remaining, pairsFound.appended(failure))
}
/**
 * Run every path through tryPairingStrategies and keep only the outcomes that are
 * SuccessfulMatchResult, i.e. the paths recognized as a "read 1". Paths whose outcome
 * is anything else are dropped from the returned list.
 *
 * @param pathList the list of files to inspect
 * @return the SuccessfulMatchResult for each recognized path, in pathList order
 */
private def findFirstFiles(pathList: List[Path]): List[SuccessfulMatchResult] =
  pathList
    .map(path => tryPairingStrategies(path))
    .collect { case recognized: SuccessfulMatchResult => recognized }

/**
 * For each desired pairing, check whether its secondFile is actually present in pathList.
 * A pairing whose secondFile is present is returned unchanged; otherwise it is converted
 * with toPartial. One result is produced per desired pairing, in the same order.
 *
 * @param pathList the list of files in which to look for the secondFiles
 * @param desiredPairings the recognized "read 1" results whose secondFile should be looked up
 * @return pairing results
 */
private def findSecondFiles(pathList: List[Path],
                            desiredPairings: List[SuccessfulMatchResult]
): List[FileMatchResult] = {
  // build the lookup once so each membership check is constant-time instead of a list scan
  val availablePaths: Set[Path] = pathList.toSet
  desiredPairings.map { desiredPairing =>
    if (availablePaths.contains(desiredPairing.secondFile)) desiredPairing
    else desiredPairing.toPartial
  }
}

private def tryMatchStrategies(mainFile: Path, remainingPathList: List[Path]): FileMatchResult = {
// does the current file hit on any of our file-matching patterns? Iterate over the matching strategies
// and return the first successful match result.
/**
 * Attempt all the configured file recognition strategies against the supplied file.
 * Strategies are tried in list order and evaluation stops at the first one that
 * returns a SuccessfulMatchResult.
 *
 * @param mainFile the file to try to recognize
 * @return SuccessfulMatchResult if the file is recognized; FailedMatchResult if not
 */
private def tryPairingStrategies(mainFile: Path): FileMatchResult =
  matchingStrategies.iterator // iterator keeps evaluation lazy: later strategies are skipped after a hit
    .map(strategy => strategy.matchFirstFile(mainFile))
    .collectFirst { case recognized: SuccessfulMatchResult => recognized }
    // no strategy recognized the file
    .getOrElse(FailedMatchResult(mainFile))

}

0 comments on commit 7857f88

Please sign in to comment.