-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor how we iterate over the list
- Loading branch information
Showing
5 changed files
with
108 additions
and
54 deletions.
There are no files selected for viewing
6 changes: 5 additions & 1 deletion
6
src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/FileMatchResult.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,10 @@ | ||
package org.broadinstitute.dsde.firecloud.filematch.result | ||
|
||
import java.nio.file.Path | ||
|
||
/** | ||
* Common trait for failed/successful file-matching results; every result exposes its `firstFile` | ||
*/ | ||
trait FileMatchResult {} | ||
trait FileMatchResult { | ||
def firstFile: Path | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
146 changes: 96 additions & 50 deletions
146
src/main/scala/org/broadinstitute/dsde/firecloud/utils/FileMatcher.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,81 +1,127 @@ | ||
package org.broadinstitute.dsde.firecloud.utils | ||
|
||
import com.typesafe.scalalogging.LazyLogging | ||
import org.broadinstitute.dsde.firecloud.filematch.result.{ | ||
FailedMatchResult, | ||
FileMatchResult, | ||
PartialMatchResult, | ||
SuccessfulMatchResult | ||
} | ||
import org.broadinstitute.dsde.firecloud.filematch.strategy.{FileMatchStrategy, IlluminaPairedEndStrategy} | ||
import org.broadinstitute.dsde.firecloud.filematch.result | ||
import org.broadinstitute.dsde.firecloud.filematch.result.{FailedMatchResult, FileMatchResult, SuccessfulMatchResult} | ||
import org.broadinstitute.dsde.firecloud.filematch.strategy.{FileRecognitionStrategy, IlluminaPairedEndStrategy} | ||
|
||
import java.nio.file.Path | ||
import scala.annotation.tailrec | ||
|
||
// ******************************************************************************************************************* | ||
// POC of file-matching for AJ-2025: | ||
// Given a list of files, pair those files based on Illumina single end and paired end read patterns | ||
// ******************************************************************************************************************* | ||
|
||
/** | ||
* Given a list of files, pair those files based on their naming conventions. | ||
* At the time of writing, this involves recognizing Illumina single end and paired end read patterns | ||
* such as those defined at https://support.illumina.com/help/BaseSpace_Sequence_Hub_OLH_009008_2/Source/Informatics/BS/NamingConvention_FASTQ-files-swBS.htm | ||
* | ||
* In the future, we may support additional naming conventions | ||
*/ | ||
class FileMatcher extends LazyLogging { | ||
|
||
private val matchingStrategies: List[FileMatchStrategy] = List(new IlluminaPairedEndStrategy()) | ||
// the list of recognition strategies to use | ||
private val matchingStrategies: List[FileRecognitionStrategy] = List(new IlluminaPairedEndStrategy()) | ||
|
||
/** | ||
* Given a list of files, pair up those files according to our known recognition strategies. | ||
* @param pathList the list of files to inspect | ||
* @return pairing results | ||
*/ | ||
def pairPaths(pathList: List[Path]): List[FileMatchResult] = {
  // public Path-based entry point; delegates directly to performPairing
  performPairing(pathList)
}
|
||
/** | ||
* Given a list of files, pair up those files according to our known recognition strategies. | ||
* @param fileList the list of files to inspect, as Strings | ||
* @return pairing results | ||
*/ | ||
def pairFiles(fileList: List[String]): List[FileMatchResult] =
  // turn each file name into a Path, then reuse the Path-based entry point
  pairPaths(fileList.map(fileName => new java.io.File(fileName).toPath))
|
||
def pairPaths(pathList: List[Path]): List[FileMatchResult] = { | ||
// sort the incoming list for better performance (???) | ||
val paths = pathList.sorted | ||
/** | ||
* Implementation for file pairing. This executes in three steps: | ||
* 1. Use our known file recognition strategies to identify all "read 1" files in the file list | ||
* 2. Search for all "read 2" files in the file list which match the previously-identified "read 1"s | ||
* 3. Handle the remaining files which are not recognized as either "read 1" or "read 2" | ||
* | ||
* @param pathList the list of files to inspect | ||
* @return pairing results | ||
*/ | ||
private def performPairing(pathList: List[Path]): List[FileMatchResult] = { | ||
// find every path in the incoming pathList that is recognized by one of our known patterns | ||
val desiredPairings: List[SuccessfulMatchResult] = findFirstFiles(pathList) | ||
|
||
// remove the recognized firstFiles from the outstanding pathList | ||
val remainingPaths: List[Path] = pathList diff desiredPairings.map(_.firstFile) | ||
|
||
// process the recognized "read 1" files, and look for their desired pairings in the outstanding pathList. | ||
// this will result in either SuccessfulMatchResult when the desired pairing is found, or PartialMatchResult | ||
// when the desired pairing is not found | ||
val pairingResults: List[FileMatchResult] = findSecondFiles(remainingPaths, desiredPairings) | ||
|
||
// remove the recognized "read 2" files from the outstanding pathList | ||
val unrecognizedPaths: List[Path] = remainingPaths diff pairingResults.collect { case s: SuccessfulMatchResult => | ||
s.secondFile | ||
} | ||
// translate the unrecognized paths into a FileMatchResult | ||
val unrecognizedResults: List[FailedMatchResult] = unrecognizedPaths.map(path => FailedMatchResult(path)) | ||
|
||
pairNextPath(paths, List()) | ||
// return results, sorted by firstFile | ||
(pairingResults ++ unrecognizedResults).sortBy(r => r.firstFile) | ||
} | ||
|
||
@tailrec | ||
private def pairNextPath(remainingPathList: List[Path], pairsFound: List[FileMatchResult]): List[FileMatchResult] = | ||
remainingPathList match { | ||
case Nil => | ||
// no files left to match. Just return what we have found so far. | ||
pairsFound | ||
case nextFile :: remaining => | ||
// try to match the nextFile to our known recognition strategies | ||
tryMatchStrategies(nextFile, remaining) match { | ||
case success: SuccessfulMatchResult => | ||
// we found a pair for the current file, remove the pair from the remaining file list | ||
pairNextPath(remaining.filterNot(_.equals(success.secondFile)), pairsFound.appended(success)) | ||
case failure => pairNextPath(remaining, pairsFound.appended(failure)) | ||
} | ||
/** | ||
* Find every path in the incoming pathList that is recognized as a "read 1" by our known patterns. | ||
* @param pathList the list of files to inspect | ||
* @return the desired pairing for each recognized "read 1" file; unrecognized files are omitted | ||
*/ | ||
private def findFirstFiles(pathList: List[Path]): List[SuccessfulMatchResult] =
  // run the recognition strategies on every path, then keep only the recognized ones;
  // any result that is not a SuccessfulMatchResult is dropped here
  pathList
    .map(path => tryPairingStrategies(path))
    .collect { case recognized: SuccessfulMatchResult => recognized }
|
||
/** | ||
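* For each desired pairing, check whether its expected "read 2" file is present in the incoming pathList. | ||
* | ||
* @param pathList the list of files to inspect | ||
* @param desiredPairings the pairings whose "read 2" file (secondFile) should be looked for in the pathList | ||
* @return for each desired pairing: the pairing itself when its "read 2" file is found, or its partial form when not | ||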
*/ | ||
private def findSecondFiles(pathList: List[Path],
                            desiredPairings: List[SuccessfulMatchResult]
): List[FileMatchResult] =
  desiredPairings.map { wanted =>
    // is the file this pairing hopes for actually present among the supplied paths?
    val secondFilePresent = pathList.exists(candidate => candidate.equals(wanted.secondFile))
    // keep the pairing as-is when its second file exists; otherwise downgrade it to its partial form
    if (secondFilePresent) wanted else wanted.toPartial
  }
|
||
private def tryMatchStrategies(mainFile: Path, remainingPathList: List[Path]): FileMatchResult = { | ||
// does the current file hit on any of our file-matching patterns? Iterate over the matching strategies | ||
// and return the first successful match result. | ||
/** | ||
* Attempt all the configured file recognition strategies against the supplied file. | ||
* | ||
* @param mainFile the file to try to recognize | ||
* @return SuccessfulMatchResult if the file is recognized; FailedMatchResult if not | ||
*/ | ||
private def tryPairingStrategies(mainFile: Path): FileMatchResult = { | ||
// does the current file hit on any of our file-matching patterns? | ||
// Iterate over the matching strategies and return the first successful match result. | ||
val strategyHit = matchingStrategies.collectFirst(strategy => | ||
strategy.matchFirstFile(mainFile) match { | ||
case success: SuccessfulMatchResult => success | ||
} | ||
) | ||
|
||
strategyHit match { | ||
case Some(desiredResult: SuccessfulMatchResult) => | ||
// The current file `mainFile` hits on our file-matching patterns. | ||
// The `desiredResult` contains the name of the file that we hope `mainFile` can be paired with. | ||
// Now, check if the matching second file in `desiredResult` actually exists in the original file list. | ||
if (remainingPathList.exists(path => path.equals(desiredResult.secondFile))) { | ||
desiredResult | ||
} else { | ||
PartialMatchResult(desiredResult.firstFile, desiredResult.id) | ||
} | ||
case _ => | ||
// the current file isn't recognized by any of our file-matching patterns, or the file it should be paired | ||
// with doesn't exist. | ||
FailedMatchResult(mainFile) | ||
// The current file is recognized by one of our recognition strategies | ||
case Some(desiredResult: SuccessfulMatchResult) => desiredResult | ||
// the current file is not recognized | ||
case _ => FailedMatchResult(mainFile) | ||
} | ||
|
||
} | ||
|
||
} |