From 7857f88d7ace0107349877df5ae555c037e21b3a Mon Sep 17 00:00:00 2001 From: David An Date: Fri, 1 Nov 2024 10:17:59 -0400 Subject: [PATCH] refactor how we iterate over the list --- .../filematch/result/FileMatchResult.scala | 6 +- .../result/SuccessfulMatchResult.scala | 6 +- ...gy.scala => FileRecognitionStrategy.scala} | 2 +- .../strategy/IlluminaPairedEndStrategy.scala | 2 +- .../dsde/firecloud/utils/FileMatcher.scala | 146 ++++++++++++------ 5 files changed, 108 insertions(+), 54 deletions(-) rename src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/{FileMatchStrategy.scala => FileRecognitionStrategy.scala} (87%) diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/FileMatchResult.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/FileMatchResult.scala index 369eb72a4..7809a2b52 100644 --- a/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/FileMatchResult.scala +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/FileMatchResult.scala @@ -1,6 +1,10 @@ package org.broadinstitute.dsde.firecloud.filematch.result +import java.nio.file.Path + /** * Marker trait for failed/successful file-matching results */ -trait FileMatchResult {} +trait FileMatchResult { + def firstFile: Path +} diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/SuccessfulMatchResult.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/SuccessfulMatchResult.scala index 7289b7ec3..55a613b0c 100644 --- a/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/SuccessfulMatchResult.scala +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/SuccessfulMatchResult.scala @@ -7,7 +7,11 @@ import java.nio.file.Path /** * MatchResult indicating that the file successfully hit a known pattern. */ -case class SuccessfulMatchResult(firstFile: Path, secondFile: Path, id: String) extends FileMatchResult {} +case class SuccessfulMatchResult(firstFile: Path, secondFile: Path, id: String) extends FileMatchResult { + + def toPartial: PartialMatchResult = PartialMatchResult(firstFile, id) + +} @VisibleForTesting object SuccessfulMatchResult { diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/FileMatchStrategy.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/FileRecognitionStrategy.scala similarity index 87% rename from src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/FileMatchStrategy.scala rename to src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/FileRecognitionStrategy.scala index 4e316ef35..1c3cf2783 100644 --- a/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/FileMatchStrategy.scala +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/FileRecognitionStrategy.scala @@ -4,7 +4,7 @@ import org.broadinstitute.dsde.firecloud.filematch.result.FileMatchResult import java.nio.file.Path -trait FileMatchStrategy { +trait FileRecognitionStrategy { def matchFirstFile(path: Path): FileMatchResult diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/IlluminaPairedEndStrategy.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/IlluminaPairedEndStrategy.scala index 33e1ab948..17de75b48 100644 --- a/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/IlluminaPairedEndStrategy.scala +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/IlluminaPairedEndStrategy.scala @@ -26,7 +26,7 @@ object IlluminaPairedEndStrategy { ) } -class IlluminaPairedEndStrategy extends FileMatchStrategy { +class IlluminaPairedEndStrategy extends FileRecognitionStrategy { override def matchFirstFile(path: Path): FileMatchResult = { val foundMatch = FILE_ENDINGS.find { case (key, _) => path.toString.endsWith(key) } diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/utils/FileMatcher.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/utils/FileMatcher.scala index 53f26431a..be4d151aa 100644 --- a/src/main/scala/org/broadinstitute/dsde/firecloud/utils/FileMatcher.scala +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/utils/FileMatcher.scala @@ -1,81 +1,127 @@ package org.broadinstitute.dsde.firecloud.utils import com.typesafe.scalalogging.LazyLogging -import org.broadinstitute.dsde.firecloud.filematch.result.{ - FailedMatchResult, - FileMatchResult, - PartialMatchResult, - SuccessfulMatchResult -} -import org.broadinstitute.dsde.firecloud.filematch.strategy.{FileMatchStrategy, IlluminaPairedEndStrategy} +import org.broadinstitute.dsde.firecloud.filematch.result +import org.broadinstitute.dsde.firecloud.filematch.result.{FailedMatchResult, FileMatchResult, SuccessfulMatchResult} +import org.broadinstitute.dsde.firecloud.filematch.strategy.{FileRecognitionStrategy, IlluminaPairedEndStrategy} import java.nio.file.Path -import scala.annotation.tailrec - -// ******************************************************************************************************************* -// POC of file-matching for AJ-2025: -// Given a list of files, pair those files based on Illumina single end and paired end read patterns -// ******************************************************************************************************************* +/** + * Given a list of files, pair those files based on their naming conventions. + * At the time of writing, this involves recognizing Illumina single end and paired end read patterns + * such as those defined at https://support.illumina.com/help/BaseSpace_Sequence_Hub_OLH_009008_2/Source/Informatics/BS/NamingConvention_FASTQ-files-swBS.htm + * + * In the future, we may support additional naming conventions + */ class FileMatcher extends LazyLogging { - private val matchingStrategies: List[FileMatchStrategy] = List(new IlluminaPairedEndStrategy()) + // the list of recognition strategies to use + private val matchingStrategies: List[FileRecognitionStrategy] = List(new IlluminaPairedEndStrategy()) + /** + * Given a list of files, pair up those files according to our known recognition strategies. + * @param pathList the list of files to inspect + * @return pairing results + */ + def pairPaths(pathList: List[Path]): List[FileMatchResult] = + performPairing(pathList) + + /** + * Given a list of files, pair up those files according to our known recognition strategies. + * @param fileList the list of files to inspect, as Strings + * @return pairing results + */ def pairFiles(fileList: List[String]): List[FileMatchResult] = { // convert fileList to pathList val pathList = fileList.map(file => new java.io.File(file).toPath) pairPaths(pathList) } - def pairPaths(pathList: List[Path]): List[FileMatchResult] = { - // sort the incoming list for better performance (???) - val paths = pathList.sorted + /** + * Implementation for file pairing. This executes in three steps: + * 1. Use our known file recognition strategies to identify all "read 1" files in the file list + * 2. Search for all "read 2" files in the file list which match the previously-identified "read 1"s + * 3. Handle the remaining files which are not recognized as either "read 1" or "read 2" + * + * @param pathList the list of files to inspect + * @return pairing results + */ + private def performPairing(pathList: List[Path]): List[FileMatchResult] = { + // find every path in the incoming pathList that is recognized by one of our known patterns + val desiredPairings: List[SuccessfulMatchResult] = findFirstFiles(pathList) + + // remove the recognized firstFiles from the outstanding pathList + val remainingPaths: List[Path] = pathList diff desiredPairings.map(_.firstFile) + + // process the recognized "read 1" files, and look for their desired pairings in the outstanding pathList. + // this will result in either SuccessfulMatchResult when the desired pairing is found, or PartialMatchResult + // when the desired pairing is not found + val pairingResults: List[FileMatchResult] = findSecondFiles(remainingPaths, desiredPairings) + + // remove the recognized "read 2" files from the outstanding pathList + val unrecognizedPaths: List[Path] = remainingPaths diff pairingResults.collect { case s: SuccessfulMatchResult => + s.secondFile + } + // translate the unrecognized paths into a FileMatchResult + val unrecognizedResults: List[FailedMatchResult] = unrecognizedPaths.map(path => FailedMatchResult(path)) - pairNextPath(paths, List()) + // return results, sorted by firstFile + (pairingResults ++ unrecognizedResults).sortBy(r => r.firstFile) } - @tailrec - private def pairNextPath(remainingPathList: List[Path], pairsFound: List[FileMatchResult]): List[FileMatchResult] = - remainingPathList match { - case Nil => - // no files left to match. Just return what we have found so far. - pairsFound - case nextFile :: remaining => - // try to match the nextFile to our known recognition strategies - tryMatchStrategies(nextFile, remaining) match { - case success: SuccessfulMatchResult => - // we found a pair for the current file, remove the pair from the remaining file list - pairNextPath(remaining.filterNot(_.equals(success.secondFile)), pairsFound.appended(success)) - case failure => pairNextPath(remaining, pairsFound.appended(failure)) - } + /** + * find every path in the incoming pathList that is recognized as a "read 1" by our known patterns + * @param pathList the list of files to inspect + * @return pairing results + */ + private def findFirstFiles(pathList: List[Path]): List[SuccessfulMatchResult] = { + val firstFiles: List[result.SuccessfulMatchResult] = pathList.collect { path => + tryPairingStrategies(path) match { + case success: SuccessfulMatchResult => success + } + } + firstFiles + } + /** + * find every path in the incoming pathList that is recognized as a "read 2" by our known patterns + * + * @param pathList the list of files to inspect + * @param desiredPairings the "read 2" files to look for in the pathList + * @return pairing results + */ + private def findSecondFiles(pathList: List[Path], + desiredPairings: List[SuccessfulMatchResult] + ): List[FileMatchResult] = + desiredPairings.map { desiredPairing => + // search for the desired pairing's secondFile in the list of actual files + pathList.find(p => p.equals(desiredPairing.secondFile)) match { + case Some(_) => desiredPairing + case None => desiredPairing.toPartial + } } - private def tryMatchStrategies(mainFile: Path, remainingPathList: List[Path]): FileMatchResult = { - // does the current file hit on any of our file-matching patterns? Iterate over the matching strategies - // and return the first successful match result. + /** + * Attempt all the configured file recognition strategies against the supplied file. + * + * @param mainFile the file to try to recognize + * @return SuccessfulMatchResult if the file is recognized; FailedMatchResult if not + */ + private def tryPairingStrategies(mainFile: Path): FileMatchResult = { + // does the current file hit on any of our file-matching patterns? + // Iterate over the matching strategies and return the first successful match result. val strategyHit = matchingStrategies.collectFirst(strategy => strategy.matchFirstFile(mainFile) match { case success: SuccessfulMatchResult => success } ) - strategyHit match { - case Some(desiredResult: SuccessfulMatchResult) => - // The current file `mainFile` hits on our file-matching patterns. - // The `desiredResult` contains the name of the file that we hope `mainFile` can be paired with. - // Now, check if the matching second file in `desiredResult` actually exists in the original file list. - if (remainingPathList.exists(path => path.equals(desiredResult.secondFile))) { - desiredResult - } else { - PartialMatchResult(desiredResult.firstFile, desiredResult.id) - } - case _ => - // the current file isn't recognized by any of our file-matching patterns, or the file it should be paired - // with doesn't exist. - FailedMatchResult(mainFile) + // The current file is recognized by one of our recognition strategies + case Some(desiredResult: SuccessfulMatchResult) => desiredResult + // the current file is not recognized + case _ => FailedMatchResult(mainFile) } - } }