Merge pull request #10 from p2m2/develop

0.2
p2m2 · Oct 6, 2022 · 5e1abb2 · 5e1abb2
2 parents 9c0fceb + 42478fe
commit 5e1abb2
Show file tree

Hide file tree

Showing 17 changed files with 436 additions and 128 deletions.
diff --git a/build.sbt b/build.sbt
@@ -23,6 +23,7 @@ libraryDependencies ++= Seq(
   "com.lihaoyi" %% "ujson" % "2.0.0",
   "org.scala-lang.modules" %% "scala-parser-combinators" % "2.1.1",
   "com.lihaoyi" %% "utest" % "0.8.1" % Test,
+  "org.slf4j" % "slf4j-simple" % "2.0.3" % Test,
 )
 
 credentials += {

diff --git a/src/main/resources/default.json b/src/main/resources/default.json
@@ -2,20 +2,31 @@
   "Glucosinolate" : {
     "deltaMp0Mp2" : 1.996,
     "numberSulfurMin" : 2,
-    "neutralLoss" :
-      {
-        "gluconolactone" : 178.0,
-        "sulfureTrioxide" : 80.0,
-        "anhydroglucose" : 162.0,
+    "minMzCoreStructure" : 317.995896,
+    "neutralLoss" : {
         "thioglucose_s03" : 242.0,
+        "glucosinolate_223" : 223.0,
         "thioglucose" : 196.0,
-        "glucosinolate_223" : 223.0
+        "gluconolactone" : 178.0,
+        "RCNO4S2-" : 163.0,
+        "anhydroglucose" : 162.0,
+        "sulfureTrioxide" : 80.0
       },
     "daughterIons" : {
-      "C6H11O9S_259" : 259.0,
       "C6H11O8S2_275" : 275.0,
+      "C6H11O9S_259" : 259.0,
+      "C6H10O8S-_241" : 242.0,
       "C6H9NO8S_241" : 241.0,
-      "C6H11O7S_227" : 227.0
+      "C6H11O7S-_227" :  227.0,
+      "C6H11O5S-_195" : 195.03,
+      "C6H11O2-_153" : 163.0,
+      "C2H4O5NS-_153" : 153.98,
+      "C2H3O5S-_138" : 138.97,
+      "C2H2O4S-_135" : 135.97,
+      "HO4S2-_128" : 128.93,
+      "HSO4-_97" : 96.95,
+      "SO4-_95" : 96.0,
+      "C2H3OS-" : 74.99
     },
     "databaseReference" : {
       "G01" : 406.029,

diff --git a/src/main/scala/Main.scala → ...ala/fr/inrae/metabolomics/p2m2/Main.scala b/src/main/scala/Main.scala → ...ala/fr/inrae/metabolomics/p2m2/Main.scala
@@ -1,7 +1,9 @@
+package fr.inrae.metabolomics.p2m2
+
 import fr.inrae.metabolomics.p2m2.`export`.CsvMetabolitesIdentificationFile
 import fr.inrae.metabolomics.p2m2.builder.{MetaboliteIdentification, PeakIdentification, ScanLoader}
 import fr.inrae.metabolomics.p2m2.config.ConfigReader
-import fr.inrae.metabolomics.p2m2.output.CsvMetabolitesIdentification
+import fr.inrae.metabolomics.p2m2.output.MetabolitesIdentification
 import umich.ms.fileio.filetypes.mzxml.{MZXMLFile, MZXMLIndex}
 
 import java.io.File
@@ -12,16 +14,16 @@ object Main extends App {
   import scopt.OParser
 
   case class Config(
-                     mzfiles : Seq[File] = Seq(),
-                     jsonFamilyMetabolitesDetection : Option[File] = None,
-                     thresholdIntensityFilter : Option[Int] = None,
-                     thresholdAbundanceM0Filter : Double = 0.1,
-                     overrepresentedPeakFilter : Int = 800,
-                     startRT : Option[Double] = None,
-                     endRT : Option[Double] = None,
-                     precisionMzh : Int = 1000,
-                     toleranceMz : Double = 0.01,
-                     outfile : Option[File] = None,
+                     mzfiles: Seq[File] = Seq(),
+                     jsonFamilyMetabolitesDetection: Option[File] = None,
+                     thresholdIntensityFilter: Option[Int] = None,
+                     thresholdAbundanceM0Filter: Double = 0.1,
+                     overrepresentedPeakFilter: Int = 800,
+                     startRT: Option[Double] = None,
+                     endRT: Option[Double] = None,
+                     precisionMzh: Int = 1000,
+                     toleranceMz: Double = 0.01,
+                     outfile: Option[File] = None,
                      verbose: Boolean = false,
                      debug: Boolean = false
                    )
@@ -36,11 +38,11 @@ object Main extends App {
         .optional()
         .action((x, c) => c.copy(jsonFamilyMetabolitesDetection = Some(x)))
         .text(s"json configuration to detect metabolite family."),
-      opt[Int]('i',"thresholdIntensityFilter")
+      opt[Int]('i', "thresholdIntensityFilter")
         .optional()
         .action((x, c) => c.copy(thresholdIntensityFilter = Some(x)))
         .text(s"Keep ions above a x intensity (calculation on start-up time)"),
-      opt[Int]('p',"overrepresentedPeakFilter")
+      opt[Int]('p', "overrepresentedPeakFilter")
         .optional()
         .action((x, c) => c.copy(overrepresentedPeakFilter = x))
         .text(s"filter about over represented peaks. default ${Config().overrepresentedPeakFilter}"),
@@ -56,11 +58,11 @@ object Main extends App {
         .optional()
         .action((x, c) => c.copy(precisionMzh = x))
         .text(s"precision/rounded Mzh (number to the right of the decimal point) . ${Config().precisionMzh}"),
-      opt[Double]('t',"toleranceMz")
+      opt[Double]('t', "toleranceMz")
         .optional()
         .action((x, c) => c.copy(toleranceMz = x))
         .text(s"tolerance accepted. ${Config().toleranceMz}"),
-      opt[File]('o',"outputFile")
+      opt[File]('o', "outputFile")
         .optional()
         .action((x, c) => c.copy(outfile = Some(x)))
         .text(s"output path file."),
@@ -91,12 +93,12 @@ object Main extends App {
     // arguments are bad, error message will have been displayed
   }
 
-  def process(config : Config): Unit = {
+  def process(config: Config): Unit = {
 
     val confJson = config.jsonFamilyMetabolitesDetection match {
       case Some(jsonFilePath) =>
-        val s = Source.fromFile (jsonFilePath)
-        val res = ConfigReader.read(s.getLines ().mkString)
+        val s = Source.fromFile(jsonFilePath)
+        val res = ConfigReader.read(s.getLines().mkString)
         s.close()
         res
       case None =>
@@ -124,57 +126,69 @@ object Main extends App {
               intensityFilter,
               confJson.deltaMp0Mp2(family),
               confJson.numberSulfurMin(family),
+              confJson.minMzCoreStructure(family),
               confJson.neutralLoss(family),
               confJson.daughterIons(family)
             )
         }
-        val f = config.outfile.getOrElse(new File( s"$family.csv"))
+        val f = config.outfile.getOrElse(new File(s"$family.csv"))
         f.delete()
-        CsvMetabolitesIdentificationFile.build(values,family,confJson,f)
+        CsvMetabolitesIdentificationFile.build(values, family, confJson, f)
         println(s"========= check ${f.getPath} ===============")
       })
-    }
+  }
 
 
-    def analyse_metabolite(
-                            config: Config,
-                            source: MZXMLFile,
-                            index: MZXMLIndex,
-                            intensityFilter: Int,
-                            deltaMp0Mp2: Double,
-                            numberSulfurMin: Double,
-                            neutralLoss: Map[String, Double],
-                            daughterIons: Map[String, Double]
-                          ): Seq[CsvMetabolitesIdentification] = {
-
-      val listSulfurMetabolites: Seq[PeakIdentification] =
-        ScanLoader.
-          getScanIdxAndSpectrumM0M2WithDelta(
-            source,
-            index,
-            config.startRT,
-            config.endRT,
-            config.thresholdAbundanceM0Filter,
-            intensityFilter,
-            filteringOnNbSulfur = numberSulfurMin.toInt,
-            config.toleranceMz,
-            deltaMOM2 = deltaMp0Mp2)
-
-      val listSulfurMetabolitesSelected: Seq[PeakIdentification] = //listSulfurMetabolites
-        ScanLoader.keepSimilarMzWithMaxAbundance(listSulfurMetabolites, config.precisionMzh)
-
-      val m: MetaboliteIdentification =
-        ScanLoader.filterOverRepresentedPeak(
+  def analyse_metabolite(
+                          config: Config,
+                          source: MZXMLFile,
+                          index: MZXMLIndex,
+                          intensityFilter: Int,
+                          deltaMp0Mp2: Double,
+                          numberSulfurMin: Double,
+                          mzCoreStructure : Double,
+                          neutralLoss: Map[String, Double],
+                          daughterIons: Map[String, Double]
+                        ): Seq[MetabolitesIdentification] = {
+
+    val listSulfurMetabolites: Seq[PeakIdentification] =
+      ScanLoader.
+        getScanIdxAndSpectrumM0M2WithDelta(
           source,
           index,
           config.startRT,
           config.endRT,
-          listSulfurMetabolitesSelected,
+          config.thresholdAbundanceM0Filter,
           intensityFilter,
-          config.overrepresentedPeakFilter,
-          neutralLoss.toSeq,
-          daughterIons.toSeq
-        )
-      m.getInfos(config.precisionMzh)
-    }
-}
+          filteringOnNbSulfur = numberSulfurMin.toInt,
+          config.toleranceMz,
+          deltaMOM2 = deltaMp0Mp2)
+
+    /* Diagnostics: ion frequencies over the scans of the selected peaks (the computation is currently disabled below). */
+
+    val frequencyOfMz: Seq[(Int, Int)] = Seq() // DaughterIonsDiag.IonsFrequencyOnSelectedScanPeakDetected(source,index,listSulfurMetabolites)
+    println(frequencyOfMz)
+    /* Caution: this is slow... it should perhaps be made optional. */
+    println("\n\n\n==============   Twenty Ions frequency on selected Scan peak detected =========================")
+    println(frequencyOfMz.reverse.slice(1, 20).map {
+      case (mz, freq) => (mz.toString + " m/z -> " + freq)
+    }.mkString(" , "))
+
+    val listSulfurMetabolitesSelected: Seq[PeakIdentification] = // listSulfurMetabolites
+      ScanLoader.keepSimilarMzWithMaxAbundance(listSulfurMetabolites, config.precisionMzh)
+
+    val m: MetaboliteIdentification =
+      ScanLoader.filterOverRepresentedPeak(
+        source,
+        index,
+        config.startRT,
+        config.endRT,
+        listSulfurMetabolitesSelected,
+        intensityFilter,
+        config.overrepresentedPeakFilter,
+        neutralLoss.toSeq,
+        daughterIons.toSeq
+      )
+    m.findDiagnosticIonsAndNeutralLosses(config.precisionMzh,mzCoreStructure)
+  }
+}
diff --git a/src/main/scala/fr/inrae/metabolomics/p2m2/analyzer/Peak.scala b/src/main/scala/fr/inrae/metabolomics/p2m2/analyzer/Peak.scala
diff --git a/src/main/scala/fr/inrae/metabolomics/p2m2/builder/MetaboliteIdentification.scala b/src/main/scala/fr/inrae/metabolomics/p2m2/builder/MetaboliteIdentification.scala
@@ -1,6 +1,6 @@
 package fr.inrae.metabolomics.p2m2.builder
 
-import fr.inrae.metabolomics.p2m2.output.CsvMetabolitesIdentification
+import fr.inrae.metabolomics.p2m2.output.MetabolitesIdentification
 import umich.ms.fileio.filetypes.mzxml.{MZXMLFile, MZXMLIndex}
 
 case class MetaboliteIdentification(
@@ -12,32 +12,48 @@ case class MetaboliteIdentification(
                                      nls : Seq[(String,Double)],
                                      dis : Seq[(String,Double)]
                                    ) {
-  def getInfo( p :PeakIdentification,precisionMzh : Int) : CsvMetabolitesIdentification = {
-    val mz = p.peaks.map(p2 => (p2.mz*precisionMzh ).round / precisionMzh.toDouble )
-    val intensities = p.peaks.map(_.intensity)
-    val abundance = p.peaks.map(_.abundance)
+  def getInfo( p :PeakIdentification,precisionMzh : Int, mzCoreStructure : Double) : Option[MetabolitesIdentification] = p.peaks.nonEmpty match {
+    case true =>
+      val mz = p.peaks.map(p2 => (p2.mz*precisionMzh ).round / precisionMzh.toDouble )
+      val intensities = p.peaks.map(_.intensity)
+      val abundance = p.peaks.map(_.abundance)
 
-    CsvMetabolitesIdentification(
-      mz,
-      intensities,
-      abundance,
-      p.rt,
-      neutralLosses = ScanLoader.detectNeutralLoss(source,index,start,end,p,nls),
-      daughterIons = ScanLoader.detectDaughterIons(source,index,start,end,p,dis)
-    )
+      if ( p.peaks.head.mz >= mzCoreStructure )
+        Some(MetabolitesIdentification(
+          mz,
+          intensities,
+          abundance,
+          p.rt,
+          neutralLosses = ScanLoader.detectNeutralLoss(source,index,start,end,p,nls),
+          daughterIons = ScanLoader.detectDaughterIons(source,index,start,end,p,dis)
+        ))
+      else
+        Some(MetabolitesIdentification(
+          mz,
+          intensities,
+          abundance,
+          p.rt,
+          neutralLosses = Map(),
+          daughterIons = Map()
+        ))
+    case false => None
   }
 
-  def getInfos(precisionMzh : Int): Seq[CsvMetabolitesIdentification] = {
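+  /**
+   * Builds one identification entry per candidate peak, printing progress to stdout.
+   * Candidates without any peak are dropped and the result is sorted by retention time,
+   * then by the first m/z value.
+   *
+   * @param precisionMzh rounding factor for reported m/z values (m/z is multiplied by it, rounded, then divided by it)
+   * @param mzCoreStructure minimum m/z the first peak must reach for neutral losses and daughter ions to be searched;
+   *                        below this threshold both maps are left empty
+   * @return the identification entries, sorted by (rt, first m/z)
+   */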
+  def findDiagnosticIonsAndNeutralLosses(precisionMzh : Int, mzCoreStructure : Double): Seq[MetabolitesIdentification] = {
     println("\n== detectNeutralLoss/detectDaughterIons == ")
 
     peaks.zipWithIndex
-      . map {
+      . flatMap {
      case (x,idx) =>
        print(s"\r===>$idx/${peaks.size}")
-       getInfo(x,precisionMzh)
+       getInfo(x,precisionMzh,mzCoreStructure)
     }
-      /* remove entry if none neutral and none daughters ions detected or big abundance (>60%)*/
-      .filter( csvM => (csvM.neutralLosses.values.flatten.nonEmpty && csvM.daughterIons.values.flatten.nonEmpty) )
       .sortBy( x => (x.rt,x.mz.head) )
   }
 }