From bbacc77ab95a32384665969b0b186c6f908258fa Mon Sep 17 00:00:00 2001 From: Brian Pratt Date: Thu, 17 Nov 2022 15:36:56 -0800 Subject: [PATCH] Apply a bit of cleanup to any formula being assigned to a molecule, while preserving harmless idiosyncrasies: drop any zero-count atoms (e.g. CH0N => CN); change things like "S0001" to "S1" but don't change things like "S1" to "S" or "COOOH" to "CO3H" as they're not strictly wrong, and the existing format may matter to the user --- pwiz_tools/Skyline/Model/CustomMolecule.cs | 2 +- pwiz_tools/Skyline/Test/AdductTest.cs | 6 +++ pwiz_tools/Skyline/TestUtil/AssertEx.cs | 17 +++++++ pwiz_tools/Skyline/Util/BioMassCalc.cs | 56 ++++++++++++++++++++-- 4 files changed, 75 insertions(+), 6 deletions(-) diff --git a/pwiz_tools/Skyline/Model/CustomMolecule.cs b/pwiz_tools/Skyline/Model/CustomMolecule.cs index 1767675503..d1f5daa699 100644 --- a/pwiz_tools/Skyline/Model/CustomMolecule.cs +++ b/pwiz_tools/Skyline/Model/CustomMolecule.cs @@ -459,7 +459,7 @@ public string Formula // The molecular formula - may contain isotopes get { return _formula; } protected set { - _formula = value ?? string.Empty; + _formula = BioMassCalc.MONOISOTOPIC.RegularizeFormula(value ?? string.Empty); // e.g. XeC12N1H0 => XeC12N1 var unlabeled = string.IsNullOrEmpty(_formula) ? _formula : BioMassCalc.MONOISOTOPIC.StripLabelsFromFormula(_formula); UnlabeledFormula = Equals(_formula, unlabeled) ? 
_formula : unlabeled; } diff --git a/pwiz_tools/Skyline/Test/AdductTest.cs b/pwiz_tools/Skyline/Test/AdductTest.cs index 33244a6ea0..5cb4668d6b 100644 --- a/pwiz_tools/Skyline/Test/AdductTest.cs +++ b/pwiz_tools/Skyline/Test/AdductTest.cs @@ -85,6 +85,12 @@ private void TestException(string formula, string adductText) private void TestAdductOperators() { // Test some underlying formula handling for fanciful user-supplied values + AssertEx.AreEqual("COOOHN", BioMassCalc.MONOISOTOPIC.RegularizeFormula("COOOHNS0")); + AssertEx.AreEqual("XeC12N1", BioMassCalc.MONOISOTOPIC.RegularizeFormula("XeC12N1H0")); + AssertEx.AreEqual("XeC12N1", BioMassCalc.MONOISOTOPIC.RegularizeFormula("XeC12N01H0")); + AssertEx.AreEqual("C'3C2H9NO2O\"2S1", BioMassCalc.MONOISOTOPIC.RegularizeFormula("C'3C2H9H'0NO2O\"2S001")); + var labels = new Dictionary(){{"C'",3},{"O\"",2}}; // Find the C'3O"2 in C'3C2H9H'0NO2O"2S (yes, H'0 - seen in the wild - but we drop zero counts) + AssertEx.AreEqual(labels, BioMassCalc.MONOISOTOPIC.FindIsotopeLabelsInFormula("C'3C2H9H'0NO2O\"2S")); AssertEx.AreEqual(Adduct.SINGLY_PROTONATED, Adduct.FromStringAssumeProtonated("(M+H)+") ); AssertEx.IsTrue(Adduct.FromStringAssumeProtonatedNonProteomic("[M-H2O+H]+").SameEffect(Adduct.FromStringAssumeProtonatedNonProteomic("(M+H)+[-H2O]"))); Assert.IsTrue(Molecule.AreEquivalentFormulas("C10H30Si5O5H-CH4", "C9H27O5Si5")); diff --git a/pwiz_tools/Skyline/TestUtil/AssertEx.cs b/pwiz_tools/Skyline/TestUtil/AssertEx.cs index f6bc57ab4d..d95e01bf47 100644 --- a/pwiz_tools/Skyline/TestUtil/AssertEx.cs +++ b/pwiz_tools/Skyline/TestUtil/AssertEx.cs @@ -61,6 +61,23 @@ public static void AreEqualDeep(IList l1, IList l2) } } + public static void AreEqual(IDictionary expected, IDictionary actual, string message = null) + { + AreEqual(expected.Count, actual.Count, message); + + foreach (var keyValuePairExpected in expected) + { + if (!actual.TryGetValue(keyValuePairExpected.Key, out var valueActual )) + { + 
AreEqual(keyValuePairExpected.Key.ToString(), null, message); + } + else + { + AreEqual(keyValuePairExpected.Value, valueActual, message); + } + } + } + public static void AreEqual(T expected, T actual, string message = null) { if (!Equals(expected, actual)) diff --git a/pwiz_tools/Skyline/Util/BioMassCalc.cs b/pwiz_tools/Skyline/Util/BioMassCalc.cs index 7cc4edb312..0945cf0246 100644 --- a/pwiz_tools/Skyline/Util/BioMassCalc.cs +++ b/pwiz_tools/Skyline/Util/BioMassCalc.cs @@ -589,6 +589,53 @@ public TypedMass CalculateMassFromFormula(IDictionary desc) return new TypedMass(totalMass, MassType); } + /// + /// Remove, if necessary, any weirdness like "H0" (zero hydrogens) in a formula, preserving other less offensive idiosyncrasies like N1 instead of N + /// e.g. XeC12N001H0 => XeC12N1 + /// Preserve things like COOOH instead of making it CO3H, e.g. COOOHN1S0 => COOOHN1 or COOOHNS0 => COOOHN + /// + /// the formula + /// the tidied up formula + public string RegularizeFormula(string desc) + { + if (string.IsNullOrEmpty(desc)) + { + return desc; + } + desc = desc.Trim(); + var atomCounts = new List>(); + while (desc.Length > 0) + { + var atom = NextSymbol(desc); + desc = desc.Substring(atom.Length); + + var nDigits = 0; // Looking for an explicit atom count declaration + while (nDigits < desc.Length && char.IsDigit(desc[nDigits])) + { + nDigits++; + } + + var atomCount = 1; + var atomCountString = ""; + if (nDigits > 0) // There was an explicit count declaration + { + atomCount = int.Parse(desc.Substring(0, nDigits), CultureInfo.InvariantCulture); + atomCountString = atomCount.ToString(CultureInfo.InvariantCulture); // Change "H01" to "H1" + } + + if (atomCount != 0) // Drop "H0" + { + atomCounts.Add(new KeyValuePair (atom, atomCountString)); + } + + desc = desc.Substring(nDigits).TrimStart(); + } + + return string.Concat(atomCounts.Select(atomCount => + $@"{atomCount.Key}{atomCount.Value}")); + + } + /// /// Turn a formula like C5H9H'3NO2S into C5H12NO2S /// @@ 
-647,7 +694,7 @@ public string StripLabelsFromFormula(string desc) } /// - /// Find the C'3H'0 in C'3C2H9H'0NO2S (yes, H'0 - seen in the wild) + /// Find the C'3O"2 in C'3C2H9H'0NO2O"2S (yes, H'0 - seen in the wild - but drop zero counts) /// public IDictionary FindIsotopeLabelsInFormula(string desc) { @@ -655,7 +702,7 @@ public IDictionary FindIsotopeLabelsInFormula(string desc) return null; var parse = desc; var dictAtomCounts = new Dictionary(); - ParseCounts(ref parse, dictAtomCounts, false, null, true); + ParseCounts(ref parse, dictAtomCounts, false); return dictAtomCounts.Where(pair => DICT_HEAVYSYMBOL_TO_MONOSYMBOL.ContainsKey(pair.Key)).ToDictionary(p => p.Key, p => p.Value); } @@ -917,8 +964,7 @@ public double ParseMass(IDictionary desc) /// Dictionary of atomic symbols and counts (may already contain counts from other formulas) /// True if counts should be subtracted /// If non-null, used to note order of appearance of atomic symbols in formula - /// If true, keep dictionary entries for atoms that end up with 0 count - public void ParseCounts(ref string desc, IDictionary dictAtomCounts, bool negative, IList atomOrder=null, bool retainZeros = false) + public void ParseCounts(ref string desc, IDictionary dictAtomCounts, bool negative, IList atomOrder=null) { if (string.IsNullOrEmpty(desc)) { @@ -969,7 +1015,7 @@ public void ParseCounts(ref string desc, IDictionary dictAtomCounts } } - if (dictAtomCounts[sym] == 0 && !retainZeros) + if (dictAtomCounts[sym] == 0) dictAtomCounts.Remove(sym); desc = desc.Substring(endCount).TrimStart();