Skip to content

Commit

Permalink
Apply a bit of cleanup to any formula being assigned to a molecule, w…
Browse files Browse the repository at this point in the history
…hile preserving harmless idiosyncrasies:

drop any zero-count atoms (e.g. CH0N => CN)
change things like "S0001" to "S1"
but don't change things like "S1" to "S" or "COOOH" to "CO3H" as they're not strictly wrong, and the existing format may matter to the user
  • Loading branch information
bspratt committed Nov 21, 2022
1 parent a56bd28 commit bbacc77
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 6 deletions.
2 changes: 1 addition & 1 deletion pwiz_tools/Skyline/Model/CustomMolecule.cs
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,7 @@ public string Formula // The molecular formula - may contain isotopes
get { return _formula; }
protected set
{
_formula = value ?? string.Empty;
_formula = BioMassCalc.MONOISOTOPIC.RegularizeFormula(value ?? string.Empty); // e.g. XeC12N1H0 => XeC12N1
var unlabeled = string.IsNullOrEmpty(_formula) ? _formula : BioMassCalc.MONOISOTOPIC.StripLabelsFromFormula(_formula);
UnlabeledFormula = Equals(_formula, unlabeled) ? _formula : unlabeled;
}
Expand Down
6 changes: 6 additions & 0 deletions pwiz_tools/Skyline/Test/AdductTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,12 @@ private void TestException(string formula, string adductText)
private void TestAdductOperators()
{
// Test some underlying formula handling for fanciful user-supplied values
AssertEx.AreEqual("COOOHN", BioMassCalc.MONOISOTOPIC.RegularizeFormula("COOOHNS0"));
AssertEx.AreEqual("XeC12N1", BioMassCalc.MONOISOTOPIC.RegularizeFormula("XeC12N1H0"));
AssertEx.AreEqual("XeC12N1", BioMassCalc.MONOISOTOPIC.RegularizeFormula("XeC12N01H0"));
AssertEx.AreEqual("C'3C2H9NO2O\"2S1", BioMassCalc.MONOISOTOPIC.RegularizeFormula("C'3C2H9H'0NO2O\"2S001"));
var labels = new Dictionary<string, int>(){{"C'",3},{"O\"",2}}; // Find the C'3O"2 in C'3C2H9H'0NO2O"2S (yes, H'0 - seen in the wild - but we drop zero counts)
AssertEx.AreEqual(labels, BioMassCalc.MONOISOTOPIC.FindIsotopeLabelsInFormula("C'3C2H9H'0NO2O\"2S"));
AssertEx.AreEqual(Adduct.SINGLY_PROTONATED, Adduct.FromStringAssumeProtonated("(M+H)+") );
AssertEx.IsTrue(Adduct.FromStringAssumeProtonatedNonProteomic("[M-H2O+H]+").SameEffect(Adduct.FromStringAssumeProtonatedNonProteomic("(M+H)+[-H2O]")));
Assert.IsTrue(Molecule.AreEquivalentFormulas("C10H30Si5O5H-CH4", "C9H27O5Si5"));
Expand Down
17 changes: 17 additions & 0 deletions pwiz_tools/Skyline/TestUtil/AssertEx.cs
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,23 @@ public static void AreEqualDeep<TItem>(IList<TItem> l1, IList<TItem> l2)
}
}

/// <summary>
/// Asserts that two dictionaries agree: the entry counts must match, and every key in
/// <paramref name="expected"/> must be present in <paramref name="actual"/> with an equal value.
/// </summary>
/// <param name="expected">the dictionary holding the anticipated entries</param>
/// <param name="actual">the dictionary under test</param>
/// <param name="message">optional text handed on to each underlying equality assertion</param>
public static void AreEqual<TKey,TValue>(IDictionary<TKey,TValue> expected, IDictionary<TKey, TValue> actual, string message = null)
{
    // With equal counts established, checking the expected keys one way is sufficient
    AreEqual(expected.Count, actual.Count, message);

    foreach (var expectedEntry in expected)
    {
        if (actual.TryGetValue(expectedEntry.Key, out var foundValue))
        {
            AreEqual(expectedEntry.Value, foundValue, message);
            continue;
        }

        // Key is absent from "actual": compare the key's text against null so the mismatch gets reported
        AreEqual(expectedEntry.Key.ToString(), null, message);
    }
}

public static void AreEqual<T>(T expected, T actual, string message = null)
{
if (!Equals(expected, actual))
Expand Down
56 changes: 51 additions & 5 deletions pwiz_tools/Skyline/Util/BioMassCalc.cs
Original file line number Diff line number Diff line change
Expand Up @@ -589,6 +589,53 @@ public TypedMass CalculateMassFromFormula(IDictionary<string, int> desc)
return new TypedMass(totalMass, MassType);
}

/// <summary>
/// Tidy up a formula without reordering or merging its elements:
///  - drop any element whose explicit count is zero, e.g. "H0" (zero hydrogens)
///  - strip leading zeros from explicit counts, e.g. "N001" => "N1"
/// Less offensive idiosyncrasies are preserved: an explicit "N1" is not shortened to "N", and
/// repeated symbols like "COOOH" are not combined into "CO3H".
/// e.g. XeC12N001H0 => XeC12N1, COOOHN1S0 => COOOHN1, COOOHNS0 => COOOHN
/// Only the ASCII digits 0-9 are recognized as a count. A count that does not fit in an int
/// cannot be regularized, so it is passed through exactly as written rather than throwing.
/// </summary>
/// <param name="desc">the formula (may be null or empty)</param>
/// <returns>the tidied up formula; null or empty input is returned unchanged</returns>
public string RegularizeFormula(string desc)
{
    if (string.IsNullOrEmpty(desc))
    {
        return desc;
    }

    desc = desc.Trim();
    var pieces = new List<string>(); // Symbol and count text, in order of appearance
    while (desc.Length > 0)
    {
        var atom = NextSymbol(desc);
        desc = desc.Substring(atom.Length);

        // Looking for an explicit atom count declaration. Deliberately not char.IsDigit(), which
        // also accepts non-ASCII decimal digits (e.g. full-width) that integer parsing rejects.
        var nDigits = 0;
        while (nDigits < desc.Length && desc[nDigits] >= '0' && desc[nDigits] <= '9')
        {
            nDigits++;
        }

        var keepAtom = true;
        var atomCountString = string.Empty; // No explicit count stays that way ("N" does not become "N1")
        if (nDigits > 0) // There was an explicit count declaration
        {
            atomCountString = desc.Substring(0, nDigits);
            // Parsing an all-ASCII-digit run can only fail on overflow - in that case keep the text as written
            if (int.TryParse(atomCountString, NumberStyles.None, CultureInfo.InvariantCulture, out var atomCount))
            {
                keepAtom = atomCount != 0; // Drop "H0"
                atomCountString = atomCount.ToString(CultureInfo.InvariantCulture); // Change "H01" to "H1"
            }
        }

        if (keepAtom)
        {
            pieces.Add(atom);
            pieces.Add(atomCountString);
        }

        desc = desc.Substring(nDigits).TrimStart();
    }

    return string.Concat(pieces);
}

/// <summary>
/// Turn a formula like C5H9H'3NO2S into C5H12NO2S
/// </summary>
Expand Down Expand Up @@ -647,15 +694,15 @@ public string StripLabelsFromFormula(string desc)
}

/// <summary>
/// Find the C'3H'0 in C'3C2H9H'0NO2S (yes, H'0 - seen in the wild)
/// Find the C'3O"2 in C'3C2H9H'0NO2O"2S (yes, H'0 - seen in the wild - but drop zero counts)
/// </summary>
public IDictionary<string, int> FindIsotopeLabelsInFormula(string desc)
{
if (string.IsNullOrEmpty(desc))
return null;
var parse = desc;
var dictAtomCounts = new Dictionary<string, int>();
ParseCounts(ref parse, dictAtomCounts, false, null, true);
ParseCounts(ref parse, dictAtomCounts, false);
return dictAtomCounts.Where(pair => DICT_HEAVYSYMBOL_TO_MONOSYMBOL.ContainsKey(pair.Key)).ToDictionary(p => p.Key, p => p.Value);
}

Expand Down Expand Up @@ -917,8 +964,7 @@ public double ParseMass(IDictionary<string, int> desc)
/// <param name="dictAtomCounts">Dictionary of atomic symbols and counts (may already contain counts from other formulas)</param>
/// <param name="negative">True if counts should be subtracted</param>
/// <param name="atomOrder">If non-null, used to note order of appearance of atomic symbols in formula</param>
/// <param name="retainZeros">If true, keep dictionary entries for atoms that end up with 0 count</param>
public void ParseCounts(ref string desc, IDictionary<string, int> dictAtomCounts, bool negative, IList<string> atomOrder=null, bool retainZeros = false)
public void ParseCounts(ref string desc, IDictionary<string, int> dictAtomCounts, bool negative, IList<string> atomOrder=null)
{
if (string.IsNullOrEmpty(desc))
{
Expand Down Expand Up @@ -969,7 +1015,7 @@ public void ParseCounts(ref string desc, IDictionary<string, int> dictAtomCounts
}
}

if (dictAtomCounts[sym] == 0 && !retainZeros)
if (dictAtomCounts[sym] == 0)
dictAtomCounts.Remove(sym);

desc = desc.Substring(endCount).TrimStart();
Expand Down

0 comments on commit bbacc77

Please sign in to comment.