Fixed need for intercept (#62)
* Fixed need for intercept and cleaned brew

* Create models for test on-the-fly

* Fixed capitalization
wfondrie authored Jul 18, 2022
1 parent 3a638c0 commit 21680cc
Showing 9 changed files with 53 additions and 34 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.md
@@ -1,7 +1,13 @@
# Changelog for mokapot

## Unreleased
## [0.8.2] - 2022-07-18
### Added
- `mokapot.Model()` objects now record the CV fold that they were fit on.
This means that they can be provided to `mokapot.brew()` in any order
and still maintain proper cross-validation bins.

### Fixed
- Resolved issue where models were required to have an intercept term.
- The PepXML parser would sometimes try to log-transform features containing `0`s, resulting in missing values.

## [0.8.1] - 2022-06-24
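As a rough illustration of the changelog entry above (not part of this commit; the PIN path and variable names are hypothetical), previously trained models can now be handed back to `mokapot.brew()` in any order:

```python
# Sketch only: assumes mokapot >= 0.8.2 and a Percolator INput (PIN)
# file at "psms.pin" (hypothetical path).
import mokapot

psms = mokapot.read_pin("psms.pin")

# First pass: brew() trains one model per cross-validation fold.
results, models = mokapot.brew(psms, test_fdr=0.01)

# Each returned model records the fold it was fit on, so the trained
# models can be passed back in any order and each fold is still scored
# by the model trained on the complementary folds.
rescored, _ = mokapot.brew(psms, model=list(reversed(models)), test_fdr=0.01)
```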
Binary file removed data/models/mokapot.model_fold-1.pkl
Binary file removed data/models/mokapot.model_fold-2.pkl
Binary file removed data/models/mokapot.model_fold-3.pkl
31 changes: 17 additions & 14 deletions mokapot/brew.py
@@ -46,7 +46,7 @@ def brew(psms, model=None, test_fdr=0.01, folds=3, max_workers=1):
machine models used by Percolator. If a list of
:py:class:`mokapot.Model` objects is provided, they are assumed
to be previously trained models, and one will be
used to rescore each fold in the order they are provided.
used to rescore each fold.
test_fdr : float, optional
The false-discovery rate threshold at which to evaluate
the learned models.
@@ -98,8 +98,8 @@ def brew(psms, model=None, test_fdr=0.01, folds=3, max_workers=1):

# If trained models are provided, use them as-is.
try:
models = [[m, False] for m in model if m.is_trained]
assert len(models) == len(model) # Test that all models are fitted.
fitted = [[m, False] for m in model if m.is_trained]
assert len(fitted) == len(model) # Test that all models are fitted.
assert len(model) == folds
except AssertionError as orig_err:
if len(model) != folds:
@@ -114,35 +114,37 @@

raise err from orig_err
except TypeError:
models = Parallel(n_jobs=max_workers, require="sharedmem")(
fitted = Parallel(n_jobs=max_workers, require="sharedmem")(
delayed(_fit_model)(d, copy.deepcopy(model), f)
for f, d in enumerate(train_sets)
)

# sort models to have deterministic results with multithreading.
# Only way I found to sort is using intercept values
models.sort(key=lambda x: x[0].estimator.intercept_)
# Sort models to have deterministic results with multithreading.
fitted.sort(key=lambda x: x[0].fold)
models, resets = list(zip(*fitted))

# Determine if the models need to be reset:
reset = any([m[1] for m in models])
reset = any(resets)

# If we reset, just use the original model on all the folds:
if reset:
# If we reset, just use the original model on all the folds:
scores = [
p._calibrate_scores(model.predict(p), test_fdr) for p in psms
]
elif all([m[0].is_trained for m in models]):
# If we don't reset, assign scores to each fold:
models = [m for m, _ in models]

# If we don't reset, assign scores to each fold:
elif all([m.is_trained for m in models]):
scores = [
_predict(p, i, models, test_fdr) for p, i in zip(psms, test_idx)
]

# If model training has failed
else:
# If model training has failed
scores = [np.zeros(len(p.data)) for p in psms]

# Find which is best: the learned model, the best feature, or
# a pretrained model.
if not all([m.override for m in models]) or not model.override:
if not all([m.override for m in models]):
best_feats = [p._find_best_feature(test_fdr) for p in psms]
feat_total = sum([best_feat[1] for best_feat in best_feats])
else:
@@ -280,6 +282,7 @@ def _fit_model(train_set, model, fold):
"""
LOGGER.info("")
LOGGER.info("=== Analyzing Fold %i ===", fold + 1)
model.fold = fold + 1
reset = False
try:
model.fit(train_set)
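To make the `brew.py` change concrete, here is a small, self-contained illustration (not mokapot code; `FittedModel` is a hypothetical stand-in for a fitted `mokapot.Model`) of why sorting by a recorded fold index works for estimators that, like tree ensembles, have no `intercept_`:

```python
# Illustration of the new deterministic sort used in brew().
from dataclasses import dataclass

from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier


@dataclass
class FittedModel:
    """Hypothetical stand-in for a mokapot.Model fitted on one CV fold."""
    estimator: BaseEstimator
    fold: int


# Parallel workers may return the (model, reset) pairs in any order:
fitted = [
    (FittedModel(RandomForestClassifier(), fold=2), False),
    (FittedModel(RandomForestClassifier(), fold=1), False),
    (FittedModel(RandomForestClassifier(), fold=3), False),
]

# The old key would raise AttributeError, because RandomForestClassifier
# has no intercept_:
# fitted.sort(key=lambda x: x[0].estimator.intercept_)

# The new key relies only on the fold recorded during fitting:
fitted.sort(key=lambda x: x[0].fold)
models, resets = zip(*fitted)
print([m.fold for m in models])  # [1, 2, 3]
```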
7 changes: 1 addition & 6 deletions mokapot/config.py
@@ -17,7 +17,7 @@ def _fill_text(self, text, width, indent):

class Config:
"""
The xenith configuration options.
The mokapot configuration options.
Options can be specified as command-line arguments.
"""
@@ -259,11 +259,6 @@ def _parser():
help=(
"Load previously saved models and skip model training."
"Note that the number of models must match the value of --folds."
"If the models are being applied to the same dataset that they "
"were trained on originally, the models must be provided in the "
"same order as the folds to maintain cross-validation fold "
"relationships. Failure to do so will result in invalid FDR "
"estimates."
),
)

7 changes: 7 additions & 0 deletions mokapot/model.py
@@ -114,6 +114,8 @@ class Model:
The number of PSMs for training.
shuffle : bool
Is the order of PSMs shuffled for training?
fold : int or None
The CV fold on which this model was fit, if any.
"""

def __init__(
@@ -146,6 +148,11 @@ def __init__(
self.override = override
self.shuffle = shuffle

# To keep track of the fold that this was trained on.
# Needed to ensure reproducibility in brew() with
# multiprocessing.
self.fold = None

# Sort out whether we need to optimize hyperparameters:
if hasattr(self.estimator, "estimator"):
self._needs_cv = True
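A minimal sketch of the attribute added to `Model` above (assuming only what this diff shows; `RandomForestClassifier` is used simply because no intercept is required):

```python
# Sketch: Model.fold defaults to None and is assigned (1-based) only when
# brew() fits the model on a specific cross-validation fold.
import mokapot
from sklearn.ensemble import RandomForestClassifier

model = mokapot.Model(RandomForestClassifier(), train_fdr=0.1)
print(model.fold)  # None
```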
15 changes: 4 additions & 11 deletions tests/system_tests/test_cli.py
@@ -34,12 +34,6 @@ def pepxml_file():
return pepxml


@pytest.fixture
def models_path():
"""Get the saved models"""
return sorted(list(Path("data", "models").glob("*")))


def test_basic_cli(tmp_path, scope_files):
"""Test that basic cli works."""
cmd = ["mokapot", scope_files[0], "--dest_dir", tmp_path]
@@ -183,7 +177,7 @@ def test_cli_pepxml(tmp_path, pepxml_file):
assert len(binned) > len(unbinned)


def test_cli_saved_models(tmp_path, phospho_files, models_path):
def test_cli_saved_models(tmp_path, phospho_files):
"""Test that saved_models works"""
cmd = [
"mokapot",
@@ -192,12 +186,11 @@ def test_cli_saved_models(tmp_path, phospho_files, models_path):
tmp_path,
"--test_fdr",
"0.01",
"--load_models",
models_path[0],
models_path[1],
models_path[2],
]

subprocess.run(cmd + ["--save_models"], check=True)

cmd += ["--load_models", *list(Path(tmp_path).glob("*.pkl"))]
subprocess.run(cmd, check=True)
assert Path(tmp_path, "mokapot.psms.txt").exists()
assert Path(tmp_path, "mokapot.peptides.txt").exists()
19 changes: 17 additions & 2 deletions tests/unit_tests/test_brew.py
@@ -2,7 +2,8 @@
import pytest
import numpy as np
import mokapot
from mokapot import PercolatorModel
from mokapot import PercolatorModel, Model
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

@@ -21,6 +22,18 @@ def test_brew_simple(psms, svm):
assert isinstance(models[0], PercolatorModel)


def test_brew_random_forest(psms):
"""Verify there are no dependencies on the SVM."""
rfm = Model(
RandomForestClassifier(),
train_fdr=0.1,
)
results, models = mokapot.brew(psms, model=rfm, test_fdr=0.1)
assert isinstance(results, mokapot.confidence.LinearConfidence)
assert len(models) == 3
assert isinstance(models[0], Model)


def test_brew_joint(psms, svm):
"""Test that the multiple input PSM collections yield multiple out"""
collections = [psms, psms, psms]
@@ -58,8 +71,10 @@ def test_brew_trained_models(psms, svm):
psms, svm, test_fdr=0.05
)
np.random.seed(3)
models = list(models_with_training)
models.reverse() # Change the model order
results_without_training, models_without_training = mokapot.brew(
psms, models_with_training, test_fdr=0.05
psms, models, test_fdr=0.05
)
assert models_with_training == models_without_training
assert results_with_training.accepted == results_without_training.accepted