Fixed need for intercept (#62)
* Fixed need for intercept and cleaned brew

* Create models for test on-the-fly

* Fixed capitalization
wfondrie authored Jul 18, 2022
1 parent 3a638c0 commit 21680cc
Showing 9 changed files with 53 additions and 34 deletions.
8 changes: 7 additions & 1 deletion CHANGELOG.md
@@ -1,7 +1,13 @@
# Changelog for mokapot

## Unreleased
## [0.8.2] - 2022-07-18
### Added
- `mokapot.Model()` objects now record the CV fold that they were fit on.
This means that they can be provided to `mokapot.brew()` in any order
and still maintain proper cross-validation bins.

### Fixed
- Resolved issue where models were required to have an intercept term.
- The PepXML parser would sometimes try to log-transform features containing `0`s, resulting in missing values.

## [0.8.1] - 2022-06-24
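As a rough illustration of the changelog entry above (not part of this commit; the PIN path and variable names are hypothetical), previously trained models can now be handed back to `mokapot.brew()` in any order:

```python
# Sketch only: assumes mokapot >= 0.8.2 and a Percolator INput (PIN)
# file at "psms.pin" (hypothetical path).
import mokapot

psms = mokapot.read_pin("psms.pin")

# First pass: brew() trains one model per cross-validation fold.
results, models = mokapot.brew(psms, test_fdr=0.01)

# Each returned model records the fold it was fit on, so the trained
# models can be passed back in any order and each fold is still scored
# by the model trained on the complementary folds.
rescored, _ = mokapot.brew(psms, model=list(reversed(models)), test_fdr=0.01)
```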
Binary file removed data/models/mokapot.model_fold-1.pkl
Binary file removed data/models/mokapot.model_fold-2.pkl
Binary file removed data/models/mokapot.model_fold-3.pkl
31 changes: 17 additions & 14 deletions mokapot/brew.py
@@ -46,7 +46,7 @@ def brew(psms, model=None, test_fdr=0.01, folds=3, max_workers=1):
machine models used by Percolator. If a list of
:py:class:`mokapot.Model` objects is provided, they are assumed
to be previously trained models, and one will be
used to rescore each fold in the order they are provided.
used to rescore each fold.
test_fdr : float, optional
The false-discovery rate threshold at which to evaluate
the learned models.
@@ -98,8 +98,8 @@ def brew(psms, model=None, test_fdr=0.01, folds=3, max_workers=1):

# If trained models are provided, use them as-is.
try:
models = [[m, False] for m in model if m.is_trained]
assert len(models) == len(model) # Test that all models are fitted.
fitted = [[m, False] for m in model if m.is_trained]
assert len(fitted) == len(model) # Test that all models are fitted.
assert len(model) == folds
except AssertionError as orig_err:
if len(model) != folds:
@@ -114,35 +114,37 @@

raise err from orig_err
except TypeError:
models = Parallel(n_jobs=max_workers, require="sharedmem")(
fitted = Parallel(n_jobs=max_workers, require="sharedmem")(
delayed(_fit_model)(d, copy.deepcopy(model), f)
for f, d in enumerate(train_sets)
)

# sort models to have deterministic results with multithreading.
# Only way I found to sort is using intercept values
models.sort(key=lambda x: x[0].estimator.intercept_)
# Sort models to have deterministic results with multithreading.
fitted.sort(key=lambda x: x[0].fold)
models, resets = list(zip(*fitted))

# Determine if the models need to be reset:
reset = any([m[1] for m in models])
reset = any(resets)

# If we reset, just use the original model on all the folds:
if reset:
# If we reset, just use the original model on all the folds:
scores = [
p._calibrate_scores(model.predict(p), test_fdr) for p in psms
]
elif all([m[0].is_trained for m in models]):
# If we don't reset, assign scores to each fold:
models = [m for m, _ in models]

# If we don't reset, assign scores to each fold:
elif all([m.is_trained for m in models]):
scores = [
_predict(p, i, models, test_fdr) for p, i in zip(psms, test_idx)
]

# If model training has failed
else:
# If model training has failed
scores = [np.zeros(len(p.data)) for p in psms]

# Find which is best: the learned model, the best feature, or
# a pretrained model.
if not all([m.override for m in models]) or not model.override:
if not all([m.override for m in models]):
best_feats = [p._find_best_feature(test_fdr) for p in psms]
feat_total = sum([best_feat[1] for best_feat in best_feats])
else:
@@ -280,6 +282,7 @@ def _fit_model(train_set, model, fold):
"""
LOGGER.info("")
LOGGER.info("=== Analyzing Fold %i ===", fold + 1)
model.fold = fold + 1
reset = False
try:
model.fit(train_set)
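To make the `brew.py` change concrete, here is a small, self-contained illustration (not mokapot code; `FittedModel` is a hypothetical stand-in for a fitted `mokapot.Model`) of why sorting by a recorded fold index works for estimators that, like tree ensembles, have no `intercept_`:

```python
# Illustration of the new deterministic sort used in brew().
from dataclasses import dataclass

from sklearn.base import BaseEstimator
from sklearn.ensemble import RandomForestClassifier


@dataclass
class FittedModel:
    """Hypothetical stand-in for a mokapot.Model fitted on one CV fold."""
    estimator: BaseEstimator
    fold: int


# Parallel workers may return the (model, reset) pairs in any order:
fitted = [
    (FittedModel(RandomForestClassifier(), fold=2), False),
    (FittedModel(RandomForestClassifier(), fold=1), False),
    (FittedModel(RandomForestClassifier(), fold=3), False),
]

# The old key would raise AttributeError, because RandomForestClassifier
# has no intercept_:
# fitted.sort(key=lambda x: x[0].estimator.intercept_)

# The new key relies only on the fold recorded during fitting:
fitted.sort(key=lambda x: x[0].fold)
models, resets = zip(*fitted)
print([m.fold for m in models])  # [1, 2, 3]
```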
7 changes: 1 addition & 6 deletions mokapot/config.py
@@ -17,7 +17,7 @@ def _fill_text(self, text, width, indent):

class Config:
"""
The xenith configuration options.
The mokapot configuration options.
Options can be specified as command-line arguments.
"""
@@ -259,11 +259,6 @@ def _parser():
help=(
"Load previously saved models and skip model training."
"Note that the number of models must match the value of --folds."
"If the models are being applied to the same dataset that they "
"were trained on originally, the models must be provided in the "
"same order as the folds to maintain cross-validation fold "
"relationships. Failure to do so will result in invalid FDR "
"estimates."
),
)

7 changes: 7 additions & 0 deletions mokapot/model.py
@@ -114,6 +114,8 @@ class Model:
The number of PSMs for training.
shuffle : bool
Is the order of PSMs shuffled for training?
fold : int or None
The CV fold on which this model was fit, if any.
"""

def __init__(
@@ -146,6 +148,11 @@ def __init__(
self.override = override
self.shuffle = shuffle

# To keep track of the fold that this was trained on.
# Needed to ensure reproducibility in brew() with
# multiprocessing.
self.fold = None

# Sort out whether we need to optimize hyperparameters:
if hasattr(self.estimator, "estimator"):
self._needs_cv = True
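A minimal sketch of the attribute added to `Model` above (assuming only what this diff shows; `RandomForestClassifier` is used simply because no intercept is required):

```python
# Sketch: Model.fold defaults to None and is assigned (1-based) only when
# brew() fits the model on a specific cross-validation fold.
import mokapot
from sklearn.ensemble import RandomForestClassifier

model = mokapot.Model(RandomForestClassifier(), train_fdr=0.1)
print(model.fold)  # None
```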
15 changes: 4 additions & 11 deletions tests/system_tests/test_cli.py
@@ -34,12 +34,6 @@ def pepxml_file():
return pepxml


@pytest.fixture
def models_path():
"""Get the saved models"""
return sorted(list(Path("data", "models").glob("*")))


def test_basic_cli(tmp_path, scope_files):
"""Test that basic cli works."""
cmd = ["mokapot", scope_files[0], "--dest_dir", tmp_path]
@@ -183,7 +177,7 @@ def test_cli_pepxml(tmp_path, pepxml_file):
assert len(binned) > len(unbinned)


def test_cli_saved_models(tmp_path, phospho_files, models_path):
def test_cli_saved_models(tmp_path, phospho_files):
"""Test that saved_models works"""
cmd = [
"mokapot",
@@ -192,12 +186,11 @@ def test_cli_saved_models(tmp_path, phospho_files, models_path):
tmp_path,
"--test_fdr",
"0.01",
"--load_models",
models_path[0],
models_path[1],
models_path[2],
]

subprocess.run(cmd + ["--save_models"], check=True)

cmd += ["--load_models", *list(Path(tmp_path).glob("*.pkl"))]
subprocess.run(cmd, check=True)
assert Path(tmp_path, "mokapot.psms.txt").exists()
assert Path(tmp_path, "mokapot.peptides.txt").exists()
19 changes: 17 additions & 2 deletions tests/unit_tests/test_brew.py
@@ -2,7 +2,8 @@
import pytest
import numpy as np
import mokapot
from mokapot import PercolatorModel
from mokapot import PercolatorModel, Model
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

@@ -21,6 +22,18 @@ def test_brew_simple(psms, svm):
assert isinstance(models[0], PercolatorModel)


def test_brew_random_forest(psms):
"""Verify there are no dependencies on the SVM."""
rfm = Model(
RandomForestClassifier(),
train_fdr=0.1,
)
results, models = mokapot.brew(psms, model=rfm, test_fdr=0.1)
assert isinstance(results, mokapot.confidence.LinearConfidence)
assert len(models) == 3
assert isinstance(models[0], Model)


def test_brew_joint(psms, svm):
"""Test that the multiple input PSM collections yield multiple out"""
collections = [psms, psms, psms]
@@ -58,8 +71,10 @@ def test_brew_trained_models(psms, svm):
psms, svm, test_fdr=0.05
)
np.random.seed(3)
models = list(models_with_training)
models.reverse() # Change the model order
results_without_training, models_without_training = mokapot.brew(
psms, models_with_training, test_fdr=0.05
psms, models, test_fdr=0.05
)
assert models_with_training == models_without_training
assert results_with_training.accepted == results_without_training.accepted