Skip to content

Commit

Permalink
Prepare release v0.8.0 (#45)
Browse files Browse the repository at this point in the history
  • Loading branch information
wfondrie authored Mar 11, 2022
1 parent 1c76a7e commit 8ef79af
Show file tree
Hide file tree
Showing 9 changed files with 72 additions and 45 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,10 @@ jobs:

steps:
- uses: actions/checkout@v2
- name: Set up Python 3.8
- name: Set up Python
uses: actions/setup-python@v2
with:
python-version: "3.8"
python-version: "3.x"

- name: Install dependencies
run: |
Expand All @@ -38,7 +38,7 @@ jobs:
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Run unit and system tests
run: |
pytest --cov=mokapot tests/
pytest -v --cov=mokapot tests/
- name: Upload coverage to codecov
uses: codecov/codecov-action@v1
with:
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
repos:
- repo: https://github.com/psf/black
rev: 19.10b0 # Replace by any tag/version: https://github.com/psf/black/tags
rev: 22.1.0 # Replace by any tag/version: https://github.com/psf/black/tags
hooks:
- id: black
language_version: python3 # Should be a command that runs python3.6+
16 changes: 16 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,21 @@
# Changelog for mokapot

## [0.8.0] - 2022-03-11

Thanks to @sambenfredj, @gessulat, @tkschmidt, and @MatthewThe for
PR #44, which made these things happen!

### Added
- A new command line argument, `--max_workers`. This allows the
cross-validation folds to be computed in parallel.
- The `PercolatorModel` class now has an `n_jobs` parameter, which
controls parallelization of the grid search.

### Changed
- Improved speed by using multiple jobs for grid search by default.
- Parallelization within `mokapot.brew()` now uses `joblib`
instead of `concurrent.futures`.

## [0.7.4] - 2021-09-03
### Changed
- Improved documentation and added warnings for `--subset_max_train`. Thanks
Expand Down
32 changes: 11 additions & 21 deletions mokapot/brew.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@
"""
import logging
import copy
from concurrent.futures import ProcessPoolExecutor

import pandas as pd
import numpy as np
from joblib import Parallel, delayed

from .model import PercolatorModel

Expand Down Expand Up @@ -47,7 +47,8 @@ def brew(psms, model=None, test_fdr=0.01, folds=3, max_workers=1):
The number of processes to use for model training. More workers
will require more memory, but will typically decrease the total
run time. An integer exceeding the number of folds will have
no additional effect.
no additional effect. Note that logging messages will be garbled
if more than one worker is enabled.
Returns
-------
Expand Down Expand Up @@ -81,25 +82,14 @@ def brew(psms, model=None, test_fdr=0.01, folds=3, max_workers=1):
LOGGER.info("Splitting PSMs into %i folds...", folds)
test_idx = [p._split(folds) for p in psms]
train_sets = _make_train_sets(psms, test_idx)

# Create args for map:
map_args = [
_fit_model,
train_sets,
[copy.deepcopy(model) for _ in range(folds)],
range(folds),
]

# Train models optionally in parallel
with ProcessPoolExecutor(max_workers=max_workers) as prc:
if max_workers == 1:
map_fun = map
else:
map_args[1] = list(map_args[1])
map_args[3] = list(map_args[3])
map_fun = prc.map

models = list(map_fun(*map_args))
if max_workers != 1:
# train_sets can't be a generator for joblib :(
train_sets = list(train_sets)

models = Parallel(n_jobs=max_workers, require="sharedmem")(
delayed(_fit_model)(d, copy.deepcopy(model), f)
for f, d in enumerate(train_sets)
)

# Determine if the models need to be reset:
reset = any([m[1] for m in models])
Expand Down
6 changes: 5 additions & 1 deletion mokapot/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,11 @@ def _parser():
"--max_workers",
default=1,
type=int,
help="The number of processes to use for model training.",
help=(
"The number of processes to use for model training. Note that "
"using more than one worker will result in garbled logging "
"messages."
),
)

parser.add_argument(
Expand Down
26 changes: 12 additions & 14 deletions mokapot/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,17 +127,6 @@ def __init__(
shuffle=True,
):
"""Initialize a Model object"""
if estimator is None:
warnings.warn(
"The estimator will need to be specified in future "
"versions. Use the PercolatorModel class instead.",
DeprecationWarning,
)
svm_model = LinearSVC(dual=False)
estimator = GridSearchCV(
svm_model, param_grid=PERC_GRID, refit=False, cv=3
)

self.estimator = clone(estimator)
self.features = None
self.is_trained = False
Expand Down Expand Up @@ -391,6 +380,8 @@ class PercolatorModel(Model):
shuffle : bool, optional
Should the order of PSMs be randomized for training? For deterministic
algorithms, this will have no effect.
n_jobs : int, optional
The number of jobs used to parallelize the hyperparameter grid search.
Attributes
----------
Expand All @@ -416,8 +407,9 @@ class PercolatorModel(Model):
the model still be used?
subset_max_train : int or None
The number of PSMs for training.
shuffle : bool
Is the order of PSMs shuffled for training?
n_jobs : int
The number of jobs to use for parallelizing the hyperparameter
grid search.
"""

def __init__(
Expand All @@ -428,11 +420,17 @@ def __init__(
direction=None,
override=False,
subset_max_train=None,
n_jobs=-1,
):
"""Initialize a PercolatorModel"""
self.n_jobs = n_jobs
svm_model = LinearSVC(dual=False)
estimator = GridSearchCV(
svm_model, param_grid=PERC_GRID, refit=False, cv=3, n_jobs=-1
svm_model,
param_grid=PERC_GRID,
refit=False,
cv=3,
n_jobs=n_jobs,
)

super().__init__(
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ install_requires =
matplotlib>=3.1.3
lxml>=4.6.2
triqler>=0.6.2
joblib>=1.1.0

[options.extras_require]
docs =
Expand Down
4 changes: 2 additions & 2 deletions tests/unit_tests/test_brew.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pytest
import numpy as np
import mokapot
from mokapot import LinearPsmDataset, PercolatorModel
from mokapot import PercolatorModel

np.random.seed(42)

Expand Down Expand Up @@ -47,4 +47,4 @@ def test_brew_test_fdr_error(psms, svm):
# @pytest.mark.skip(reason="Not currently working, at least on MacOS.")
def test_brew_multiprocess(psms, svm):
"""Test that multiprocessing doesn't yield an error"""
mokapot.brew(psms, svm, test_fdr=0.05, max_workers=3)
mokapot.brew(psms, svm, test_fdr=0.05, max_workers=2)
24 changes: 21 additions & 3 deletions tests/unit_tests/test_model.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
"""Test that models work as expected"""
import pytest
import mokapot
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
Expand Down Expand Up @@ -37,6 +39,8 @@ def test_model_init():
model = mokapot.Model(LogisticRegression())
assert isinstance(model.scaler, StandardScaler)

print(model)


def test_perc_init():
"""Test the initialization of a PercolatorModel"""
Expand Down Expand Up @@ -71,6 +75,14 @@ def test_model_fit(psms):
assert isinstance(model.estimator, LogisticRegression)
assert model.is_trained

no_targets = pd.DataFrame({"targets": [False] * 100})
with pytest.raises(ValueError):
model.fit(no_targets)

no_decoys = pd.DataFrame({"targets": [True] * 100})
with pytest.raises(ValueError):
model.fit(no_decoys)


def test_model_fit_large_subset(psms):
model = mokapot.Model(
Expand Down Expand Up @@ -101,10 +113,8 @@ def test_model_predict(psms):
# The case where a model is trained on a dataset with different features:
psms._data["blah"] = np.random.randn(len(psms))
psms._feature_columns = ("score", "blah")
try:
with pytest.raises(ValueError):
model.predict(psms)
except ValueError:
pass


def test_model_persistance(tmp_path):
Expand All @@ -116,3 +126,11 @@ def test_model_persistance(tmp_path):
loaded = mokapot.load_model(model_file)

assert isinstance(loaded, mokapot.Model)


def test_dummy_scaler():
"""Test the DummyScaler class"""
data = np.random.default_rng(42).normal(0, 1, (20, 10))
scaler = mokapot.model.DummyScaler()
assert (data == scaler.fit_transform(data)).all()
assert (data == scaler.transform(data)).all()

0 comments on commit 8ef79af

Please sign in to comment.