From 6725bdc6366d735b516ce18af7f1ab51a94b8f0e Mon Sep 17 00:00:00 2001 From: Will Fondrie Date: Mon, 11 Sep 2023 10:45:30 -0700 Subject: [PATCH] Add copy to multiprocessing (#106) * Add copy * Updated tests * Updated changelog --- CHANGELOG.md | 33 +++++++++++++----------- mokapot/brew.py | 3 ++- tests/conftest.py | 13 +++++++++- tests/unit_tests/test_brew.py | 14 ++++++++-- tests/unit_tests/test_confidence.py | 6 ++--- tests/unit_tests/test_writer_flashlfq.py | 2 +- tests/unit_tests/test_writer_txt.py | 2 +- 7 files changed, 49 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a81e5f3..27b7fe2d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,11 @@ # Changelog for mokapot ## [Unreleased] + +## [v0.10.1] - 2023-09-11 ### Breaking changes - Mokapot now uses `numpy.random.Generator` instead of the deprecated `numpy.random.RandomState` API. - New `rng` arguments have been added to functions and classes that rely on randomness in lieu of setting a global random seed with `np.random.seed()`. Thanks @sjust-seerbio! + New `rng` arguments have been added to functions and classes that rely on randomness in lieu of setting a global random seed with `np.random.seed()`. Thanks @sjust-seerbio! (#55) ### Changed - Added linting with Ruff to tests and pre-commit hooks (along with others)! @@ -11,15 +13,16 @@ ### Fixed - The PepXML reader, which broke due to a Pandas update. - Potential bug if lowercase peptide sequences were used and protein-level confidence estimates were enabled +- Multiprocessing led to the same training set being used for all splits (#104). -## [0.9.1] - 2022-12-14 +## [v0.9.1] - 2022-12-14 ### Changed - Cross-validation classes are now detected by looking for inheritance from the `sklearn.model_selection._search.BaseSearchCV` class. ### Fixed - Fixed backward compatibility issue for Python <3.10. -## [0.9.0] - 2022-12-02 +## [v0.9.0] - 2022-12-02 ### Added - Support for plugins, allowing mokapot to use new models. - Added a custom Docker image with optional dependencies. @@ -31,11 +34,11 @@ - Updated GitHub Actions. - Migrated to a full pyproject.toml setuptools build. Thanks @jspaezp! -## [0.8.3] - 2022-07-20 +## [v0.8.3] - 2022-07-20 ### Fixed - Fixed the reported mokapot score when group FDR is used. -## [0.8.2] - 2022-07-18 +## [v0.8.2] - 2022-07-18 ### Added - `mokapot.Model()` objects now recorded the CV fold that they were fit on. This means that they can be provided to `mokapot.brew()` in any order @@ -45,7 +48,7 @@ - Resolved issue where models were required to have an intercept term. - The PepXML parser would sometimes try and log transform features with `0`'s, resulting in missing values. -## [0.8.1] - 2022-06-24 +## [v0.8.1] - 2022-06-24 ### Added - Support for previously trained models in the `brew()` function and the CLI @@ -56,7 +59,7 @@ `min_length-1`. - Links to example datasets in the documentation. -## [0.8.0] - 2022-03-11 +## [v0.8.0] - 2022-03-11 Thanks to @sambenfredj, @gessulat, @tkschmidt, and @MatthewThe for PR #44, which made these things happen! @@ -72,17 +75,17 @@ PR #44, which made these things happen! - Parallelization within `mokapot.brew()` now uses `joblib` instead of `concurrent.futures`. -## [0.7.4] - 2021-09-03 +## [v0.7.4] - 2021-09-03 ### Changed - Improved documentation and added warnings for `--subset_max_train`. Thanks @jspaezp! -## [0.7.3] - 2021-07-20 +## [v0.7.3] - 2021-07-20 ### Fixed - Fixed bug where the `--keep_decoys` did not work with `--aggregate`. Also, added tests to cover this. Thanks @jspaezp! -## [0.7.2] - 2021-07-16 +## [v0.7.2] - 2021-07-16 ### Added - `--keep_decoys` option to the command line interface. Thanks @jspaezp! - Notes about setting a random seed to the Python API documentation. (Issue #30) @@ -96,12 +99,12 @@ PR #44, which made these things happen! ### Changed - Updates to unit tests. Warnings are now treated as errors for system tests. -## [0.7.1] - 2021-03-22 +## [v0.7.1] - 2021-03-22 ### Changed - Updated the build to align with [PEP517](https://www.python.org/dev/peps/pep-0517/) -## [0.7.0] - 2021-03-19 +## [v0.7.0] - 2021-03-19 ### Added - Support for downstream peptide and protein quantitation with [FlashLFQ](https://github.com/smith-chem-wisc/FlashLFQ). This is accomplished @@ -127,7 +130,7 @@ PR #44, which made these things happen! `importlib.metadata` to the standard library, saving a few hundred milliseconds. -## [0.6.2] - 2021-03-12 +## [v0.6.2] - 2021-03-12 ### Added - Now checks to verify there are no debugging print statements in the code base when linting. @@ -135,7 +138,7 @@ PR #44, which made these things happen! ### Fixed - Removed debugging print statements. -## [0.6.1] - 2021-03-11 +## [v0.6.1] - 2021-03-11 ### Fixed - Parsing Percolator tab-delimited files with a "DefaultDirection" line. - `Label` column is now converted to boolean during PIN file parsing. @@ -143,7 +146,7 @@ PR #44, which made these things happen! - Parsing modifications from pepXML files were indexed incorrectly on the peptide string. -## [0.6.0] - 2021-03-03 +## [v0.6.0] - 2021-03-03 ### Added - Support for parsing PSMs from PepXML input files. - This changelog. diff --git a/mokapot/brew.py b/mokapot/brew.py index 86e06b30..c2c6ea9c 100644 --- a/mokapot/brew.py +++ b/mokapot/brew.py @@ -106,9 +106,10 @@ def brew(psms, model=None, test_fdr=0.01, folds=3, max_workers=1, rng=None): LOGGER.info("Splitting PSMs into %i folds...", folds) test_idx = [p._split(folds) for p in psms] train_sets = _make_train_sets(psms, test_idx) + if max_workers != 1: # train_sets can't be a generator for joblib :( - train_sets = list(train_sets) + train_sets = [copy.copy(d) for d in train_sets] # If trained models are provided, use the them as-is. try: diff --git a/tests/conftest.py b/tests/conftest.py index c490a617..481d94e4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,12 +1,19 @@ """ This file contains fixtures that are used at multiple points in the tests. """ +import logging import pytest import numpy as np import pandas as pd from mokapot import LinearPsmDataset +@pytest.fixture(autouse=True) +def set_logging(caplog): + """Add logging to everything.""" + caplog.set_level(level=logging.INFO, logger="mokapot") + + @pytest.fixture(scope="session") def psm_df_6(): """A DataFrame containing 6 PSMs""" @@ -34,6 +41,9 @@ def psm_df_1000(tmp_path): "score": np.concatenate( [rng.normal(3, size=200), rng.normal(size=300)] ), + "score2": np.concatenate( + [rng.normal(3, size=200), rng.normal(size=300)] + ), "filename": "test.mzML", "calcmass": rng.uniform(500, 2000, size=500), "expmass": rng.uniform(500, 2000, size=500), @@ -47,6 +57,7 @@ def psm_df_1000(tmp_path): "group": rng.choice(2, size=500), "peptide": [_random_peptide(5, rng) for _ in range(500)], "score": rng.normal(size=500), + "score2": rng.normal(size=500), "filename": "test.mzML", "calcmass": rng.uniform(500, 2000, size=500), "expmass": rng.uniform(500, 2000, size=500), @@ -75,7 +86,7 @@ def psms(psm_df_1000): target_column="target", spectrum_columns="spectrum", peptide_column="peptide", - feature_columns="score", + feature_columns=["score", "score2"], filename_column="filename", scan_column="spectrum", calcmass_column="calcmass", diff --git a/tests/unit_tests/test_brew.py b/tests/unit_tests/test_brew.py index 319626b1..27d04951 100644 --- a/tests/unit_tests/test_brew.py +++ b/tests/unit_tests/test_brew.py @@ -44,7 +44,7 @@ def test_brew_joint(psms, svm): def test_brew_folds(psms, svm): """Test that changing the number of folds works""" - results, models = mokapot.brew(psms, svm, test_fdr=0.05, folds=4) + results, models = mokapot.brew(psms, svm, test_fdr=0.1, folds=4) assert isinstance(results, mokapot.confidence.LinearConfidence) assert len(models) == 4 @@ -92,7 +92,12 @@ def test_brew_test_fdr_error(psms, svm): # @pytest.mark.skip(reason="Not currently working, at least on MacOS.") def test_brew_multiprocess(psms, svm): """Test that multiprocessing doesn't yield an error""" - mokapot.brew(psms, svm, test_fdr=0.05, max_workers=2) + _, models = mokapot.brew(psms, svm, test_fdr=0.05, max_workers=2) + + # The models should not be the same: + assert_not_close(models[0].estimator.coef_, models[1].estimator.coef_) + assert_not_close(models[1].estimator.coef_, models[2].estimator.coef_) + assert_not_close(models[2].estimator.coef_, models[0].estimator.coef_) def test_brew_trained_models(psms, svm): @@ -131,3 +136,8 @@ def test_brew_using_non_trained_models_error(psms, svm): "One or more of the provided models was not previously trained" in str(err) ) + + +def assert_not_close(x, y): + """Assert that two arrays are not equal""" + np.testing.assert_raises(AssertionError, np.testing.assert_allclose, x, y) diff --git a/tests/unit_tests/test_confidence.py b/tests/unit_tests/test_confidence.py index 34994f07..06f2d014 100644 --- a/tests/unit_tests/test_confidence.py +++ b/tests/unit_tests/test_confidence.py @@ -28,12 +28,12 @@ def test_one_group(psm_df_1000): ) np.random.seed(42) - grouped = psms.assign_confidence() + grouped = psms.assign_confidence(eval_fdr=0.05) scores1 = grouped.group_confidence_estimates[0].psms["mokapot score"] np.random.seed(42) psms._group_column = None - ungrouped = psms.assign_confidence() + ungrouped = psms.assign_confidence(eval_fdr=0.05) scores2 = ungrouped.psms["mokapot score"] pd.testing.assert_series_equal(scores1, scores2) @@ -59,7 +59,7 @@ def test_pickle(psm_df_1000, tmp_path): copy_data=True, ) - results = psms.assign_confidence() + results = psms.assign_confidence(eval_fdr=0.05) pkl_file = tmp_path / "results.pkl" with pkl_file.open("wb+") as pkl_dat: pickle.dump(results, pkl_dat) diff --git a/tests/unit_tests/test_writer_flashlfq.py b/tests/unit_tests/test_writer_flashlfq.py index 9aba9d70..8b468a18 100644 --- a/tests/unit_tests/test_writer_flashlfq.py +++ b/tests/unit_tests/test_writer_flashlfq.py @@ -8,7 +8,7 @@ def test_sanity(psms, tmp_path): """Run simple sanity checks""" - conf = psms.assign_confidence() + conf = psms.assign_confidence(eval_fdr=0.05) test1 = conf.to_flashlfq(tmp_path / "test1.txt") mokapot.to_flashlfq(conf, tmp_path / "test2.txt") test3 = mokapot.to_flashlfq([conf, conf], tmp_path / "test3.txt") diff --git a/tests/unit_tests/test_writer_txt.py b/tests/unit_tests/test_writer_txt.py index fea7f19a..326cbae2 100644 --- a/tests/unit_tests/test_writer_txt.py +++ b/tests/unit_tests/test_writer_txt.py @@ -8,7 +8,7 @@ def test_sanity(psms, tmp_path): """Run simple sanity checks""" - conf = psms.assign_confidence() + conf = psms.assign_confidence(eval_fdr=0.05) test1 = conf.to_txt(dest_dir=tmp_path, file_root="test1") mokapot.to_txt(conf, dest_dir=tmp_path, file_root="test2") test3 = mokapot.to_txt([conf, conf], dest_dir=tmp_path, file_root="test3")