Add copy to multiprocessing (#106)

* Add copy * Updated tests * Updated changelog
wfondrie · Sep 11, 2023 · 6725bdc · 6725bdc
1 parent 2bc1613
commit 6725bdc
Show file tree

Hide file tree

Showing 7 changed files with 49 additions and 24 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,25 +1,28 @@
 # Changelog for mokapot
 
 ## [Unreleased]
+
+## [v0.10.1] - 2023-09-11
 ### Breaking changes
 - Mokapot now uses `numpy.random.Generator` instead of the deprecated `numpy.random.RandomState` API.
-  New `rng` arguments have been added to functions and classes that rely on randomness in lieu of setting a global random seed with `np.random.seed()`. Thanks @sjust-seerbio!
+  New `rng` arguments have been added to functions and classes that rely on randomness in lieu of setting a global random seed with `np.random.seed()`. Thanks @sjust-seerbio! (#55)
 
 ### Changed
 - Added linting with Ruff to tests and pre-commit hooks (along with others)!
 
 ### Fixed
 - The PepXML reader, which broke due to a Pandas update.
 - Potential bug if lowercase peptide sequences were used and protein-level confidence estimates were enabled
+- Multiprocessing led to the same training set being used for all splits (#104).
 
-## [0.9.1] - 2022-12-14
+## [v0.9.1] - 2022-12-14
 ### Changed
 - Cross-validation classes are now detected by looking for inheritance from the `sklearn.model_selection._search.BaseSearchCV` class.
 
 ### Fixed
 - Fixed backward compatibility issue for Python <3.10.
 
-## [0.9.0] - 2022-12-02
+## [v0.9.0] - 2022-12-02
 ### Added
 - Support for plugins, allowing mokapot to use new models.
 - Added a custom Docker image with optional dependencies.
@@ -31,11 +34,11 @@
 - Updated GitHub Actions.
 - Migrated to a full pyproject.toml setuptools build. Thanks @jspaezp!
 
-## [0.8.3] - 2022-07-20
+## [v0.8.3] - 2022-07-20
 ### Fixed
 - Fixed the reported mokapot score when group FDR is used.
 
-## [0.8.2] - 2022-07-18
+## [v0.8.2] - 2022-07-18
 ### Added
 - `mokapot.Model()` objects now recorded the CV fold that they were fit on.
   This means that they can be provided to `mokapot.brew()` in any order
@@ -45,7 +48,7 @@
 - Resolved issue where models were required to have an intercept term.
 - The PepXML parser would sometimes try and log transform features with `0`'s, resulting in missing values.
 
-## [0.8.1] - 2022-06-24
+## [v0.8.1] - 2022-06-24
 
 ### Added
 - Support for previously trained models in the `brew()` function and the CLI
@@ -56,7 +59,7 @@
   `min_length-1`.
 - Links to example datasets in the documentation.
 
-## [0.8.0] - 2022-03-11
+## [v0.8.0] - 2022-03-11
 
 Thanks to @sambenfredj, @gessulat, @tkschmidt, and @MatthewThe for
 PR #44, which made these things happen!
@@ -72,17 +75,17 @@ PR #44, which made these things happen!
 - Parallelization within `mokapot.brew()` now uses `joblib`
   instead of `concurrent.futures`.
 
-## [0.7.4] - 2021-09-03
+## [v0.7.4] - 2021-09-03
 ### Changed
 - Improved documentation and added warnings for `--subset_max_train`. Thanks
   @jspaezp!
 
-## [0.7.3] - 2021-07-20
+## [v0.7.3] - 2021-07-20
 ### Fixed
 - Fixed bug where the `--keep_decoys` did not work with `--aggregate`. Also,
   added tests to cover this. Thanks @jspaezp!
 
-## [0.7.2] - 2021-07-16
+## [v0.7.2] - 2021-07-16
 ### Added
 - `--keep_decoys` option to the command line interface. Thanks @jspaezp!
 - Notes about setting a random seed to the Python API documentation. (Issue #30)
@@ -96,12 +99,12 @@ PR #44, which made these things happen!
 ### Changed
 - Updates to unit tests. Warnings are now treated as errors for system tests.
 
-## [0.7.1] - 2021-03-22
+## [v0.7.1] - 2021-03-22
 ### Changed
 - Updated the build to align with
   [PEP517](https://www.python.org/dev/peps/pep-0517/)
 
-## [0.7.0] - 2021-03-19
+## [v0.7.0] - 2021-03-19
 ### Added
 - Support for downstream peptide and protein quantitation with
   [FlashLFQ](https://github.com/smith-chem-wisc/FlashLFQ). This is accomplished
@@ -127,23 +130,23 @@ PR #44, which made these things happen!
   `importlib.metadata` to the standard library, saving a few hundred
   milliseconds.
 
-## [0.6.2] - 2021-03-12
+## [v0.6.2] - 2021-03-12
 ### Added
 - Now checks to verify there are no debugging print statements in the code
   base when linting.
 
 ### Fixed
 - Removed debugging print statements.
 
-## [0.6.1] - 2021-03-11
+## [v0.6.1] - 2021-03-11
 ### Fixed
 - Parsing Percolator tab-delimited files with a "DefaultDirection" line.
 - `Label` column is now converted to boolean during PIN file parsing.
   Previously, problems occurred if the `Label` column was of dtype `object`.
 - Parsing modifications from pepXML files were indexed incorrectly on the
   peptide string.
 
-## [0.6.0] - 2021-03-03
+## [v0.6.0] - 2021-03-03
 ### Added
 - Support for parsing PSMs from PepXML input files.
 - This changelog.

diff --git a/mokapot/brew.py b/mokapot/brew.py
@@ -106,9 +106,10 @@ def brew(psms, model=None, test_fdr=0.01, folds=3, max_workers=1, rng=None):
     LOGGER.info("Splitting PSMs into %i folds...", folds)
     test_idx = [p._split(folds) for p in psms]
     train_sets = _make_train_sets(psms, test_idx)
+
     if max_workers != 1:
         # train_sets can't be a generator for joblib :(
-        train_sets = list(train_sets)
+        train_sets = [copy.copy(d) for d in train_sets]
 
     # If trained models are provided, use the them as-is.
     try:

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,12 +1,19 @@
 """
 This file contains fixtures that are used at multiple points in the tests.
 """
+import logging
 import pytest
 import numpy as np
 import pandas as pd
 from mokapot import LinearPsmDataset
 
 
+@pytest.fixture(autouse=True)
+def set_logging(caplog):
+    """Add logging to everything."""
+    caplog.set_level(level=logging.INFO, logger="mokapot")
+
+
 @pytest.fixture(scope="session")
 def psm_df_6():
     """A DataFrame containing 6 PSMs"""
@@ -34,6 +41,9 @@ def psm_df_1000(tmp_path):
         "score": np.concatenate(
             [rng.normal(3, size=200), rng.normal(size=300)]
         ),
+        "score2": np.concatenate(
+            [rng.normal(3, size=200), rng.normal(size=300)]
+        ),
         "filename": "test.mzML",
         "calcmass": rng.uniform(500, 2000, size=500),
         "expmass": rng.uniform(500, 2000, size=500),
@@ -47,6 +57,7 @@ def psm_df_1000(tmp_path):
         "group": rng.choice(2, size=500),
         "peptide": [_random_peptide(5, rng) for _ in range(500)],
         "score": rng.normal(size=500),
+        "score2": rng.normal(size=500),
         "filename": "test.mzML",
         "calcmass": rng.uniform(500, 2000, size=500),
         "expmass": rng.uniform(500, 2000, size=500),
@@ -75,7 +86,7 @@ def psms(psm_df_1000):
         target_column="target",
         spectrum_columns="spectrum",
         peptide_column="peptide",
-        feature_columns="score",
+        feature_columns=["score", "score2"],
         filename_column="filename",
         scan_column="spectrum",
         calcmass_column="calcmass",

diff --git a/tests/unit_tests/test_brew.py b/tests/unit_tests/test_brew.py
@@ -44,7 +44,7 @@ def test_brew_joint(psms, svm):
 
 def test_brew_folds(psms, svm):
     """Test that changing the number of folds works"""
-    results, models = mokapot.brew(psms, svm, test_fdr=0.05, folds=4)
+    results, models = mokapot.brew(psms, svm, test_fdr=0.1, folds=4)
     assert isinstance(results, mokapot.confidence.LinearConfidence)
     assert len(models) == 4
 
@@ -92,7 +92,12 @@ def test_brew_test_fdr_error(psms, svm):
 # @pytest.mark.skip(reason="Not currently working, at least on MacOS.")
 def test_brew_multiprocess(psms, svm):
     """Test that multiprocessing doesn't yield an error"""
-    mokapot.brew(psms, svm, test_fdr=0.05, max_workers=2)
+    _, models = mokapot.brew(psms, svm, test_fdr=0.05, max_workers=2)
+
+    # The models should not be the same:
+    assert_not_close(models[0].estimator.coef_, models[1].estimator.coef_)
+    assert_not_close(models[1].estimator.coef_, models[2].estimator.coef_)
+    assert_not_close(models[2].estimator.coef_, models[0].estimator.coef_)
 
 
 def test_brew_trained_models(psms, svm):
@@ -131,3 +136,8 @@ def test_brew_using_non_trained_models_error(psms, svm):
         "One or more of the provided models was not previously trained"
         in str(err)
     )
+
+
+def assert_not_close(x, y):
+    """Assert that two arrays are not equal"""
+    np.testing.assert_raises(AssertionError, np.testing.assert_allclose, x, y)
diff --git a/tests/unit_tests/test_confidence.py b/tests/unit_tests/test_confidence.py
@@ -28,12 +28,12 @@ def test_one_group(psm_df_1000):
     )
 
     np.random.seed(42)
-    grouped = psms.assign_confidence()
+    grouped = psms.assign_confidence(eval_fdr=0.05)
     scores1 = grouped.group_confidence_estimates[0].psms["mokapot score"]
 
     np.random.seed(42)
     psms._group_column = None
-    ungrouped = psms.assign_confidence()
+    ungrouped = psms.assign_confidence(eval_fdr=0.05)
     scores2 = ungrouped.psms["mokapot score"]
 
     pd.testing.assert_series_equal(scores1, scores2)
@@ -59,7 +59,7 @@ def test_pickle(psm_df_1000, tmp_path):
         copy_data=True,
     )
 
-    results = psms.assign_confidence()
+    results = psms.assign_confidence(eval_fdr=0.05)
     pkl_file = tmp_path / "results.pkl"
     with pkl_file.open("wb+") as pkl_dat:
         pickle.dump(results, pkl_dat)

diff --git a/tests/unit_tests/test_writer_flashlfq.py b/tests/unit_tests/test_writer_flashlfq.py
@@ -8,7 +8,7 @@
 
 def test_sanity(psms, tmp_path):
     """Run simple sanity checks"""
-    conf = psms.assign_confidence()
+    conf = psms.assign_confidence(eval_fdr=0.05)
     test1 = conf.to_flashlfq(tmp_path / "test1.txt")
     mokapot.to_flashlfq(conf, tmp_path / "test2.txt")
     test3 = mokapot.to_flashlfq([conf, conf], tmp_path / "test3.txt")

diff --git a/tests/unit_tests/test_writer_txt.py b/tests/unit_tests/test_writer_txt.py
@@ -8,7 +8,7 @@
 
 def test_sanity(psms, tmp_path):
     """Run simple sanity checks"""
-    conf = psms.assign_confidence()
+    conf = psms.assign_confidence(eval_fdr=0.05)
     test1 = conf.to_txt(dest_dir=tmp_path, file_root="test1")
     mokapot.to_txt(conf, dest_dir=tmp_path, file_root="test2")
     test3 = mokapot.to_txt([conf, conf], dest_dir=tmp_path, file_root="test3")