Merge pull request #50 from rmcgibbo/cv-averaging
Mixtape CV fold averaging
rmcgibbo committed Nov 14, 2014
2 parents a0382e0 + 5f425be commit 2c53f9e
Showing 6 changed files with 154 additions and 26 deletions.
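
In brief: fit_and_score_estimator now keeps the per-fold train and test scores together with per-fold sample counts, and (when iid=True) averages the fold scores weighted by those counts. This matters for Mixtape estimators, whose cross-validation folds can contain different numbers of frames. A minimal sketch of the averaging rule (standalone Python; the fold scores and sizes are invented for illustration):

import numpy as np

# Hypothetical per-fold results: two folds of unequal size.
test_scores = [0.90, 0.70]
n_test_samples = [10, 30]

# iid=True: each fold's score counts in proportion to its size.
print(np.average(test_scores, weights=n_test_samples))  # 0.75

# iid=False: plain unweighted mean over the folds.
print(np.average(test_scores))  # 0.8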
4 changes: 4 additions & 0 deletions osprey/execute_worker.py
@@ -118,8 +118,12 @@ def run_single_trial(estimator, params, trial_id, scoring, X, y, cv,
     with sessionbuilder() as session:
         trial = session.query(Trial).get(trial_id)
         trial.mean_test_score = score['mean_test_score']
+        trial.mean_train_score = score['mean_train_score']
         trial.test_scores = score['test_scores']
         trial.train_scores = score['train_scores']
+        trial.n_test_samples = score['n_test_samples']
+        trial.n_train_samples = score['n_train_samples']
+
         trial.status = 'SUCCEEDED'
         best_so_far = session.query(
             func.max(Trial.mean_test_score)).first()
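For context, the score dict consumed above is the grid_scores dict returned by fit_and_score_estimator (next file). For a hypothetical two-fold run it would look roughly like this; the values are illustrative, only the keys come from the diff:

score = {
    'mean_test_score': 0.75,
    'mean_train_score': 0.88,
    'test_scores': [0.70, 0.80],
    'train_scores': [0.90, 0.86],
    'n_test_samples': [10, 11],
    'n_train_samples': [11, 10],
}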
128 changes: 107 additions & 21 deletions osprey/fit_estimator.py
@@ -1,13 +1,20 @@
 from __future__ import print_function, absolute_import, division

+import time
 from distutils.version import LooseVersion

 import numpy as np
 import sklearn
+from sklearn.pipeline import Pipeline
 from sklearn.base import is_classifier, clone
-from sklearn.cross_validation import _fit_and_score
 from sklearn.metrics.scorer import check_scoring
-from sklearn.utils.validation import _num_samples, check_arrays
+from sklearn.utils.validation import check_arrays
 from sklearn.externals.joblib import Parallel, delayed
-from sklearn.cross_validation import _check_cv as check_cv
+from sklearn.cross_validation import _check_cv as check_cv, _safe_split, _score
+
+from .utils import short_format_time
+from .eval_scopes import import_all_estimators


 if LooseVersion(sklearn.__version__) < LooseVersion('0.15.0'):
     raise ImportError('Please upgrade to the latest version of scikit-learn')
@@ -51,29 +58,108 @@ def fit_and_score_estimator(estimator, parameters, cv, X, y=None, scoring=None,
     )(
         delayed(_fit_and_score)(clone(estimator), X, y, scorer,
                                 train, test, verbose, parameters,
-                                fit_params=None, return_train_score=True,
-                                return_parameters=True)
+                                fit_params=None)
         for train, test in cv)

     assert len(out) == len(cv)

-    grid_scores = list()
-    n_test_samples = 0
-    score = 0
-    all_train_scores = []
-    all_test_scores = []
-    for train_score, test_score, this_n_test_samples, _, _ in out:
-        all_train_scores.append(train_score)
-        all_test_scores.append(test_score)
-        if iid:
-            test_score *= this_n_test_samples
-            n_test_samples += this_n_test_samples
-        score += test_score
+    train_scores, test_scores = [], []
+    n_train_samples, n_test_samples = [], []
+    for test_score, n_test, train_score, n_train, _ in out:
+        train_scores.append(train_score)
+        test_scores.append(test_score)
+        n_test_samples.append(n_test)
+        n_train_samples.append(n_train)

     if iid:
-        score /= float(n_test_samples)
+        if verbose > 0 and _is_mixtape_estimator(estimator):
+            print('[CV] Using Mixtape API n_samples averaging')
+            print('[CV] n_train_samples: %s' % str(n_train_samples))
+            print('[CV] n_test_samples: %s' % str(n_test_samples))
+        mean_test_score = np.average(test_scores, weights=n_test_samples)
+        mean_train_score = np.average(train_scores, weights=n_train_samples)
     else:
-        score /= len(cv)
+        mean_test_score = np.average(test_scores)
+        mean_train_score = np.average(train_scores)

-    grid_scores = {'mean_test_score': score, 'train_scores': all_train_scores,
-                   'test_scores': all_test_scores}
+    grid_scores = {
+        'mean_test_score': mean_test_score, 'test_scores': test_scores,
+        'mean_train_score': mean_train_score, 'train_scores': train_scores,
+        'n_test_samples': n_test_samples, 'n_train_samples': n_train_samples}
     return grid_scores


+def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
+                   fit_params=None):
+    if verbose > 1:
+        if parameters is None:
+            msg = "no parameters to be set"
+        else:
+            msg = '%s' % (', '.join('%s=%s' % (k, v)
+                          for k, v in parameters.items()))
+        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))
+
+    # adjust length of sample weights
+    n_samples = _num_samples(X)
+    fit_params = fit_params if fit_params is not None else {}
+    fit_params = dict([(k, np.asarray(v)[train]
+                        if hasattr(v, '__len__') and len(v) == n_samples else v)
+                       for k, v in fit_params.items()])
+
+    if parameters is not None:
+        estimator.set_params(**parameters)
+
+    # fit and score
+    start_time = time.time()
+
+    X_train, y_train = _safe_split(estimator, X, y, train)
+    X_test, y_test = _safe_split(estimator, X, y, test, train)
+    if y_train is None:
+        estimator.fit(X_train, **fit_params)
+    else:
+        estimator.fit(X_train, y_train, **fit_params)
+    test_score = _score(estimator, X_test, y_test, scorer)
+    train_score = _score(estimator, X_train, y_train, scorer)
+
+    scoring_time = time.time() - start_time
+
+    mixtape_api = _is_mixtape_estimator(estimator)
+    n_samples_test = _num_samples(X_test, mixtape_api=mixtape_api)
+    n_samples_train = _num_samples(X_train, mixtape_api=mixtape_api)
+    if verbose > 2:
+        msg += ", score=%f" % test_score
+    if verbose > 1:
+        end_msg = "%s -%s" % (msg, short_format_time(scoring_time))
+        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))
+
+    return (test_score, n_samples_test, train_score, n_samples_train,
+            scoring_time)
+
+
+def _num_samples(x, mixtape_api=False):
+    """Return number of samples in array-like x."""
+    if not hasattr(x, '__len__') and not hasattr(x, 'shape'):
+        if hasattr(x, '__array__'):
+            x = np.asarray(x)
+        else:
+            raise TypeError("Expected sequence or array-like, got %r" % x)
+
+    if mixtape_api:
+        assert isinstance(x, list)
+        return sum(len(xx) for xx in x)
+
+    return x.shape[0] if hasattr(x, 'shape') else len(x)
+
+
+def _is_mixtape_estimator(estimator):
+    try:
+        import mixtape
+    except ImportError:
+        return False
+
+    mixtape_estimators = import_all_estimators(mixtape).values()
+    out = estimator.__class__ in mixtape_estimators
+    if isinstance(estimator, Pipeline):
+        out = any(step.__class__ in mixtape_estimators
+                  for name, step in estimator.steps)
+    return out
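
The Mixtape-aware piece is _num_samples: Mixtape estimators consume a list of sequences (e.g. trajectories), so the number of samples is the total frame count across sequences rather than the length of the outer list. A small standalone sketch of the distinction, mirroring the mixtape_api branch above:

import numpy as np

# Two sequences of 10 and 11 frames, as a Mixtape estimator would see them.
X = [np.random.randint(2, size=10), np.random.randint(2, size=11)]

assert sum(len(xx) for xx in X) == 21  # mixtape_api=True: total frames
assert len(X) == 2                     # plain len(): number of sequences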
20 changes: 18 additions & 2 deletions osprey/tests/test_fit_estimator.py
@@ -1,6 +1,7 @@
 from __future__ import print_function, absolute_import, division

 import numpy as np
+from nose.plugins.skip import SkipTest
 from six import iteritems
 from sklearn.datasets import make_regression
 from sklearn.linear_model import Lasso
@@ -14,12 +15,27 @@ def test_1():

     lasso = Lasso()
     params = {'alpha': 2}
-    cv = 5
+    cv = 6
     out = fit_and_score_estimator(lasso, params, cv=cv, X=X, y=y, verbose=0)

     param_grid = dict((k, [v]) for k, v in iteritems(params))
     g = GridSearchCV(estimator=lasso, param_grid=param_grid, cv=cv)
     g.fit(X, y)

-    assert out['mean_test_score'] == g.grid_scores_[0].mean_validation_score
+    np.testing.assert_almost_equal(
+        out['mean_test_score'], g.grid_scores_[0].mean_validation_score)
+
+    assert np.all(out['test_scores'] == g.grid_scores_[0].cv_validation_scores)
+
+
+def test_2():
+    try:
+        from mixtape.markovstatemodel import MarkovStateModel
+    except ImportError as e:
+        raise SkipTest(e)
+
+    X = [np.random.randint(2, size=10), np.random.randint(2, size=11)]
+    out = fit_and_score_estimator(
+        MarkovStateModel(), {'verbose': False}, cv=2, X=X, y=None, verbose=0)
+    np.testing.assert_array_equal(out['n_train_samples'], [11, 10])
+    np.testing.assert_array_equal(out['n_test_samples'], [10, 11])
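
test_2 checks the fold bookkeeping directly: with cv=2 over two sequences, each fold trains on one trajectory and tests on the other, so n_train_samples comes out as [11, 10] and n_test_samples as [10, 11]. A sketch of the underlying split, assuming the same sklearn.cross_validation API the diff imports:

from sklearn.cross_validation import KFold

# KFold splits the outer list of sequences, not the frames inside them.
for train, test in KFold(2, n_folds=2):
    print(train, test)  # [1] [0] on the first fold, then [0] [1]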
4 changes: 2 additions & 2 deletions osprey/tests/test_trials.py
@@ -20,10 +20,10 @@ def test_1():
         con = sqlite3.connect('db')
         table_names = con.execute("SELECT name FROM sqlite_master "
                                   "WHERE type='table'").fetchone()
-        assert table_names == (u'trials_v2',)
+        assert table_names == (u'trials_v3',)

         table_names = con.execute(
-            "SELECT project_name FROM trials_v2").fetchone()
+            "SELECT project_name FROM trials_v3").fetchone()
         assert table_names == (u'abc123',)

     finally:
5 changes: 4 additions & 1 deletion osprey/trials.py
@@ -33,7 +33,7 @@ def process_result_value(self, value, dialect):


 class Trial(Base):
-    __tablename__ = 'trials_v2'
+    __tablename__ = 'trials_v3'
     default_project_name = None

     id = Column(Integer, primary_key=True)
@@ -42,8 +42,11 @@ class Trial(Base):
     parameters = Column(JSONEncoded())

     mean_test_score = Column(Float)
+    mean_train_score = Column(Float)
     train_scores = Column(JSONEncoded())
     test_scores = Column(JSONEncoded())
+    n_train_samples = Column(JSONEncoded())
+    n_test_samples = Column(JSONEncoded())

     started = Column(DateTime())
     completed = Column(DateTime())
19 changes: 19 additions & 0 deletions osprey/utils.py
@@ -96,6 +96,25 @@ def current_pretty_time():
     return datetime.now().strftime("%B %d, %Y %l:%M %p")


+def _squeeze_time(t):
+    """Remove .1s to the time under Windows: this is the time it take to
+    stat files. This is needed to make results similar to timings under
+    Unix, for tests
+    """
+    if sys.platform.startswith('win'):
+        return max(0, t - .1)
+    else:
+        return t
+
+
+def short_format_time(t):
+    t = _squeeze_time(t)
+    if t > 60:
+        return "%4.1fmin" % (t / 60.)
+    else:
+        return " %5.1fs" % (t)
+
+
 def mock_module(name):

     class MockModule(object):
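The two helpers above appear to be ported from joblib's logger module; short_format_time renders elapsed seconds compactly for the [CV] log lines in fit_estimator.py. A quick sanity check of the formatting, assuming a non-Windows platform (where _squeeze_time is a no-op):

from osprey.utils import short_format_time

print(short_format_time(2.5))    # '   2.5s'
print(short_format_time(150.0))  # ' 2.5min'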
