Merge pull request #50 from rmcgibbo/cv-averaging
Mixtape CV fold averaging
rmcgibbo committed Nov 14, 2014
2 parents a0382e0 + 5f425be commit 2c53f9e
Showing 6 changed files with 154 additions and 26 deletions.
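
In brief: fit_and_score_estimator now keeps the per-fold train and test scores together with per-fold sample counts, and (when iid=True) averages the fold scores weighted by those counts. This matters for Mixtape estimators, whose cross-validation folds can contain different numbers of frames. A minimal sketch of the averaging rule (standalone Python; the fold scores and sizes are invented for illustration):

import numpy as np

# Hypothetical per-fold results: two folds of unequal size.
test_scores = [0.90, 0.70]
n_test_samples = [10, 30]

# iid=True: each fold's score counts in proportion to its size.
print(np.average(test_scores, weights=n_test_samples))  # 0.75

# iid=False: plain unweighted mean over the folds.
print(np.average(test_scores))  # 0.8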
4 changes: 4 additions & 0 deletions osprey/execute_worker.py
@@ -118,8 +118,12 @@ def run_single_trial(estimator, params, trial_id, scoring, X, y, cv,
     with sessionbuilder() as session:
         trial = session.query(Trial).get(trial_id)
         trial.mean_test_score = score['mean_test_score']
+        trial.mean_train_score = score['mean_train_score']
         trial.test_scores = score['test_scores']
         trial.train_scores = score['train_scores']
+        trial.n_test_samples = score['n_test_samples']
+        trial.n_train_samples = score['n_train_samples']
+
         trial.status = 'SUCCEEDED'
         best_so_far = session.query(
             func.max(Trial.mean_test_score)).first()
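For context, the score dict consumed above is the grid_scores dict returned by fit_and_score_estimator (next file). For a hypothetical two-fold run it would look roughly like this; the values are illustrative, only the keys come from the diff:

score = {
    'mean_test_score': 0.75,
    'mean_train_score': 0.88,
    'test_scores': [0.70, 0.80],
    'train_scores': [0.90, 0.86],
    'n_test_samples': [10, 11],
    'n_train_samples': [11, 10],
}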
128 changes: 107 additions & 21 deletions osprey/fit_estimator.py
@@ -1,13 +1,20 @@
 from __future__ import print_function, absolute_import, division

+import time
 from distutils.version import LooseVersion

 import numpy as np
 import sklearn
+from sklearn.pipeline import Pipeline
 from sklearn.base import is_classifier, clone
-from sklearn.cross_validation import _fit_and_score
 from sklearn.metrics.scorer import check_scoring
-from sklearn.utils.validation import _num_samples, check_arrays
+from sklearn.utils.validation import check_arrays
 from sklearn.externals.joblib import Parallel, delayed
-from sklearn.cross_validation import _check_cv as check_cv
+from sklearn.cross_validation import _check_cv as check_cv, _safe_split, _score
+
+from .utils import short_format_time
+from .eval_scopes import import_all_estimators


 if LooseVersion(sklearn.__version__) < LooseVersion('0.15.0'):
     raise ImportError('Please upgrade to the latest version of scikit-learn')
@@ -51,29 +58,108 @@ def fit_and_score_estimator(estimator, parameters, cv, X, y=None, scoring=None,
     )(
         delayed(_fit_and_score)(clone(estimator), X, y, scorer,
                                 train, test, verbose, parameters,
-                                fit_params=None, return_train_score=True,
-                                return_parameters=True)
+                                fit_params=None)
         for train, test in cv)

     assert len(out) == len(cv)

-    grid_scores = list()
-    n_test_samples = 0
-    score = 0
-    all_train_scores = []
-    all_test_scores = []
-    for train_score, test_score, this_n_test_samples, _, _ in out:
-        all_train_scores.append(train_score)
-        all_test_scores.append(test_score)
-        if iid:
-            test_score *= this_n_test_samples
-            n_test_samples += this_n_test_samples
-        score += test_score
+    train_scores, test_scores = [], []
+    n_train_samples, n_test_samples = [], []
+    for test_score, n_test, train_score, n_train, _ in out:
+        train_scores.append(train_score)
+        test_scores.append(test_score)
+        n_test_samples.append(n_test)
+        n_train_samples.append(n_train)

     if iid:
-        score /= float(n_test_samples)
+        if verbose > 0 and _is_mixtape_estimator(estimator):
+            print('[CV] Using Mixtape API n_samples averaging')
+            print('[CV] n_train_samples: %s' % str(n_train_samples))
+            print('[CV] n_test_samples: %s' % str(n_test_samples))
+        mean_test_score = np.average(test_scores, weights=n_test_samples)
+        mean_train_score = np.average(train_scores, weights=n_train_samples)
     else:
-        score /= len(cv)
+        mean_test_score = np.average(test_scores)
+        mean_train_score = np.average(train_scores)

-    grid_scores = {'mean_test_score': score, 'train_scores': all_train_scores,
-                   'test_scores': all_test_scores}
+    grid_scores = {
+        'mean_test_score': mean_test_score, 'test_scores': test_scores,
+        'mean_train_score': mean_train_score, 'train_scores': train_scores,
+        'n_test_samples': n_test_samples, 'n_train_samples': n_train_samples}
     return grid_scores


+def _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters,
+                   fit_params=None):
+    if verbose > 1:
+        if parameters is None:
+            msg = "no parameters to be set"
+        else:
+            msg = '%s' % (', '.join('%s=%s' % (k, v)
+                          for k, v in parameters.items()))
+        print("[CV] %s %s" % (msg, (64 - len(msg)) * '.'))
+
+    # adjust length of sample weights
+    n_samples = _num_samples(X)
+    fit_params = fit_params if fit_params is not None else {}
+    fit_params = dict([(k, np.asarray(v)[train]
+                        if hasattr(v, '__len__') and len(v) == n_samples else v)
+                       for k, v in fit_params.items()])
+
+    if parameters is not None:
+        estimator.set_params(**parameters)
+
+    # fit and score
+    start_time = time.time()
+
+    X_train, y_train = _safe_split(estimator, X, y, train)
+    X_test, y_test = _safe_split(estimator, X, y, test, train)
+    if y_train is None:
+        estimator.fit(X_train, **fit_params)
+    else:
+        estimator.fit(X_train, y_train, **fit_params)
+    test_score = _score(estimator, X_test, y_test, scorer)
+    train_score = _score(estimator, X_train, y_train, scorer)
+
+    scoring_time = time.time() - start_time
+
+    mixtape_api = _is_mixtape_estimator(estimator)
+    n_samples_test = _num_samples(X_test, mixtape_api=mixtape_api)
+    n_samples_train = _num_samples(X_train, mixtape_api=mixtape_api)
+    if verbose > 2:
+        msg += ", score=%f" % test_score
+    if verbose > 1:
+        end_msg = "%s -%s" % (msg, short_format_time(scoring_time))
+        print("[CV] %s %s" % ((64 - len(end_msg)) * '.', end_msg))
+
+    return (test_score, n_samples_test, train_score, n_samples_train,
+            scoring_time)
+
+
+def _num_samples(x, mixtape_api=False):
+    """Return number of samples in array-like x."""
+    if not hasattr(x, '__len__') and not hasattr(x, 'shape'):
+        if hasattr(x, '__array__'):
+            x = np.asarray(x)
+        else:
+            raise TypeError("Expected sequence or array-like, got %r" % x)
+
+    if mixtape_api:
+        assert isinstance(x, list)
+        return sum(len(xx) for xx in x)
+
+    return x.shape[0] if hasattr(x, 'shape') else len(x)
+
+
+def _is_mixtape_estimator(estimator):
+    try:
+        import mixtape
+    except ImportError:
+        return False
+
+    mixtape_estimators = import_all_estimators(mixtape).values()
+    out = estimator.__class__ in mixtape_estimators
+    if isinstance(estimator, Pipeline):
+        out = any(step.__class__ in mixtape_estimators
+                  for name, step in estimator.steps)
+    return out
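
The Mixtape-aware piece is _num_samples: Mixtape estimators consume a list of sequences (e.g. trajectories), so the number of samples is the total frame count across sequences rather than the length of the outer list. A small standalone sketch of the distinction, mirroring the mixtape_api branch above:

import numpy as np

# Two sequences of 10 and 11 frames, as a Mixtape estimator would see them.
X = [np.random.randint(2, size=10), np.random.randint(2, size=11)]

assert sum(len(xx) for xx in X) == 21  # mixtape_api=True: total frames
assert len(X) == 2                     # plain len(): number of sequences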
20 changes: 18 additions & 2 deletions osprey/tests/test_fit_estimator.py
@@ -1,6 +1,7 @@
 from __future__ import print_function, absolute_import, division

 import numpy as np
+from nose.plugins.skip import SkipTest
 from six import iteritems
 from sklearn.datasets import make_regression
 from sklearn.linear_model import Lasso
@@ -14,12 +15,27 @@ def test_1():

     lasso = Lasso()
     params = {'alpha': 2}
-    cv = 5
+    cv = 6
     out = fit_and_score_estimator(lasso, params, cv=cv, X=X, y=y, verbose=0)

     param_grid = dict((k, [v]) for k, v in iteritems(params))
     g = GridSearchCV(estimator=lasso, param_grid=param_grid, cv=cv)
     g.fit(X, y)

-    assert out['mean_test_score'] == g.grid_scores_[0].mean_validation_score
+    np.testing.assert_almost_equal(
+        out['mean_test_score'], g.grid_scores_[0].mean_validation_score)
+
+    assert np.all(out['test_scores'] == g.grid_scores_[0].cv_validation_scores)
+
+
+def test_2():
+    try:
+        from mixtape.markovstatemodel import MarkovStateModel
+    except ImportError as e:
+        raise SkipTest(e)
+
+    X = [np.random.randint(2, size=10), np.random.randint(2, size=11)]
+    out = fit_and_score_estimator(
+        MarkovStateModel(), {'verbose': False}, cv=2, X=X, y=None, verbose=0)
+    np.testing.assert_array_equal(out['n_train_samples'], [11, 10])
+    np.testing.assert_array_equal(out['n_test_samples'], [10, 11])
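
test_2 checks the fold bookkeeping directly: with cv=2 over two sequences, each fold trains on one trajectory and tests on the other, so n_train_samples comes out as [11, 10] and n_test_samples as [10, 11]. A sketch of the underlying split, assuming the same sklearn.cross_validation API the diff imports:

from sklearn.cross_validation import KFold

# KFold splits the outer list of sequences, not the frames inside them.
for train, test in KFold(2, n_folds=2):
    print(train, test)  # [1] [0] on the first fold, then [0] [1]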
4 changes: 2 additions & 2 deletions osprey/tests/test_trials.py
@@ -20,10 +20,10 @@ def test_1():
         con = sqlite3.connect('db')
         table_names = con.execute("SELECT name FROM sqlite_master "
                                   "WHERE type='table'").fetchone()
-        assert table_names == (u'trials_v2',)
+        assert table_names == (u'trials_v3',)

         table_names = con.execute(
-            "SELECT project_name FROM trials_v2").fetchone()
+            "SELECT project_name FROM trials_v3").fetchone()
         assert table_names == (u'abc123',)

     finally:
5 changes: 4 additions & 1 deletion osprey/trials.py
@@ -33,7 +33,7 @@ def process_result_value(self, value, dialect):


 class Trial(Base):
-    __tablename__ = 'trials_v2'
+    __tablename__ = 'trials_v3'
     default_project_name = None

     id = Column(Integer, primary_key=True)
@@ -42,8 +42,11 @@ class Trial(Base):
     parameters = Column(JSONEncoded())

     mean_test_score = Column(Float)
+    mean_train_score = Column(Float)
     train_scores = Column(JSONEncoded())
     test_scores = Column(JSONEncoded())
+    n_train_samples = Column(JSONEncoded())
+    n_test_samples = Column(JSONEncoded())

     started = Column(DateTime())
     completed = Column(DateTime())
19 changes: 19 additions & 0 deletions osprey/utils.py
@@ -96,6 +96,25 @@ def current_pretty_time():
     return datetime.now().strftime("%B %d, %Y %l:%M %p")


+def _squeeze_time(t):
+    """Remove .1s to the time under Windows: this is the time it take to
+    stat files. This is needed to make results similar to timings under
+    Unix, for tests
+    """
+    if sys.platform.startswith('win'):
+        return max(0, t - .1)
+    else:
+        return t
+
+
+def short_format_time(t):
+    t = _squeeze_time(t)
+    if t > 60:
+        return "%4.1fmin" % (t / 60.)
+    else:
+        return " %5.1fs" % (t)
+
+
 def mock_module(name):

     class MockModule(object):
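The two helpers above appear to be ported from joblib's logger module; short_format_time renders elapsed seconds compactly for the [CV] log lines in fit_estimator.py. A quick sanity check of the formatting, assuming a non-Windows platform (where _squeeze_time is a no-op):

from osprey.utils import short_format_time

print(short_format_time(2.5))    # '   2.5s'
print(short_format_time(150.0))  # ' 2.5min'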
