Skip to content

Commit

Permalink
test user coder path
Browse files Browse the repository at this point in the history
  • Loading branch information
JohnMount committed Mar 7, 2020
1 parent c3298bd commit 2b781bf
Show file tree
Hide file tree
Showing 5 changed files with 391 additions and 342 deletions.
558 changes: 233 additions & 325 deletions Examples/UserCoders/UserCoders.ipynb

Large diffs are not rendered by default.

35 changes: 18 additions & 17 deletions coverage.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,32 +2,33 @@
platform darwin -- Python 3.7.5, pytest-5.2.4, py-1.8.0, pluggy-0.13.0
rootdir: /Users/johnmount/Documents/work/pyvtreat/pkg
plugins: cov-2.8.1
collected 15 items
collected 16 items

pkg/tests/test_classification.py .. [ 13%]
pkg/tests/test_col_name_issues.py ... [ 33%]
pkg/tests/test_imputation_controls.py . [ 40%]
pkg/tests/test_multinomial.py . [ 46%]
pkg/tests/test_nan_inf.py . [ 53%]
pkg/tests/test_outcome_name_required.py . [ 60%]
pkg/tests/test_perm_cor.py . [ 66%]
pkg/tests/test_r1_issue.py . [ 73%]
pkg/tests/test_range.py . [ 80%]
pkg/tests/test_regression.py . [ 86%]
pkg/tests/test_unsupervised.py . [ 93%]
pkg/tests/test_classification.py .. [ 12%]
pkg/tests/test_col_name_issues.py ... [ 31%]
pkg/tests/test_imputation_controls.py . [ 37%]
pkg/tests/test_multinomial.py . [ 43%]
pkg/tests/test_nan_inf.py . [ 50%]
pkg/tests/test_outcome_name_required.py . [ 56%]
pkg/tests/test_perm_cor.py . [ 62%]
pkg/tests/test_r1_issue.py . [ 68%]
pkg/tests/test_range.py . [ 75%]
pkg/tests/test_regression.py . [ 81%]
pkg/tests/test_unsupervised.py . [ 87%]
pkg/tests/test_user_coders.py . [ 93%]
pkg/tests/test_util.py . [100%]

---------- coverage: platform darwin, python 3.7.5-final-0 -----------
Name Stmts Miss Cover
-----------------------------------------------
pkg/vtreat/__init__.py 6 0 100%
pkg/vtreat/cross_plan.py 104 57 45%
pkg/vtreat/transform.py 17 10 41%
pkg/vtreat/transform.py 17 4 76%
pkg/vtreat/util.py 161 25 84%
pkg/vtreat/vtreat_api.py 218 43 80%
pkg/vtreat/vtreat_impl.py 575 107 81%
pkg/vtreat/vtreat_api.py 218 42 81%
pkg/vtreat/vtreat_impl.py 575 79 86%
-----------------------------------------------
TOTAL 1081 242 78%
TOTAL 1081 207 81%


============================== 15 passed in 9.39s ==============================
============================= 16 passed in 10.34s ==============================
Binary file modified pkg/dist/vtreat-0.3.8-py3-none-any.whl
Binary file not shown.
Binary file modified pkg/dist/vtreat-0.3.8.tar.gz
Binary file not shown.
140 changes: 140 additions & 0 deletions pkg/tests/test_user_coders.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@

# From:
# https://github.com/WinVector/pyvtreat/blob/master/Examples/UserCoders/UserCoders.ipynb


import pandas
import numpy
import numpy.random
# import seaborn

import vtreat
import vtreat.util
import vtreat.transform

# Module-level flag: True when scikit-learn imported successfully, False
# otherwise.  Code that needs scikit-learn can consult it before use.
have_sklearn = True
try:
    import sklearn.linear_model
    import sklearn
except Exception:
    # Kept deliberately broad (best effort): any failure while importing
    # scikit-learn is treated as "not available".
    # The original assigned to a misspelled name (`have_sklean`), which left
    # this flag stuck at True even when the import failed.
    have_sklearn = False


def test_user_coders():
    """Check that a user-supplied coder can be fit alone and inside vtreat.

    A ridge-regularized polynomial coder (``PolyTransform``) is fit to a
    noisy sine curve, first directly and then as a ``user_transforms`` entry
    of ``vtreat.NumericOutcomeTreatment``.  The test asserts that the derived
    ``x_poly`` column is produced with one row per input row on every path.

    The test returns early (does nothing) when scikit-learn cannot be
    imported, since the coder is built on ``sklearn.linear_model.Ridge``.
    """
    # Function-scope import: the stdlib module replaces the original
    # `sklearn.warnings`, which was only an accidental re-export and is not
    # available in newer scikit-learn releases.
    import warnings

    try:
        # Local import so the nested class below resolves `sklearn` even if
        # the module-level import failed.
        import sklearn.linear_model
    except Exception:
        # scikit-learn is optional; without it there is nothing to test.
        return

    # avoid depending on sklearn.metrics.r2_score
    def r_squared(*, y_true, y_pred):
        """Return 1 - SSE/SST for the given observations and predictions."""
        y_true = numpy.asarray(y_true)
        y_pred = numpy.asarray(y_pred)
        residual_ss = numpy.sum((y_true - y_pred) ** 2)
        total_ss = numpy.sum((y_true - numpy.mean(y_true)) ** 2)
        return 1 - residual_ss / total_ss

    class PolyTransform(vtreat.transform.UserTransform):
        """A polynomial model: one ridge regression per numeric column.

        ``deg`` is the highest power used as a feature; ``alpha`` is passed
        to ``sklearn.linear_model.Ridge``.
        """

        def __init__(self, *, deg=5, alpha=0.1):
            vtreat.transform.UserTransform.__init__(self, treatment='poly')
            self.models_ = None  # filled by fit(): column -> (model, feature names, derived name)
            self.deg = deg
            self.alpha = alpha

        def poly_terms(self, vname, vec):
            """Return a frame holding ``vec`` and its powers 1..``self.deg``."""
            # Work in floating point: integer input silently wraps around at
            # high powers (99 ** 10 does not fit in int64).
            vec = numpy.asarray(vec, dtype=float)
            # The leading 'x' column repeats the degree-1 term; it is kept so
            # the feature layout matches the original implementation.
            terms = pandas.DataFrame({'x': vec})
            for power in range(1, self.deg + 1):
                terms[vname + '_' + str(power)] = vec ** power
            return terms

        def fit(self, X, y):
            """Fit one ridge model per numeric-convertible column of ``X``."""
            self.models_ = {}
            self.incoming_vars_ = []
            self.derived_vars_ = []
            for v in X.columns:
                if vtreat.util.can_convert_v_to_numeric(X[v]):
                    X_v = self.poly_terms(v, X[v])
                    model_v = sklearn.linear_model.Ridge(alpha=self.alpha).fit(X_v, y)
                    new_var = v + "_poly"
                    self.models_[v] = (model_v, list(X_v.columns), new_var)
                    self.incoming_vars_.append(v)
                    self.derived_vars_.append(new_var)
            return self

        def transform(self, X):
            """Return a frame with one ``<column>_poly`` prediction per model."""
            result = pandas.DataFrame()
            for k, (model_k, _, new_var) in self.models_.items():
                result[new_var] = model_k.predict(self.poly_terms(k, X[k]))
            return result

    # A private, seeded generator makes the noise reproducible without
    # touching numpy's global random state.
    rng = numpy.random.RandomState(2020)
    d = pandas.DataFrame({'x': list(range(100))})
    d['y'] = numpy.sin(0.2 * d['x']) + 0.2 * rng.normal(size=d.shape[0])

    # Silence warnings only for the duration of the test rather than
    # installing a process-wide 'ignore' filter.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        # The coder on its own.
        direct = PolyTransform(deg=10).fit_transform(d[['x']], d['y'])
        assert list(direct.columns) == ['x_poly']
        assert direct.shape[0] == d.shape[0]
        # NaN predictions would make this comparison fail.
        assert r_squared(y_true=d['y'], y_pred=direct['x_poly']) <= 1.0

        # The coder plugged into vtreat as a user transform.
        treatment = vtreat.NumericOutcomeTreatment(
            outcome_name='y',
            params=vtreat.vtreat_parameters({
                'filter_to_recommended': False,
                'user_transforms': [PolyTransform(deg=10)]
            }))

        # fit() followed by transform() on the same data (the "overfit" path).
        treatment.fit(d, d['y'])
        assert treatment.score_frame_ is not None
        x2_overfit = treatment.transform(d)
        assert 'x_poly' in x2_overfit.columns
        assert x2_overfit.shape[0] == d.shape[0]

        # fit_transform() on the training data.
        x2 = treatment.fit_transform(d, d['y'])
        assert treatment.score_frame_ is not None
        assert 'x_poly' in x2.columns
        assert x2.shape[0] == d.shape[0]

0 comments on commit 2b781bf

Please sign in to comment.