Skip to content

Commit

Permalink
warn if same data is used in transform as in fit
Browse files Browse the repository at this point in the history
  • Loading branch information
JohnMount committed Jan 9, 2020
1 parent bab9bb2 commit 01811ed
Show file tree
Hide file tree
Showing 2 changed files with 483 additions and 0 deletions.
318 changes: 318 additions & 0 deletions Examples/Pipeline/Pipeline_Example.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,318 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"From [pyvtreat issue 12](https://github.com/WinVector/pyvtreat/issues/12)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"pycharm": {
"is_executing": false
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"model score: 0.880\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import numpy.random\n",
"import vtreat\n",
"import vtreat.util\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.pipeline import Pipeline\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"numpy.random.seed(2019)\n",
"\n",
"def make_data(nrows):\n",
" d = pd.DataFrame({'x': 5*numpy.random.normal(size=nrows)})\n",
" d['y'] = numpy.sin(d['x']) + 0.1*numpy.random.normal(size=nrows)\n",
" d.loc[numpy.arange(3, 10), 'x'] = numpy.nan # introduce a nan level\n",
" d['xc'] = ['level_' + str(5*numpy.round(yi/5, 1)) for yi in d['y']]\n",
" d['x2'] = np.random.normal(size=nrows)\n",
" d.loc[d['xc']=='level_-1.0', 'xc'] = numpy.nan # introduce a nan level\n",
" d['yc'] = d['y']>0.5\n",
" return d\n",
"\n",
"df = make_data(500)\n",
"\n",
"df = df.drop(columns=['y'])\n",
"\n",
"transform = vtreat.BinomialOutcomeTreatment(outcome_target=True)\n",
"\n",
"clf = Pipeline(steps=[\n",
" ('preprocessor', transform),\n",
" ('classifier', LogisticRegression(solver = 'lbfgs'))]\n",
")\n",
"\n",
"X, y = df, df.pop('yc')\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n",
"\n",
"clf.fit(X_train, y_train)\n",
"\n",
"print(\"model score: %.3f\" % clf.score(X_test, y_test))"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
},
"pycharm": {
"is_executing": false,
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/johnmount/opt/anaconda3/envs/ai_academy_3_7/lib/python3.7/site-packages/vtreat/vtreat_api.py:369: UserWarning: called transform on same data used to fit (this causes over-fit, please use fit_transform() instead)\n",
" \"called transform on same data used to fit (this causes over-fit, please use fit_transform() instead)\")\n"
]
},
{
"data": {
"text/plain": [
"0.93"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"clf.score(X_train, y_train)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The above fit is an over-fit (not achievable without data leakage). Notice vtreat gave as a warning."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
},
"pycharm": {
"is_executing": false,
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Pipeline(memory=None,\n",
" steps=[('preprocessor',\n",
" vtreat.vtreat_api.BinomialOutcomeTreatment(outcome_target=True,\n",
"params={'coders': {'clean_copy',\n",
" 'deviation_code',\n",
" 'impact_code',\n",
" 'indicator_code',\n",
" 'logit_code',\n",
" 'missing_indicator',\n",
" 'prevalence_code'},\n",
" 'cross_validation_k': 5,\n",
" 'cross_validation_plan': <vtreat.cross_plan.KWayCrossPlanYStratified object at 0x10fa81b50>,\n",
" '...\n",
" 'missingness_imputation': <function mean at 0x11093bb90>,\n",
" 'sparse_indicators': True,\n",
" 'use_hierarchical_estimate': True,\n",
" 'user_transforms': []},\n",
")),\n",
" ('classifier',\n",
" LogisticRegression(C=1.0, class_weight=None, dual=False,\n",
" fit_intercept=True, intercept_scaling=1,\n",
" l1_ratio=None, max_iter=100,\n",
" multi_class='warn', n_jobs=None,\n",
" penalty='l2', random_state=None,\n",
" solver='lbfgs', tol=0.0001, verbose=0,\n",
" warm_start=False))],\n",
" verbose=False)\n"
]
}
],
"source": [
"print(clf)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
},
"pycharm": {
"is_executing": false,
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['x_is_bad', 'xc_is_bad', 'x', 'x2', 'xc_logit_code', 'xc_prevalence_code', 'xc_lev_level_1_0', 'xc_lev__NA_', 'xc_lev_level_-0_5', 'xc_lev_level_0_5']\n"
]
}
],
"source": [
"print(transform.get_feature_names())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
},
"pycharm": {
"is_executing": false,
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'use_hierarchical_estimate': True, 'coders': {'prevalence_code', 'logit_code', 'indicator_code', 'deviation_code', 'impact_code', 'missing_indicator', 'clean_copy'}, 'filter_to_recommended': True, 'indicator_min_fraction': 0.1, 'cross_validation_plan': <vtreat.cross_plan.KWayCrossPlanYStratified object at 0x10fa81b50>, 'cross_validation_k': 5, 'user_transforms': [], 'sparse_indicators': True, 'missingness_imputation': <function mean at 0x11093bb90>, 'outcome_target': True}\n"
]
}
],
"source": [
"print(transform.get_params())\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false,
"jupyter": {
"outputs_hidden": false
},
"pycharm": {
"is_executing": false,
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'memory': None, 'steps': [('preprocessor', vtreat.vtreat_api.BinomialOutcomeTreatment(outcome_target=True,\n",
"params={'coders': {'clean_copy',\n",
" 'deviation_code',\n",
" 'impact_code',\n",
" 'indicator_code',\n",
" 'logit_code',\n",
" 'missing_indicator',\n",
" 'prevalence_code'},\n",
" 'cross_validation_k': 5,\n",
" 'cross_validation_plan': <vtreat.cross_plan.KWayCrossPlanYStratified object at 0x10fa81b50>,\n",
" 'filter_to_recommended': True,\n",
" 'indicator_min_fraction': 0.1,\n",
" 'missingness_imputation': <function mean at 0x11093bb90>,\n",
" 'sparse_indicators': True,\n",
" 'use_hierarchical_estimate': True,\n",
" 'user_transforms': []},\n",
")), ('classifier', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
" intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
" multi_class='warn', n_jobs=None, penalty='l2',\n",
" random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n",
" warm_start=False))], 'verbose': False, 'preprocessor': vtreat.vtreat_api.BinomialOutcomeTreatment(outcome_target=True,\n",
"params={'coders': {'clean_copy',\n",
" 'deviation_code',\n",
" 'impact_code',\n",
" 'indicator_code',\n",
" 'logit_code',\n",
" 'missing_indicator',\n",
" 'prevalence_code'},\n",
" 'cross_validation_k': 5,\n",
" 'cross_validation_plan': <vtreat.cross_plan.KWayCrossPlanYStratified object at 0x10fa81b50>,\n",
" 'filter_to_recommended': True,\n",
" 'indicator_min_fraction': 0.1,\n",
" 'missingness_imputation': <function mean at 0x11093bb90>,\n",
" 'sparse_indicators': True,\n",
" 'use_hierarchical_estimate': True,\n",
" 'user_transforms': []},\n",
"), 'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
" intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
" multi_class='warn', n_jobs=None, penalty='l2',\n",
" random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n",
" warm_start=False), 'preprocessor__use_hierarchical_estimate': True, 'preprocessor__coders': {'prevalence_code', 'logit_code', 'indicator_code', 'deviation_code', 'impact_code', 'missing_indicator', 'clean_copy'}, 'preprocessor__filter_to_recommended': True, 'preprocessor__indicator_min_fraction': 0.1, 'preprocessor__cross_validation_plan': <vtreat.cross_plan.KWayCrossPlanYStratified object at 0x10fa81b50>, 'preprocessor__cross_validation_k': 5, 'preprocessor__user_transforms': [], 'preprocessor__sparse_indicators': True, 'preprocessor__missingness_imputation': <function mean at 0x11093bb90>, 'preprocessor__outcome_target': True, 'classifier__C': 1.0, 'classifier__class_weight': None, 'classifier__dual': False, 'classifier__fit_intercept': True, 'classifier__intercept_scaling': 1, 'classifier__l1_ratio': None, 'classifier__max_iter': 100, 'classifier__multi_class': 'warn', 'classifier__n_jobs': None, 'classifier__penalty': 'l2', 'classifier__random_state': None, 'classifier__solver': 'lbfgs', 'classifier__tol': 0.0001, 'classifier__verbose': 0, 'classifier__warm_start': False}\n"
]
}
],
"source": [
"print(clf.get_params())\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.5"
},
"pycharm": {
"stem_cell": {
"cell_type": "raw",
"metadata": {
"collapsed": false
},
"source": []
}
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Loading

0 comments on commit 01811ed

Please sign in to comment.