warn if same data is used in transform as in fit

WinVector · Jan 9, 2020 · 01811ed · 01811ed
1 parent bab9bb2
commit 01811ed
Show file tree

Hide file tree

Showing 2 changed files with 483 additions and 0 deletions.
diff --git a/Examples/Pipeline/Pipeline_Example.ipynb b/Examples/Pipeline/Pipeline_Example.ipynb
@@ -0,0 +1,318 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "From [pyvtreat issue 12](https://github.com/WinVector/pyvtreat/issues/12)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "pycharm": {
+     "is_executing": false
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "model score: 0.880\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import numpy.random\n",
+    "import vtreat\n",
+    "import vtreat.util\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from sklearn.pipeline import Pipeline\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "numpy.random.seed(2019)\n",
+    "\n",
+    "def make_data(nrows):\n",
+    "    d = pd.DataFrame({'x': 5*numpy.random.normal(size=nrows)})\n",
+    "    d['y'] = numpy.sin(d['x']) + 0.1*numpy.random.normal(size=nrows)\n",
+    "    d.loc[numpy.arange(3, 10), 'x'] = numpy.nan                           # introduce a nan level\n",
+    "    d['xc'] = ['level_' + str(5*numpy.round(yi/5, 1)) for yi in d['y']]\n",
+    "    d['x2'] = np.random.normal(size=nrows)\n",
+    "    d.loc[d['xc']=='level_-1.0', 'xc'] = numpy.nan  # introduce a nan level\n",
+    "    d['yc'] = d['y']>0.5\n",
+    "    return d\n",
+    "\n",
+    "df = make_data(500)\n",
+    "\n",
+    "df = df.drop(columns=['y'])\n",
+    "\n",
+    "transform = vtreat.BinomialOutcomeTreatment(outcome_target=True)\n",
+    "\n",
+    "clf = Pipeline(steps=[\n",
+    "    ('preprocessor', transform),\n",
+    "    ('classifier', LogisticRegression(solver = 'lbfgs'))]\n",
+    ")\n",
+    "\n",
+    "X, y = df, df.pop('yc')\n",
+    "\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)\n",
+    "\n",
+    "clf.fit(X_train, y_train)\n",
+    "\n",
+    "print(\"model score: %.3f\" % clf.score(X_test, y_test))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    },
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/johnmount/opt/anaconda3/envs/ai_academy_3_7/lib/python3.7/site-packages/vtreat/vtreat_api.py:369: UserWarning: called transform on same data used to fit (this causes over-fit, please use fit_transform() instead)\n",
+      "  \"called transform on same data used to fit (this causes over-fit, please use fit_transform() instead)\")\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "0.93"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "clf.score(X_train, y_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
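+    "The above score is an over-fit (not achievable without data leakage): scoring the pipeline on `X_train` calls `transform()` on the same data that was used to fit the treatment. Notice that vtreat gave us a warning about this."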
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    },
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pipeline(memory=None,\n",
+      "         steps=[('preprocessor',\n",
+      "                 vtreat.vtreat_api.BinomialOutcomeTreatment(outcome_target=True,\n",
+      "params={'coders': {'clean_copy',\n",
+      "            'deviation_code',\n",
+      "            'impact_code',\n",
+      "            'indicator_code',\n",
+      "            'logit_code',\n",
+      "            'missing_indicator',\n",
+      "            'prevalence_code'},\n",
+      " 'cross_validation_k': 5,\n",
+      " 'cross_validation_plan': <vtreat.cross_plan.KWayCrossPlanYStratified object at 0x10fa81b50>,\n",
+      " '...\n",
+      " 'missingness_imputation': <function mean at 0x11093bb90>,\n",
+      " 'sparse_indicators': True,\n",
+      " 'use_hierarchical_estimate': True,\n",
+      " 'user_transforms': []},\n",
+      ")),\n",
+      "                ('classifier',\n",
+      "                 LogisticRegression(C=1.0, class_weight=None, dual=False,\n",
+      "                                    fit_intercept=True, intercept_scaling=1,\n",
+      "                                    l1_ratio=None, max_iter=100,\n",
+      "                                    multi_class='warn', n_jobs=None,\n",
+      "                                    penalty='l2', random_state=None,\n",
+      "                                    solver='lbfgs', tol=0.0001, verbose=0,\n",
+      "                                    warm_start=False))],\n",
+      "         verbose=False)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(clf)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    },
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['x_is_bad', 'xc_is_bad', 'x', 'x2', 'xc_logit_code', 'xc_prevalence_code', 'xc_lev_level_1_0', 'xc_lev__NA_', 'xc_lev_level_-0_5', 'xc_lev_level_0_5']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(transform.get_feature_names())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    },
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'use_hierarchical_estimate': True, 'coders': {'prevalence_code', 'logit_code', 'indicator_code', 'deviation_code', 'impact_code', 'missing_indicator', 'clean_copy'}, 'filter_to_recommended': True, 'indicator_min_fraction': 0.1, 'cross_validation_plan': <vtreat.cross_plan.KWayCrossPlanYStratified object at 0x10fa81b50>, 'cross_validation_k': 5, 'user_transforms': [], 'sparse_indicators': True, 'missingness_imputation': <function mean at 0x11093bb90>, 'outcome_target': True}\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(transform.get_params())\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "collapsed": false,
+    "jupyter": {
+     "outputs_hidden": false
+    },
+    "pycharm": {
+     "is_executing": false,
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'memory': None, 'steps': [('preprocessor', vtreat.vtreat_api.BinomialOutcomeTreatment(outcome_target=True,\n",
+      "params={'coders': {'clean_copy',\n",
+      "            'deviation_code',\n",
+      "            'impact_code',\n",
+      "            'indicator_code',\n",
+      "            'logit_code',\n",
+      "            'missing_indicator',\n",
+      "            'prevalence_code'},\n",
+      " 'cross_validation_k': 5,\n",
+      " 'cross_validation_plan': <vtreat.cross_plan.KWayCrossPlanYStratified object at 0x10fa81b50>,\n",
+      " 'filter_to_recommended': True,\n",
+      " 'indicator_min_fraction': 0.1,\n",
+      " 'missingness_imputation': <function mean at 0x11093bb90>,\n",
+      " 'sparse_indicators': True,\n",
+      " 'use_hierarchical_estimate': True,\n",
+      " 'user_transforms': []},\n",
+      ")), ('classifier', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
+      "                   intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
+      "                   multi_class='warn', n_jobs=None, penalty='l2',\n",
+      "                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n",
+      "                   warm_start=False))], 'verbose': False, 'preprocessor': vtreat.vtreat_api.BinomialOutcomeTreatment(outcome_target=True,\n",
+      "params={'coders': {'clean_copy',\n",
+      "            'deviation_code',\n",
+      "            'impact_code',\n",
+      "            'indicator_code',\n",
+      "            'logit_code',\n",
+      "            'missing_indicator',\n",
+      "            'prevalence_code'},\n",
+      " 'cross_validation_k': 5,\n",
+      " 'cross_validation_plan': <vtreat.cross_plan.KWayCrossPlanYStratified object at 0x10fa81b50>,\n",
+      " 'filter_to_recommended': True,\n",
+      " 'indicator_min_fraction': 0.1,\n",
+      " 'missingness_imputation': <function mean at 0x11093bb90>,\n",
+      " 'sparse_indicators': True,\n",
+      " 'use_hierarchical_estimate': True,\n",
+      " 'user_transforms': []},\n",
+      "), 'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
+      "                   intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
+      "                   multi_class='warn', n_jobs=None, penalty='l2',\n",
+      "                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n",
+      "                   warm_start=False), 'preprocessor__use_hierarchical_estimate': True, 'preprocessor__coders': {'prevalence_code', 'logit_code', 'indicator_code', 'deviation_code', 'impact_code', 'missing_indicator', 'clean_copy'}, 'preprocessor__filter_to_recommended': True, 'preprocessor__indicator_min_fraction': 0.1, 'preprocessor__cross_validation_plan': <vtreat.cross_plan.KWayCrossPlanYStratified object at 0x10fa81b50>, 'preprocessor__cross_validation_k': 5, 'preprocessor__user_transforms': [], 'preprocessor__sparse_indicators': True, 'preprocessor__missingness_imputation': <function mean at 0x11093bb90>, 'preprocessor__outcome_target': True, 'classifier__C': 1.0, 'classifier__class_weight': None, 'classifier__dual': False, 'classifier__fit_intercept': True, 'classifier__intercept_scaling': 1, 'classifier__l1_ratio': None, 'classifier__max_iter': 100, 'classifier__multi_class': 'warn', 'classifier__n_jobs': None, 'classifier__penalty': 'l2', 'classifier__random_state': None, 'classifier__solver': 'lbfgs', 'classifier__tol': 0.0001, 'classifier__verbose': 0, 'classifier__warm_start': False}\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(clf.get_params())\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.5"
+  },
+  "pycharm": {
+   "stem_cell": {
+    "cell_type": "raw",
+    "metadata": {
+     "collapsed": false
+    },
+    "source": []
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}