Skip to content

Commit

Permalink
move vtreat parameters out of pipeline parameters
Browse files Browse the repository at this point in the history
  • Loading branch information
JohnMount committed Jan 16, 2020
1 parent b5903de commit e5d8943
Show file tree
Hide file tree
Showing 19 changed files with 108 additions and 310 deletions.
61 changes: 7 additions & 54 deletions Examples/Pipeline/Pipeline_Example.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/johnmount/opt/anaconda3/envs/ai_academy_3_7/lib/python3.7/site-packages/vtreat/vtreat_api.py:369: UserWarning: called transform on same data used to fit (this causes over-fit, please use fit_transform() instead)\n",
" \"called transform on same data used to fit (this causes over-fit, please use fit_transform() instead)\")\n"
"/Users/johnmount/opt/anaconda3/envs/ai_academy_3_7/lib/python3.7/site-packages/vtreat/vtreat_api.py:348: UserWarning: possibly called transform on same data used to fit (this causes over-fit, please use fit_transform() instead)\n",
" \"possibly called transform on same data used to fit (this causes over-fit, please use fit_transform() instead)\")\n"
]
},
{
Expand Down Expand Up @@ -134,22 +134,7 @@
"text": [
"Pipeline(memory=None,\n",
" steps=[('preprocessor',\n",
" vtreat.vtreat_api.BinomialOutcomeTreatment(outcome_target=True,\n",
"params={'coders': {'clean_copy',\n",
" 'deviation_code',\n",
" 'impact_code',\n",
" 'indicator_code',\n",
" 'logit_code',\n",
" 'missing_indicator',\n",
" 'prevalence_code'},\n",
" 'cross_validation_k': 5,\n",
" 'cross_validation_plan': <vtreat.cross_plan.KWayCrossPlanYStratified object at 0x10fa81b50>,\n",
" '...\n",
" 'missingness_imputation': <function mean at 0x11093bb90>,\n",
" 'sparse_indicators': True,\n",
" 'use_hierarchical_estimate': True,\n",
" 'user_transforms': []},\n",
")),\n",
" vtreat.vtreat_api.BinomialOutcomeTreatment(outcome_target=True, )),\n",
" ('classifier',\n",
" LogisticRegression(C=1.0, class_weight=None, dual=False,\n",
" fit_intercept=True, intercept_scaling=1,\n",
Expand Down Expand Up @@ -210,7 +195,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'use_hierarchical_estimate': True, 'coders': {'prevalence_code', 'logit_code', 'indicator_code', 'deviation_code', 'impact_code', 'missing_indicator', 'clean_copy'}, 'filter_to_recommended': True, 'indicator_min_fraction': 0.1, 'cross_validation_plan': <vtreat.cross_plan.KWayCrossPlanYStratified object at 0x10fa81b50>, 'cross_validation_k': 5, 'user_transforms': [], 'sparse_indicators': True, 'missingness_imputation': <function mean at 0x11093bb90>, 'outcome_target': True}\n"
"{}\n"
]
}
],
Expand All @@ -236,47 +221,15 @@
"name": "stdout",
"output_type": "stream",
"text": [
"{'memory': None, 'steps': [('preprocessor', vtreat.vtreat_api.BinomialOutcomeTreatment(outcome_target=True,\n",
"params={'coders': {'clean_copy',\n",
" 'deviation_code',\n",
" 'impact_code',\n",
" 'indicator_code',\n",
" 'logit_code',\n",
" 'missing_indicator',\n",
" 'prevalence_code'},\n",
" 'cross_validation_k': 5,\n",
" 'cross_validation_plan': <vtreat.cross_plan.KWayCrossPlanYStratified object at 0x10fa81b50>,\n",
" 'filter_to_recommended': True,\n",
" 'indicator_min_fraction': 0.1,\n",
" 'missingness_imputation': <function mean at 0x11093bb90>,\n",
" 'sparse_indicators': True,\n",
" 'use_hierarchical_estimate': True,\n",
" 'user_transforms': []},\n",
")), ('classifier', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
"{'memory': None, 'steps': [('preprocessor', vtreat.vtreat_api.BinomialOutcomeTreatment(outcome_target=True, )), ('classifier', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
" intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
" multi_class='warn', n_jobs=None, penalty='l2',\n",
" random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n",
" warm_start=False))], 'verbose': False, 'preprocessor': vtreat.vtreat_api.BinomialOutcomeTreatment(outcome_target=True,\n",
"params={'coders': {'clean_copy',\n",
" 'deviation_code',\n",
" 'impact_code',\n",
" 'indicator_code',\n",
" 'logit_code',\n",
" 'missing_indicator',\n",
" 'prevalence_code'},\n",
" 'cross_validation_k': 5,\n",
" 'cross_validation_plan': <vtreat.cross_plan.KWayCrossPlanYStratified object at 0x10fa81b50>,\n",
" 'filter_to_recommended': True,\n",
" 'indicator_min_fraction': 0.1,\n",
" 'missingness_imputation': <function mean at 0x11093bb90>,\n",
" 'sparse_indicators': True,\n",
" 'use_hierarchical_estimate': True,\n",
" 'user_transforms': []},\n",
"), 'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
" warm_start=False))], 'verbose': False, 'preprocessor': vtreat.vtreat_api.BinomialOutcomeTreatment(outcome_target=True, ), 'classifier': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
" intercept_scaling=1, l1_ratio=None, max_iter=100,\n",
" multi_class='warn', n_jobs=None, penalty='l2',\n",
" random_state=None, solver='lbfgs', tol=0.0001, verbose=0,\n",
" warm_start=False), 'preprocessor__use_hierarchical_estimate': True, 'preprocessor__coders': {'prevalence_code', 'logit_code', 'indicator_code', 'deviation_code', 'impact_code', 'missing_indicator', 'clean_copy'}, 'preprocessor__filter_to_recommended': True, 'preprocessor__indicator_min_fraction': 0.1, 'preprocessor__cross_validation_plan': <vtreat.cross_plan.KWayCrossPlanYStratified object at 0x10fa81b50>, 'preprocessor__cross_validation_k': 5, 'preprocessor__user_transforms': [], 'preprocessor__sparse_indicators': True, 'preprocessor__missingness_imputation': <function mean at 0x11093bb90>, 'preprocessor__outcome_target': True, 'classifier__C': 1.0, 'classifier__class_weight': None, 'classifier__dual': False, 'classifier__fit_intercept': True, 'classifier__intercept_scaling': 1, 'classifier__l1_ratio': None, 'classifier__max_iter': 100, 'classifier__multi_class': 'warn', 'classifier__n_jobs': None, 'classifier__penalty': 'l2', 'classifier__random_state': None, 'classifier__solver': 'lbfgs', 'classifier__tol': 0.0001, 'classifier__verbose': 0, 'classifier__warm_start': False}\n"
" warm_start=False), 'classifier__C': 1.0, 'classifier__class_weight': None, 'classifier__dual': False, 'classifier__fit_intercept': True, 'classifier__intercept_scaling': 1, 'classifier__l1_ratio': None, 'classifier__max_iter': 100, 'classifier__multi_class': 'warn', 'classifier__n_jobs': None, 'classifier__penalty': 'l2', 'classifier__random_state': None, 'classifier__solver': 'lbfgs', 'classifier__tol': 0.0001, 'classifier__verbose': 0, 'classifier__warm_start': False}\n"
]
}
],
Expand Down
165 changes: 0 additions & 165 deletions Examples/Pipeline/Pipeline_Example.md

This file was deleted.

2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ in a statistically sound manner.
Install `vtreat` with either of:

* `pip install vtreat`
* `pip install https://github.com/WinVector/pyvtreat/raw/master/pkg/dist/vtreat-0.3.5.tar.gz`
* `pip install https://github.com/WinVector/pyvtreat/raw/master/pkg/dist/vtreat-0.3.6.tar.gz`

# Details

Expand Down
10 changes: 5 additions & 5 deletions coverage.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@ pkg/tests/test_util.py . [100%]
Name Stmts Miss Cover
-----------------------------------------------
pkg/vtreat/__init__.py 6 0 100%
pkg/vtreat/cross_plan.py 94 52 45%
pkg/vtreat/transform.py 13 8 38%
pkg/vtreat/cross_plan.py 104 57 45%
pkg/vtreat/transform.py 17 10 41%
pkg/vtreat/util.py 161 26 84%
pkg/vtreat/vtreat_api.py 327 133 59%
pkg/vtreat/vtreat_api.py 295 101 66%
pkg/vtreat/vtreat_impl.py 481 79 84%
-----------------------------------------------
TOTAL 1082 298 72%
TOTAL 1064 273 74%


============================== 13 passed in 7.16s ==============================
============================== 13 passed in 6.62s ==============================
2 changes: 1 addition & 1 deletion pkg/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ in a statistically sound manner.
Install `vtreat` with either of:

* `pip install vtreat`
* `pip install https://github.com/WinVector/pyvtreat/raw/master/pkg/dist/vtreat-0.3.4.tar.gz`
* `pip install https://github.com/WinVector/pyvtreat/raw/master/pkg/dist/vtreat-0.3.6.tar.gz`

# Details

Expand Down
2 changes: 1 addition & 1 deletion pkg/build/lib/vtreat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from vtreat.vtreat_api import *

__docformat__ = "restructuredtext"
__version__ = "0.3.5"
__version__ = "0.3.6"

__doc__ = """
This<https://github.com/WinVector/pyvtreat> is the Python version of the vtreat data preparation system
Expand Down
17 changes: 16 additions & 1 deletion pkg/build/lib/vtreat/cross_plan.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,17 @@ class CrossValidationPlan:
"""Data splitting plan"""

def __init__(self):
self.verbose_ = False
pass

def split_plan(self, *, n_rows=None, k_folds=None, data=None, y=None):
raise NotImplementedError("base class called")

def __repr__(self):
return "vtreat.cross_plan.CrossValidationPlan()"

def __str__(self):
return self.__repr__()


def k_way_cross_plan(n_rows, k_folds):
"""randomly split range(n_rows) into k_folds disjoint groups"""
Expand Down Expand Up @@ -59,6 +65,9 @@ def split_plan(self, *, n_rows=None, k_folds=None, data=None, y=None):
raise ValueError("k_folds must not be None")
return k_way_cross_plan(n_rows=n_rows, k_folds=k_folds)

def __repr__(self):
return "vtreat.cross_plan.KWayCrossPlan()"


def k_way_cross_plan_y_stratified(n_rows, k_folds, y):
"""randomly split range(n_rows) into k_folds disjoint groups, attempting an even y-distribution"""
Expand Down Expand Up @@ -116,6 +125,9 @@ def split_plan(self, *, n_rows=None, k_folds=None, data=None, y=None):
raise ValueError("y must not be None")
return k_way_cross_plan_y_stratified(n_rows=n_rows, k_folds=k_folds, y=y)

def __repr__(self):
return "vtreat.cross_plan.KWayCrossPlanYStratified()"


def order_cross_plan(k_folds, order_vector):
"""Build a k_folds cross validation plan based on the ordered series"""
Expand Down Expand Up @@ -170,3 +182,6 @@ def split_plan(self, *, n_rows=None, k_folds=None, data=None, y=None):
raise ValueError("k_folds must not be None")
order_vector = data[self.order_column_name_]
return order_cross_plan(k_folds=k_folds, order_vector=order_vector)

def __repr__(self):
return "vtreat.cross_plan.OrderedCrossPlan()"
12 changes: 12 additions & 0 deletions pkg/build/lib/vtreat/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,15 @@ def transform(self, X):
def fit_transform(self, X, y):
self.fit(X, y)
return self.transform(X)

def __repr__(self):
return ("vtreat.transform.UserTransform("
+ "treatment=" + self.treatment_.__repr__()
+ ") {"
+ "'y_aware_': " + str(self.y_aware_)
+ ", " + "'treatment_': " + str(self.treatment_)
+ ", " + "'incoming_vars_': " + str(self.incoming_vars_)
+ "}")

def __str__(self):
return self.__repr__()
Loading

0 comments on commit e5d8943

Please sign in to comment.