Support FunctionTransformer steps in definition
Allows a user to specify a function defined in gordo_components
to be used by a sklearn FunctionTransformer within a config file
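
For example, a config step for the new capability can be declared as in the test YAML below. A minimal sketch of building a pipeline from such a definition (assuming PyYAML is available and that pipeline_from_definition accepts the parsed list of steps, as the tests suggest):

import yaml
from gordo_components.serializer import pipeline_from_definition

config = """
- sklearn.decomposition.pca.PCA:
    n_components: 2
- sklearn.preprocessing._function_transformer.FunctionTransformer:
    func: gordo_components.model.transformer_funcs.general.multiply_by
    kw_args:
        factor: 1
"""

# 'func' is a dotted path; the serializer resolves it to the actual callable
pipe = pipeline_from_definition(yaml.safe_load(config))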
milesgranger committed Dec 4, 2018
1 parent 72eda92 commit e157da7
Showing 7 changed files with 127 additions and 17 deletions.
Empty file.
27 changes: 27 additions & 0 deletions gordo_components/model/transformer_funcs/general.py
@@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-

"""
Functions to be used within sklearn's FunctionTransformer
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html
Each function SHALL take an X, and optionally a y.
Functions CAN take additional arguments which should be given during the initialization of the FunctionTransformer
Example:
>>> from sklearn.preprocessing import FunctionTransformer
>>> import numpy as np
>>> def my_function(X, another_arg):
... # Some fancy X manipulation...
... return X
>>> transformer = FunctionTransformer(func=my_function, kw_args={'another_arg': 'this thing'})
>>> out = transformer.fit_transform(np.random.random(100).reshape(10, 10))
"""


def multiply_by(X, factor):
"""
Multiplies X by a given factor
"""
return X * factor
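
For reference, a minimal usage sketch of the helper above, mirroring the new tests added further down in this commit:

import numpy as np
from sklearn.preprocessing import FunctionTransformer
from gordo_components.model.transformer_funcs.general import multiply_by

# kw_args supplies the extra 'factor' argument at transform time
tf = FunctionTransformer(func=multiply_by, kw_args={'factor': 2})
out = tf.fit_transform(np.arange(9).reshape(3, 3))  # every element doubled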
10 changes: 10 additions & 0 deletions gordo_components/serializer/pipeline_from_definition.py
@@ -5,6 +5,7 @@
import copy
from typing import List, Union, Dict, Any, Optional, Iterable
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator


@@ -130,6 +131,15 @@ def _build_step(step: Union[str, Dict[str, Dict[str, Any]]]
f'Got {StepClass} but the supplied parameters '
f'seem invalid: {params}')

# FunctionTransformer needs to have its `func` param loaded from
# gordo_components
elif StepClass == FunctionTransformer:
for func_arg in ['func', 'inverse_func']:
if params.get(func_arg) is not None:
func = pydoc.locate(params[func_arg])
if func is None:
raise ValueError(f'Was unable to locate function: {params[func_arg]}')
params[func_arg] = func
return StepClass(**params)

# If step is just a string, can initialize it without any params
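
The resolution step above can be shown in isolation; a small sketch of how a dotted-path string from the config becomes the callable handed to FunctionTransformer (pydoc.locate returns None when the path cannot be resolved):

import pydoc

path = 'gordo_components.model.transformer_funcs.general.multiply_by'
func = pydoc.locate(path)  # -> the multiply_by function, or None if not found
if func is None:
    raise ValueError(f'Was unable to locate function: {path}')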
11 changes: 9 additions & 2 deletions gordo_components/serializer/pipeline_into_definition.py
@@ -2,9 +2,10 @@

import inspect
import logging
from typing import Iterable
from typing import Iterable, Any, Dict, Union, List

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer


logger = logging.getLogger(__name__)
@@ -52,7 +53,7 @@ def _decompose_node(step: object, prune_default_params: bool=False):
import_str = f'{step.__module__}.{step.__class__.__name__}'
init_params = inspect.getfullargspec(step.__class__.__init__).args

params = dict()
params = dict() # type: Dict[str, Union[str, int, float, List[Dict[str, Dict[str, Union[str, int, float]]]]]]

for param in [p for p in init_params if p != 'self']:

@@ -67,6 +68,12 @@
and param in ['steps', 'transformer_list'] \
and any(isinstance(step, Obj) for Obj in [FeatureUnion, Pipeline]):
params[param] = [_decompose_node(leaf[1]) for leaf in param_val]

# Handle FunctionTransformer function object type parameters
elif isinstance(step, FunctionTransformer) and param in ['func', 'inverse_func'] and callable(param_val):
# param_val is a function for FunctionTransformer.func init param
params[param] = f'{param_val.__module__}.{param_val.__name__}'

else:
params[param] = param_val
params = _prune_default_parameters(step, params) if prune_default_params else params
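
The serializer's other direction is the mirror image; a sketch of how a FunctionTransformer's func callable is turned back into the dotted-path string stored in the definition:

from sklearn.preprocessing import FunctionTransformer
from gordo_components.model.transformer_funcs.general import multiply_by

step = FunctionTransformer(func=multiply_by)
dotted = f'{step.func.__module__}.{step.func.__name__}'
# -> 'gordo_components.model.transformer_funcs.general.multiply_by'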
44 changes: 31 additions & 13 deletions tests/test_serializer_from_definition.py
@@ -7,10 +7,11 @@

from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import MinMaxScaler
from gordo_components.model.models import KerasAutoEncoder
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer

from gordo_components.model.models import KerasAutoEncoder
from gordo_components.serializer import pipeline_from_definition
import gordo_components.model.transformer_funcs.general


logger = logging.getLogger(__name__)
@@ -34,6 +35,10 @@ def setUp(self):
tol: 0.0
iterated_power: auto
random_state:
- sklearn.preprocessing._function_transformer.FunctionTransformer:
func: gordo_components.model.transformer_funcs.general.multiply_by
kw_args:
factor: 1
- sklearn.pipeline.FeatureUnion:
transformer_list:
- sklearn.decomposition.pca.PCA:
@@ -70,6 +75,10 @@ def setUp(self):
steps:
- sklearn.decomposition.pca.PCA:
n_components: 2
- sklearn.preprocessing._function_transformer.FunctionTransformer:
func: gordo_components.model.transformer_funcs.general.multiply_by
kw_args:
factor: 1
- sklearn.pipeline.FeatureUnion:
- sklearn.decomposition.pca.PCA:
n_components: 3
@@ -94,6 +103,10 @@ def setUp(self):
tol: 0.0
iterated_power: auto
random_state:
- sklearn.preprocessing._function_transformer.FunctionTransformer:
func: gordo_components.model.transformer_funcs.general.multiply_by
kw_args:
factor: 1
- sklearn.pipeline.FeatureUnion:
transformer_list:
- sklearn.decomposition.pca.PCA:
@@ -140,7 +153,7 @@ def test_pipeline_from_definition(self):
# Special tests that defining non-default argument holds for a
# 'key: ' is evaled to 'key=None'
if 'memory: /tmp' in raw_yaml:
self.assertEqual(pipe.steps[1][1].transformer_list[1][1].memory, '/tmp')
self.assertEqual(pipe.steps[2][1].transformer_list[1][1].memory, '/tmp')
self._verify_pipe(pipe)

def _verify_pipe(self, pipe):
@@ -155,16 +168,21 @@ def _verify_pipe(self, pipe):
self.assertIsInstance(step1, PCA)
self.assertEqual(step1.n_components, 2)

# STEP 2 TEST: Test expected FeatureUnion Step
# STEP 2 TEST: Test expected FunctionTransformer step
step2 = pipe.steps[1][1]
self.assertIsInstance(step2, FeatureUnion)
self.assertIsInstance(step2, FunctionTransformer)
self.assertEqual(step2.func, gordo_components.model.transformer_funcs.general.multiply_by)

# STEP 3 TEST: Test expected FeatureUnion Step
step3 = pipe.steps[2][1]
self.assertIsInstance(step3, FeatureUnion)

# First transformer of feature_transformers should be PCA(n_components=3)
self.assertIsInstance(step2.transformer_list[0][1], PCA)
self.assertEqual(step2.transformer_list[0][1].n_components, 3)
self.assertIsInstance(step3.transformer_list[0][1], PCA)
self.assertEqual(step3.transformer_list[0][1].n_components, 3)

# Second transformer in feature_transformers should be Pipeline
sub_pipeline = step2.transformer_list[1][1]
# Third transformer in feature_transformers should be Pipeline
sub_pipeline = step3.transformer_list[1][1]
self.assertIsInstance(sub_pipeline, Pipeline)

# First step in the sub pipeline is MinMaxScalar
@@ -174,7 +192,7 @@
self.assertIsInstance(sub_pipeline.steps[1][1], TruncatedSVD)
self.assertEqual(sub_pipeline.steps[1][1].n_components, 2)

# STEP 3 TEST: Finally, the last step should be a KerasBaseEstimator
step3 = pipe.steps[2][1]
self.assertIsInstance(step3, KerasAutoEncoder)
self.assertTrue(step3.kind, 'feedforward_symetric')
# STEP 4 TEST: Finally, the last step should be a KerasModel
step4 = pipe.steps[3][1]
self.assertIsInstance(step4, KerasAutoEncoder)
self.assertTrue(step4.kind, 'feedforward_symetric')
13 changes: 11 additions & 2 deletions tests/test_serializer_into_definition.py
@@ -7,7 +7,7 @@

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer

from gordo_components.model.models import KerasAutoEncoder
from gordo_components.serializer import pipeline_into_definition, pipeline_from_definition
@@ -135,6 +135,7 @@ def test_into_from(self):
"""
Pass Pipeline into definition, and then from that definition
"""
from gordo_components.model.transformer_funcs.general import multiply_by
pipe = Pipeline([
('step_0', PCA(n_components=2)),
('step_1', FeatureUnion([
@@ -144,7 +145,8 @@
('step_1', TruncatedSVD(n_components=2))
]))
])),
('step_2', KerasAutoEncoder(kind='feedforward_symetric'))
('step_2', FunctionTransformer(func=multiply_by, kw_args={'factor': 1})),
('step_3', KerasAutoEncoder(kind='feedforward_symetric'))
])

pipeline_from_definition(pipeline_into_definition(pipe))
@@ -165,6 +167,13 @@ def test_from_into(self):
tol: 0.0
iterated_power: auto
random_state:
- sklearn.preprocessing._function_transformer.FunctionTransformer:
func: gordo_components.model.transformer_funcs.general.multiply_by
kw_args:
factor: 1
inverse_func: gordo_components.model.transformer_funcs.general.multiply_by
inv_kw_args:
factor: 1
- sklearn.pipeline.FeatureUnion:
transformer_list:
- sklearn.decomposition.pca.PCA:
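
A condensed round-trip sketch of what test_into_from exercises: a pipeline containing a FunctionTransformer step survives serialization into a definition and back:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from gordo_components.serializer import pipeline_into_definition, pipeline_from_definition
from gordo_components.model.transformer_funcs.general import multiply_by

pipe = Pipeline([
    ('multiply', FunctionTransformer(func=multiply_by, kw_args={'factor': 2})),
])
definition = pipeline_into_definition(pipe)     # func serialized as a dotted path
rebuilt = pipeline_from_definition(definition)  # dotted path resolved back to multiply_by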
39 changes: 39 additions & 0 deletions tests/test_transformers.py
@@ -0,0 +1,39 @@
# -*- coding: utf-8 -*-

import unittest
import numpy as np

from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA


class GordoFunctionTransformerFuncsTestCase(unittest.TestCase):
"""
Test that all functions within gordo_components meant for use in a Scikit-Learn
FunctionTransformer work as expected
"""

def _validate_transformer(self, transformer):
"""
Inserts a transformer into the middle of a pipeline and runs it
"""
pipe = Pipeline([
('pca1', PCA()),
('custom', transformer),
('pca2', PCA())
])
X = np.random.random(size=100).reshape(10, 10)
pipe.fit_transform(X)

def test_multiply_by_function_transformer(self):
from gordo_components.model.transformer_funcs.general import multiply_by

# Provide the required argument
tf = FunctionTransformer(func=multiply_by, kw_args={'factor': 2})
self._validate_transformer(tf)

# Omit the required argument
tf = FunctionTransformer(func=multiply_by)
with self.assertRaises(TypeError):
self._validate_transformer(tf)
