Skip to content

Commit

Permalink
Feature/EnhancePreprocessing (#99)
Browse files Browse the repository at this point in the history
* update pipeline test to have standard scaler transformer

* `preprocessing` Transporter

* add StandardScaler

* enhance import

* add preprocessing dict

* add `PreprocessingTransporter` to transporting functions (serialize, deserialize)

* update docstring

* remove trailing whitespace

* raise concrete Exception

* `CHANGELOG.md` updated

* init preprocessing parameters

* remove concrete preprocessing transporters & add abstract preprocessing transporter

* Preprocessing Aggregator Transporter

* remove concrete preprocessing transporters & add abstract preprocessing transporter

* handle old scikit versions

* refactor numpy import

* generalize `OneHotEncoder` Transporting

* add transporting of numpy types themselves (for `OneHotEncoder`)

* remove concrete preprocessing transporters

* enhance according to codacy feedback

* `CHANGELOG.md` updated

* `SUPPORTED_MODELS.md` updated

* remove comments
  • Loading branch information
AHReccese authored May 2, 2024
1 parent 01be2d7 commit 050c861
Show file tree
Hide file tree
Showing 17 changed files with 181 additions and 253 deletions.
8 changes: 6 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

## [Unreleased]
### Added
- `StandardScaler` Transformer in `pymilo_param.py`
- `PreprocessingTransporter` Transporter
- ndarray shape config in `GeneralDataStructure` Transporter
- `util.py` in chains
- `BinMapperTransporter` Transporter
- `BunchTransporter` Transporter
- `GeneratorTransporter` Transporter
- `LabelEncoderTransporter` Transporter
- `OneHotEncoderTransporter` Transporter
- `TreePredictorTransporter` Transporter
- `AdaboostClassifier` model
- `AdaboostRegressor` model
Expand All @@ -37,6 +37,10 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.
- Ensemble chain
- `SECURITY.md`
### Changed
- `Pipeline` test updated
- `LabelBinarizer`, `LabelEncoder` and `OneHotEncoder` got embedded in `PreprocessingTransporter`
- Preprocessing support added to Ensemble chain
- Preprocessing params initialized in `pymilo_param`
- `util.py` in utils updated
- `test_pymilo.py` updated
- `pymilo_func.py` updated
Expand Down
5 changes: 5 additions & 0 deletions SUPPORTED_MODELS.md
Original file line number Diff line number Diff line change
Expand Up @@ -630,4 +630,9 @@
<td><b>LabelEncoder</b></td>
<td>>=0.8</td>
</tr>
<tr align="center">
<td>4</td>
<td><b>StandardScaler</b></td>
<td>>=0.8</td>
</tr>
</table>
2 changes: 2 additions & 0 deletions pymilo/chains/clustering_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from ..transporters.general_data_structure_transporter import GeneralDataStructureTransporter
from ..transporters.function_transporter import FunctionTransporter
from ..transporters.cfnode_transporter import CFNodeTransporter
from ..transporters.preprocessing_transporter import PreprocessingTransporter

from ..utils.util import get_sklearn_type

Expand All @@ -15,6 +16,7 @@

bisecting_kmeans_support = SKLEARN_CLUSTERING_TABLE["BisectingKMeans"] != NOT_SUPPORTED
CLUSTERING_CHAIN = {
"PreprocessingTransporter": PreprocessingTransporter(),
"GeneralDataStructureTransporter": GeneralDataStructureTransporter(),
"FunctionTransporter": FunctionTransporter(),
"CFNodeTransporter": CFNodeTransporter(),
Expand Down
2 changes: 2 additions & 0 deletions pymilo/chains/decision_tree_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from ..transporters.general_data_structure_transporter import GeneralDataStructureTransporter
from ..transporters.tree_transporter import TreeTransporter
from ..transporters.randomstate_transporter import RandomStateTransporter
from ..transporters.preprocessing_transporter import PreprocessingTransporter

from ..utils.util import get_sklearn_type

Expand All @@ -16,6 +17,7 @@


DECISION_TREE_CHAIN = {
"PreprocessingTransporter": PreprocessingTransporter(),
"GeneralDataStructureTransporter": GeneralDataStructureTransporter(),
"RandomStateTransporter": RandomStateTransporter(),
"TreeTransporter": TreeTransporter(),
Expand Down
26 changes: 16 additions & 10 deletions pymilo/chains/ensemble_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,11 @@
from ..transporters.general_data_structure_transporter import GeneralDataStructureTransporter
from ..transporters.randomstate_transporter import RandomStateTransporter
from ..transporters.lossfunction_transporter import LossFunctionTransporter
from ..transporters.onehotencoder_transporter import OneHotEncoderTransporter
from ..transporters.bunch_transporter import BunchTransporter
from ..transporters.labelencoder_transporter import LabelEncoderTransporter
from ..transporters.generator_transporter import GeneratorTransporter
from ..transporters.treepredictor_transporter import TreePredictorTransporter
from ..transporters.binmapper_transporter import BinMapperTransporter
from ..transporters.preprocessing_transporter import PreprocessingTransporter

from ..pymilo_param import SKLEARN_ENSEMBLE_TABLE

Expand All @@ -27,14 +26,13 @@
import copy

ENSEMBLE_CHAIN = {
"PreprocessingTransporter": PreprocessingTransporter(),
"GeneralDataStructureTransporter": GeneralDataStructureTransporter(),
"TreePredictorTransporter": TreePredictorTransporter(),
"BinMapperTransporter": BinMapperTransporter(),
"GeneratorTransporter": GeneratorTransporter(),
"RandomStateTransporter": RandomStateTransporter(),
"LossFunctionTransporter": LossFunctionTransporter(),
"OneHotEncoderTransporter": OneHotEncoderTransporter(),
"LabelEncoderTransporter": LabelEncoderTransporter(),
"BunchTransporter": BunchTransporter(),
}

Expand Down Expand Up @@ -166,14 +164,18 @@ def serialize_ensemble(ensemble_object):
for key, value in ensemble_object.__dict__.items():
if isinstance(value, list):
has_inner_tuple_with_ml_model = False
pt = PreprocessingTransporter()
for idx, item in enumerate(value):
if isinstance(item, tuple):
listed_tuple = list(item)
for inner_idx, inner_item in enumerate(listed_tuple):
has_inner_model, result = serialize_possible_ml_model(inner_item)
if has_inner_model:
has_inner_tuple_with_ml_model = True
listed_tuple[inner_idx] = result
if pt.is_preprocessing_module(inner_item):
listed_tuple[inner_idx] = pt.serialize_pre_module(inner_item)
else:
has_inner_model, result = serialize_possible_ml_model(inner_item)
if has_inner_model:
has_inner_tuple_with_ml_model = True
listed_tuple[inner_idx] = result
value[idx] = listed_tuple
else:
value[idx] = serialize_possible_ml_model(item)[1]
Expand Down Expand Up @@ -325,12 +327,16 @@ def deserialize_ensemble(ensemble, is_inner_model=False):
value) and value["pymiloed-data-structure"] == "list of (str, estimator) tuples":
listed_tuples = value["pymiloed-data"]
list_of_tuples = []
pt = PreprocessingTransporter()
for listed_tuple in listed_tuples:
name, serialized_ml_model = listed_tuple
name, serialized_model = listed_tuple
retrieved_model = pt.deserialize_pre_module(serialized_model) if pt.is_preprocessing_module(
serialized_model) else deserialize_possible_ml_model(serialized_model)[1]
list_of_tuples.append(
(name, deserialize_possible_ml_model(serialized_ml_model)[1])
(name, retrieved_model)
)
data[key] = list_of_tuples

elif GeneralDataStructureTransporter().is_deserialized_ndarray(value):
has_inner_model, result = deserialize_models_in_ndarray(value)
if has_inner_model:
Expand Down
9 changes: 5 additions & 4 deletions pymilo/chains/linear_model_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from ..transporters.general_data_structure_transporter import GeneralDataStructureTransporter
from ..transporters.baseloss_transporter import BaseLossTransporter
from ..transporters.lossfunction_transporter import LossFunctionTransporter
from ..transporters.labelbinarizer_transporter import LabelBinarizerTransporter
from ..transporters.preprocessing_transporter import PreprocessingTransporter

from ..pymilo_param import SKLEARN_LINEAR_MODEL_TABLE
from ..utils.util import get_sklearn_type, is_iterable
Expand All @@ -16,10 +16,11 @@


LINEAR_MODEL_CHAIN = {
"PreprocessingTransporter": PreprocessingTransporter(),
"GeneralDataStructureTransporter": GeneralDataStructureTransporter(),
"BaseLossTransporter": BaseLossTransporter(),
"LossFunctionTransporter": LossFunctionTransporter(),
"LabelBinarizerTransporter": LabelBinarizerTransporter()}
}


def is_linear_model(model):
Expand Down Expand Up @@ -101,9 +102,9 @@ def serialize_linear_model(linear_model_object):
for key in linear_model_object.__dict__:
if is_linear_model(linear_model_object.__dict__[key]):
linear_model_object.__dict__[key] = {
"pymilo-inner-model-data": transport_linear_model(linear_model_object.__dict__[key], Command.SERIALIZE),
"pymilo-inner-model-data": transport_linear_model(linear_model_object.__dict__[key], Command.SERIALIZE, True),
"pymilo-inner-model-type": get_sklearn_type(linear_model_object.__dict__[key]),
"by-pass": True
"pymilo-by-pass": True
}
# now serializing non-linear model fields
for transporter in LINEAR_MODEL_CHAIN:
Expand Down
2 changes: 2 additions & 0 deletions pymilo/chains/naive_bayes_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from ..transporters.transporter import Command

from ..transporters.general_data_structure_transporter import GeneralDataStructureTransporter
from ..transporters.preprocessing_transporter import PreprocessingTransporter

from ..pymilo_param import SKLEARN_NAIVE_BAYES_TABLE
from ..exceptions.serialize_exception import PymiloSerializationException, SerilaizatoinErrorTypes
Expand All @@ -13,6 +14,7 @@
from traceback import format_exc

# Transporter registry for the naive bayes chain: maps each transporter's
# class name to a ready-made instance of it.
# "PreprocessingTransporter" is the first entry, so it comes first whenever
# this dict is iterated (dicts keep insertion order on Python 3.7+).
# NOTE(review): presumably the chain walks this dict in order during
# serialization/deserialization -- confirm against the rest of this module.
NAIVE_BAYES_CHAIN = {
"PreprocessingTransporter": PreprocessingTransporter(),
"GeneralDataStructureTransporter": GeneralDataStructureTransporter(),
}

Expand Down
2 changes: 2 additions & 0 deletions pymilo/chains/neighbours_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from ..transporters.general_data_structure_transporter import GeneralDataStructureTransporter
from ..transporters.neighbors_tree_transporter import NeighborsTreeTransporter
from ..transporters.preprocessing_transporter import PreprocessingTransporter

from ..pymilo_param import SKLEARN_NEIGHBORS_TABLE
from ..exceptions.serialize_exception import PymiloSerializationException, SerilaizatoinErrorTypes
Expand All @@ -14,6 +15,7 @@
from traceback import format_exc

# Transporter registry for the neighbors chain: maps each transporter's
# class name to a ready-made instance of it.
# "PreprocessingTransporter" is the first entry, so it comes first whenever
# this dict is iterated (dicts keep insertion order on Python 3.7+).
# NOTE(review): presumably the chain walks this dict in order during
# serialization/deserialization -- confirm against the rest of this module.
NEIGHBORS_CHAIN = {
"PreprocessingTransporter": PreprocessingTransporter(),
"GeneralDataStructureTransporter": GeneralDataStructureTransporter(),
"NeighborsTreeTransporter": NeighborsTreeTransporter(),
}
Expand Down
4 changes: 2 additions & 2 deletions pymilo/chains/neural_network_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from ..transporters.randomstate_transporter import RandomStateTransporter
from ..transporters.sgdoptimizer_transporter import SGDOptimizerTransporter
from ..transporters.adamoptimizer_transporter import AdamOptimizerTransporter
from ..transporters.labelbinarizer_transporter import LabelBinarizerTransporter
from ..transporters.preprocessing_transporter import PreprocessingTransporter

from ..pymilo_param import SKLEARN_NEURAL_NETWORK_TABLE

Expand All @@ -19,11 +19,11 @@


NEURAL_NETWORK_CHAIN = {
"PreprocessingTransporter": PreprocessingTransporter(),
"GeneralDataStructureTransporter": GeneralDataStructureTransporter(),
"RandomStateTransporter": RandomStateTransporter(),
"SGDOptimizer": SGDOptimizerTransporter(),
"AdamOptimizerTransporter": AdamOptimizerTransporter(),
"LabelBinarizerTransporter": LabelBinarizerTransporter(),
}


Expand Down
2 changes: 2 additions & 0 deletions pymilo/chains/svm_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from ..transporters.general_data_structure_transporter import GeneralDataStructureTransporter
from ..transporters.randomstate_transporter import RandomStateTransporter
from ..transporters.preprocessing_transporter import PreprocessingTransporter

from ..pymilo_param import SKLEARN_SVM_TABLE
from ..exceptions.serialize_exception import PymiloSerializationException, SerilaizatoinErrorTypes
Expand All @@ -14,6 +15,7 @@
from traceback import format_exc

# Transporter registry for the SVM chain: maps each transporter's class name
# to a ready-made instance of it.
# "PreprocessingTransporter" is the first entry, so it comes first whenever
# this dict is iterated (dicts keep insertion order on Python 3.7+).
# NOTE(review): presumably the chain walks this dict in order during
# serialization/deserialization -- confirm against the rest of this module.
SVM_CHAIN = {
"PreprocessingTransporter": PreprocessingTransporter(),
"GeneralDataStructureTransporter": GeneralDataStructureTransporter(),
"RandomStateTransporter": RandomStateTransporter(),
}
Expand Down
40 changes: 20 additions & 20 deletions pymilo/pymilo_param.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,6 @@
# -*- coding: utf-8 -*-
"""Parameters and constants."""
from numpy import uint8
from numpy import intc
from numpy import inf
from numpy import float64
from numpy import int32
from numpy import int64
from numpy import uint64
from sklearn.preprocessing import LabelBinarizer

import numpy as np
import sklearn.linear_model as linear_model
import sklearn.neural_network as neural_network
import sklearn.tree as tree
Expand All @@ -20,7 +12,7 @@
import sklearn.dummy as dummy
import sklearn.ensemble as ensemble
import sklearn.pipeline as pipeline

import sklearn.preprocessing as preprocessing

quantile_regressor_support = False
try:
Expand Down Expand Up @@ -205,24 +197,32 @@
"Pipeline": pipeline.Pipeline,
}

# Lookup table from a preprocessing class name to the matching class in
# `sklearn.preprocessing` (imported in this module as `preprocessing`).
# NOTE(review): presumably used to recognise and rebuild preprocessing
# modules by name during transport -- confirm against PreprocessingTransporter.
SKLEARN_PREPROCESSING_TABLE = {
"StandardScaler": preprocessing.StandardScaler,
"OneHotEncoder": preprocessing.OneHotEncoder,
"LabelBinarizer": preprocessing.LabelBinarizer,
"LabelEncoder": preprocessing.LabelEncoder,
}

KEYS_NEED_PREPROCESSING_BEFORE_DESERIALIZATION = {
"_label_binarizer": LabelBinarizer, # in Ridge Classifier
"active_": int32, # in Lasso Lars
"n_nonzero_coefs_": int64, # in OMP-CV
"_label_binarizer": preprocessing.LabelBinarizer, # in Ridge Classifier
"active_": np.int32, # in Lasso Lars
"n_nonzero_coefs_": np.int64, # in OMP-CV
"scores_": dict, # in Logistic Regression CV,
"_base_loss": {}, # BaseLoss in Logistic Regression,
"loss_function_": {}, # LossFunction in SGD Classifier,
"estimator_": {}, # LinearRegression model inside RANSAC
}

NUMPY_TYPE_DICT = {
"numpy.intc": intc,
"numpy.int32": int32,
"numpy.int64": int64,
"numpy.float64": float64,
"numpy.infinity": lambda _: inf,
"numpy.uint8": uint8,
"numpy.uint64": uint64,
"numpy.intc": np.intc,
"numpy.int32": np.int32,
"numpy.int64": np.int64,
"numpy.float64": np.float64,
"numpy.infinity": lambda _: np.inf,
"numpy.uint8": np.uint8,
"numpy.uint64": np.uint64,
"numpy.dtype": np.dtype,
}

EXPORTED_MODELS_PATH = {
Expand Down
17 changes: 13 additions & 4 deletions pymilo/transporters/general_data_structure_transporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,16 @@ def serialize(self, data, key, model_type):
:type model_type: str
:return: pymilo serialized output of data[key]
"""
if isinstance(data[key], type):
raw_type = str(data[key])
raw_type = "numpy" + str(raw_type).split("numpy")[-1][:-2]
if raw_type in NUMPY_TYPE_DICT.keys():
data[key] = {
"np-type": "numpy.dtype",
"value": raw_type
}
# 1. Handling numpy infinity, ransac
if isinstance(data[key], np.float64):
elif isinstance(data[key], np.float64):
if np.inf == data[key]:
data[key] = {
"np-type": "numpy.infinity",
Expand Down Expand Up @@ -209,7 +217,7 @@ def get_deserialized_dict(self, content):
return self.deep_deserialize_ndarray(content)

if check_str_in_iterable("np-type", content) and check_str_in_iterable("value", content):
return NUMPY_TYPE_DICT[content["np-type"]](content["value"])
return self.get_deserialized_regular_primary_types(content)

for key in content:

Expand Down Expand Up @@ -271,6 +279,8 @@ def get_deserialized_regular_primary_types(self, content):
:return: the associated np.int32|np.int64|np.inf
"""
if "np-type" in content:
if content["np-type"] == "numpy.dtype":
return NUMPY_TYPE_DICT[content["np-type"]](NUMPY_TYPE_DICT[content['value']])
return NUMPY_TYPE_DICT[content["np-type"]](content['value'])

def is_numpy_primary_type(self, content):
Expand Down Expand Up @@ -359,8 +369,7 @@ def deserialize_primitive_type(self, primitive):
if is_primitive(primitive):
return primitive
elif check_str_in_iterable("np-type", primitive):
return NUMPY_TYPE_DICT[primitive["np-type"]
](primitive['value'])
return self.get_deserialized_regular_primary_types(primitive)
else:
return primitive

Expand Down
Loading

0 comments on commit 050c861

Please sign in to comment.