From a409d767b8397a0da6738022a44d77388b48dfc9 Mon Sep 17 00:00:00 2001 From: Yusik Kim Date: Wed, 2 Nov 2022 19:42:13 +0100 Subject: [PATCH 1/3] Implemented trxf scorecard reader for PMML export Signed-off-by: Yusik Kim --- .../trxf/pmml_export/__init__.py | 2 +- .../trxf/pmml_export/pmml_exporter.py | 9 ++- .../trxf/pmml_export/reader/__init__.py | 2 +- ...{trxf_reader.py => trxf_ruleset_reader.py} | 52 ++------------ .../reader/trxf_scorecard_reader.py | 68 +++++++++++++++++++ .../trxf/pmml_export/utilities.py | 51 ++++++++++++++ .../rule_induction/trxf/scorecard/bins.py | 13 +++- .../trxf/pmml_export/test_pmml_exporter.py | 12 ++-- .../trxf/pmml_export/test_trxf_reader.py | 19 ++++-- tests/rule_induction/trxf/utilities.py | 39 ++++++++++- 10 files changed, 198 insertions(+), 69 deletions(-) rename aix360/algorithms/rule_induction/trxf/pmml_export/reader/{trxf_reader.py => trxf_ruleset_reader.py} (55%) create mode 100644 aix360/algorithms/rule_induction/trxf/pmml_export/reader/trxf_scorecard_reader.py create mode 100644 aix360/algorithms/rule_induction/trxf/pmml_export/utilities.py diff --git a/aix360/algorithms/rule_induction/trxf/pmml_export/__init__.py b/aix360/algorithms/rule_induction/trxf/pmml_export/__init__.py index 5fc9a2e..5b86d0f 100644 --- a/aix360/algorithms/rule_induction/trxf/pmml_export/__init__.py +++ b/aix360/algorithms/rule_induction/trxf/pmml_export/__init__.py @@ -1,4 +1,4 @@ from .__version__ import version -from .reader import AbstractReader, TrxfReader +from .reader import AbstractReader, TrxfRuleSetReader from .serializer import AbstractSerializer, NyokaSerializer from .pmml_exporter import PmmlExporter diff --git a/aix360/algorithms/rule_induction/trxf/pmml_export/pmml_exporter.py b/aix360/algorithms/rule_induction/trxf/pmml_export/pmml_exporter.py index 5b6202b..28d1a1e 100644 --- a/aix360/algorithms/rule_induction/trxf/pmml_export/pmml_exporter.py +++ b/aix360/algorithms/rule_induction/trxf/pmml_export/pmml_exporter.py @@ -1,4 +1,3 @@ -from aix360.algorithms.rule_induction.trxf.classifier.ruleset_classifier import RuleSetClassifier from aix360.algorithms.rule_induction.trxf.pmml_export import AbstractSerializer, AbstractReader @@ -7,12 +6,12 @@ def __init__(self, reader: AbstractReader, serializer: AbstractSerializer): self._serializer = serializer self._reader = reader - def export(self, trxf_classifier: RuleSetClassifier): + def export(self, trxf_input): """ - Translate a given TRXF RuleSetClassifier to a PMML string - @param trxf_classifier: A TRXF RuleSetClassifier + Translate a given TRXF RuleSetClassifier or Scorecard to a PMML string + @param trxf_input: A TRXF RuleSetClassifier or Scorecard object @return: The corresponding PMML string """ if self._reader.data_dictionary is None: raise AssertionError("Missing data dictionary in reader object") - return self._serializer.serialize(self._reader.read(trxf_classifier)) + return self._serializer.serialize(self._reader.read(trxf_input)) diff --git a/aix360/algorithms/rule_induction/trxf/pmml_export/reader/__init__.py b/aix360/algorithms/rule_induction/trxf/pmml_export/reader/__init__.py index 317f62e..4ccd59a 100644 --- a/aix360/algorithms/rule_induction/trxf/pmml_export/reader/__init__.py +++ b/aix360/algorithms/rule_induction/trxf/pmml_export/reader/__init__.py @@ -1,2 +1,2 @@ from .abstract_reader import AbstractReader -from .trxf_reader import TrxfReader +from .trxf_ruleset_reader import TrxfRuleSetReader diff --git a/aix360/algorithms/rule_induction/trxf/pmml_export/reader/trxf_reader.py b/aix360/algorithms/rule_induction/trxf/pmml_export/reader/trxf_ruleset_reader.py similarity index 55% rename from aix360/algorithms/rule_induction/trxf/pmml_export/reader/trxf_reader.py rename to aix360/algorithms/rule_induction/trxf/pmml_export/reader/trxf_ruleset_reader.py index ba60510..1a8e19d 100644 --- a/aix360/algorithms/rule_induction/trxf/pmml_export/reader/trxf_reader.py +++ b/aix360/algorithms/rule_induction/trxf/pmml_export/reader/trxf_ruleset_reader.py @@ -1,19 +1,13 @@ -from typing import Dict - -import numpy as np import pandas as pd from aix360.algorithms.rule_induction.trxf.classifier import ruleset_classifier from aix360.algorithms.rule_induction.trxf.classifier.ruleset_classifier import RuleSetClassifier -from aix360.algorithms.rule_induction.trxf.core import Conjunction, Relation from aix360.algorithms.rule_induction.trxf.pmml_export import models -from aix360.algorithms.rule_induction.trxf.pmml_export.models.data_dictionary import Value from aix360.algorithms.rule_induction.trxf.pmml_export.reader import AbstractReader -from aix360.algorithms.rule_induction.trxf.pmml_export.models import SimplePredicate, Operator, CompoundPredicate, \ - BooleanOperator +from aix360.algorithms.rule_induction.trxf.pmml_export.utilities import extract_data_dictionary, trxf_to_pmml_predicate -class TrxfReader(AbstractReader): +class TrxfRuleSetReader(AbstractReader): def __init__(self, data_dictionary=None): self._data_dictionary = data_dictionary @@ -48,35 +42,13 @@ def load_data_dictionary(self, X: pd.DataFrame, values: Dict = None): @param X: Input dataframe @param values: A dict mapping column name to a list of possible categorical values. It will be inferred from X if not provided. """ - dtypes = X.dtypes - data_fields = [] - for index, value in dtypes.items(): - vals = None - if np.issubdtype(value, np.integer): - data_type = models.DataType.integer - op_type = models.OpType.ordinal - elif np.issubdtype(value, np.double): - data_type = models.DataType.double - op_type = models.OpType.continuous - elif np.issubdtype(value, np.floating): - data_type = models.DataType.float - op_type = models.OpType.continuous - elif np.issubdtype(value, np.bool_): - data_type = models.DataType.boolean - op_type = models.OpType.categorical - else: - data_type = models.DataType.string - op_type = models.OpType.categorical - vals = values[index] if values is not None and index in values else list(X[index].unique()) - wrapped_vals = list(map(lambda v: Value(v), vals)) if vals is not None else vals - data_fields.append(models.DataField(name=str(index), optype=op_type, dataType=data_type, values=wrapped_vals)) - self._data_dictionary = models.DataDictionary(data_fields) + self._data_dictionary = extract_data_dictionary(X) def _convert_to_simple_rules(trxf_rules): simple_rules = [] for rule in trxf_rules: - predicate = _convert_to_pmml_predicate(rule.conjunction) + predicate = trxf_to_pmml_predicate(rule.conjunction) confidence = rule.confidence if rule.confidence is not None else models.DEFAULT_CONFIDENCE weight = rule.weight if rule.weight is not None else models.DEFAULT_WEIGHT simple_rule = models.SimpleRule(predicate=predicate, score=str(rule.label), id=str(rule.conjunction), @@ -86,22 +58,6 @@ def _convert_to_simple_rules(trxf_rules): return simple_rules -def _convert_to_pmml_predicate(trxf_conjunction: Conjunction): - trxf_to_pmml_op = { - Relation.EQ: Operator.equal, - Relation.NEQ: Operator.notEqual, - Relation.LT: Operator.lessThan, - Relation.LE: Operator.lessOrEqual, - Relation.GT: Operator.greaterThan, - Relation.GE: Operator.greaterOrEqual - } - simple_predicates = [SimplePredicate(operator=trxf_to_pmml_op[trxf_predicate.relation], - value=str(trxf_predicate.value), - field=str(trxf_predicate.feature.variable_names[0])) - for trxf_predicate in trxf_conjunction.predicates] - return CompoundPredicate(simplePredicates=simple_predicates, booleanOperator=BooleanOperator.and_) - - def _extract_mining_schema(trxf_rules): mining_fields = {} for rule in trxf_rules: diff --git a/aix360/algorithms/rule_induction/trxf/pmml_export/reader/trxf_scorecard_reader.py b/aix360/algorithms/rule_induction/trxf/pmml_export/reader/trxf_scorecard_reader.py new file mode 100644 index 0000000..23b6866 --- /dev/null +++ b/aix360/algorithms/rule_induction/trxf/pmml_export/reader/trxf_scorecard_reader.py @@ -0,0 +1,68 @@ +import pandas as pd + +from aix360.algorithms.rule_induction.trxf import scorecard +from aix360.algorithms.rule_induction.trxf.pmml_export import models +from aix360.algorithms.rule_induction.trxf.pmml_export.reader import AbstractReader +from aix360.algorithms.rule_induction.trxf.pmml_export.utilities import extract_data_dictionary, trxf_to_pmml_predicate + + +class TrxfScorecardReader(AbstractReader): + def __init__(self, data_dictionary=None): + self._data_dictionary = data_dictionary + + @property + def data_dictionary(self): + return self._data_dictionary + + def read(self, trxf_scorecard: scorecard.Scorecard) -> models.Scorecard: + """ + Translate a TRXF Scorecard to an internal Scorecard + """ + mining_schema = _extract_mining_schema(trxf_scorecard.features) + output = models.Output([models.OutputField(name='RawResult', + feature='predictedValue', + dataType=models.DataType.double, + optype=models.OpType.continuous)]) + characteristics = _extract_characteristics(trxf_scorecard) + + assert self._data_dictionary is not None + return models.Scorecard(dataDictionary=self._data_dictionary, + miningSchema=mining_schema, + output=output, + characteristics=characteristics, + initialScore=str(trxf_scorecard.bias)) + + def load_data_dictionary(self, X: pd.DataFrame): + """ + Extract the data dictionary from a feature dataframe, and store it + """ + self._data_dictionary = extract_data_dictionary(X) + + +def _extract_mining_schema(scorecard_features): + mining_fields = {} + for feature in scorecard_features: + name = feature.variable_names[0] + if name not in mining_fields: + mining_field = models.MiningField(name=name) + mining_fields[name] = mining_field + return models.MiningSchema(miningFields=list(mining_fields.values())) + + +def _extract_characteristics(trxf_scorecard): + characteristics = [] + for partition in trxf_scorecard.partitions: + feature_name = partition.feature.variable_names[0] + attributes = [] + for bin in partition.bins: + assert isinstance(bin, scorecard.IntervalBin), "Scorecard is only supported for continuous bins" + conjunction = bin.to_conjunction() + predicate = trxf_to_pmml_predicate(conjunction) + score = str(bin.sub_score) + attribute = models.Attribute(score=score, predicate=predicate) + attributes.append(attribute) + characteristic = models.Characteristic(name=feature_name, attributes=attributes) + characteristics.append(characteristic) + return models.Characteristics(characteristics) + + diff --git a/aix360/algorithms/rule_induction/trxf/pmml_export/utilities.py b/aix360/algorithms/rule_induction/trxf/pmml_export/utilities.py new file mode 100644 index 0000000..c907573 --- /dev/null +++ b/aix360/algorithms/rule_induction/trxf/pmml_export/utilities.py @@ -0,0 +1,51 @@ +import numpy as np +import pandas as pd +from aix360.algorithms.rule_induction.trxf.pmml_export.models import Operator, SimplePredicate, CompoundPredicate, \ + BooleanOperator + +from aix360.algorithms.rule_induction.trxf.core import Conjunction, Relation + +from aix360.algorithms.rule_induction.trxf.pmml_export import models + + +def extract_data_dictionary(X: pd.DataFrame): + """ + Extract the data dictionary from a feature dataframe + """ + dtypes = X.dtypes + data_fields = [] + for index, value in dtypes.items(): + if np.issubdtype(value, np.integer): + data_type = models.DataType.integer + op_type = models.OpType.ordinal + elif np.issubdtype(value, np.double): + data_type = models.DataType.double + op_type = models.OpType.continuous + elif np.issubdtype(value, np.floating): + data_type = models.DataType.float + op_type = models.OpType.continuous + elif np.issubdtype(value, np.bool_): + data_type = models.DataType.boolean + op_type = models.OpType.categorical + else: + data_type = models.DataType.string + op_type = models.OpType.categorical + data_fields.append(models.DataField(name=str(index), optype=op_type, dataType=data_type)) + + return models.DataDictionary(data_fields) + + +def trxf_to_pmml_predicate(trxf_conjunction: Conjunction): + trxf_to_pmml_op = { + Relation.EQ: Operator.equal, + Relation.NEQ: Operator.notEqual, + Relation.LT: Operator.lessThan, + Relation.LE: Operator.lessOrEqual, + Relation.GT: Operator.greaterThan, + Relation.GE: Operator.greaterOrEqual + } + simple_predicates = [SimplePredicate(operator=trxf_to_pmml_op[trxf_predicate.relation], + value=str(trxf_predicate.value), + field=str(trxf_predicate.feature.variable_names[0])) + for trxf_predicate in trxf_conjunction.predicates] + return CompoundPredicate(simplePredicates=simple_predicates, booleanOperator=BooleanOperator.and_) diff --git a/aix360/algorithms/rule_induction/trxf/scorecard/bins.py b/aix360/algorithms/rule_induction/trxf/scorecard/bins.py index 8675a00..8b47a73 100644 --- a/aix360/algorithms/rule_induction/trxf/scorecard/bins.py +++ b/aix360/algorithms/rule_induction/trxf/scorecard/bins.py @@ -1,7 +1,7 @@ import abc from numbers import Real from typing import Dict, Set, Any, Optional -from aix360.algorithms.rule_induction.trxf.core import Feature +from aix360.algorithms.rule_induction.trxf.core import Feature, Predicate, Relation, Conjunction class Bin(abc.ABC): @@ -121,6 +121,17 @@ def overlaps(self, other: 'LinearIntervalBin') -> bool: 'is an instance of "{}"'.format(str(other.__class__))) return (self.left_end < other.right_end) and (self.right_end > other.left_end) + def to_conjunction(self): + """ + Converts bin to trxf.Conjunction + """ + left = Predicate(feature=self.feature, relation=Relation.GE, value=self.left_end) if \ + self.left_end > float('-inf') else None + right = Predicate(feature=self.feature, relation=Relation.LT, value=self.right_end) if \ + self.right_end < float('inf') else None + predicates = [p for p in [left, right] if p is not None] + return Conjunction(predicates) + def _get_feature_value(self, assignment: Dict[str, Any]) -> Real: """ Evaluates the value of the feature for the specified variable assignment. Raises ValueError if the feature diff --git a/tests/rule_induction/trxf/pmml_export/test_pmml_exporter.py b/tests/rule_induction/trxf/pmml_export/test_pmml_exporter.py index 8aac1b5..1dd0199 100644 --- a/tests/rule_induction/trxf/pmml_export/test_pmml_exporter.py +++ b/tests/rule_induction/trxf/pmml_export/test_pmml_exporter.py @@ -12,7 +12,7 @@ WeightMetric, ConfidenceMetric from aix360.algorithms.rule_induction.trxf.pmml_export import NyokaSerializer from aix360.algorithms.rule_induction.trxf.pmml_export.pmml_exporter import PmmlExporter -from aix360.algorithms.rule_induction.trxf.pmml_export.reader.trxf_reader import TrxfReader +from aix360.algorithms.rule_induction.trxf.pmml_export.reader.trxf_ruleset_reader import TrxfRuleSetReader from tests.rule_induction.trxf.utilities import create_test_ruleset, DATA_FRAME TIMESTAMP = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc) @@ -20,7 +20,7 @@ class TestPmmlExporter(TestCase): def test_export(self): - reader = TrxfReader() + reader = TrxfRuleSetReader() reader.load_data_dictionary(DATA_FRAME) serializer = NyokaSerializer(TIMESTAMP) exporter = PmmlExporter(reader, serializer) @@ -32,7 +32,7 @@ def test_export(self): self.assertEqual(expected, actual) def test_export_with_missing_data_dict_should_raise(self): - reader = TrxfReader() + reader = TrxfRuleSetReader() serializer = NyokaSerializer() exporter = PmmlExporter(reader, serializer) @@ -65,7 +65,7 @@ def test_ripper_iris(self): weight_metric=WeightMetric.CONFIDENCE, default_label='Iris-virginica') classifier.update_rules_with_metrics(x_test, y_test) - reader = TrxfReader() + reader = TrxfRuleSetReader() reader.load_data_dictionary(x_train) serializer = NyokaSerializer(TIMESTAMP) exporter = PmmlExporter(reader, serializer) @@ -126,7 +126,7 @@ def test_ripper_adult(self): weight_metric=WeightMetric.CONFIDENCE, default_label='<=50K') classifier.update_rules_with_metrics(x_test, y_test) - reader = TrxfReader() + reader = TrxfRuleSetReader() reader.load_data_dictionary(x_test) serializer = NyokaSerializer(TIMESTAMP) exporter = PmmlExporter(reader, serializer) @@ -165,7 +165,7 @@ def test_ripper_wifi(self): weight_metric=WeightMetric.CONFIDENCE, default_label='4') classifier.update_rules_with_metrics(x_test, y_test) - reader = TrxfReader() + reader = TrxfRuleSetReader() reader.load_data_dictionary(x_train) serializer = NyokaSerializer(TIMESTAMP) exporter = PmmlExporter(reader, serializer) diff --git a/tests/rule_induction/trxf/pmml_export/test_trxf_reader.py b/tests/rule_induction/trxf/pmml_export/test_trxf_reader.py index 6663eec..6238a2c 100644 --- a/tests/rule_induction/trxf/pmml_export/test_trxf_reader.py +++ b/tests/rule_induction/trxf/pmml_export/test_trxf_reader.py @@ -1,16 +1,18 @@ from unittest import TestCase import aix360.algorithms.rule_induction.trxf.classifier.ruleset_classifier as classifier +import aix360.algorithms.rule_induction.trxf.scorecard as sc from aix360.algorithms.rule_induction.trxf.classifier.ruleset_classifier import RuleSetClassifier from aix360.algorithms.rule_induction.trxf.pmml_export.models import RuleSetModel, SimplePMMLRuleSetModel -from aix360.algorithms.rule_induction.trxf.pmml_export.reader.trxf_reader import TrxfReader +from aix360.algorithms.rule_induction.trxf.pmml_export.reader.trxf_ruleset_reader import TrxfRuleSetReader +from aix360.algorithms.rule_induction.trxf.pmml_export.reader.trxf_scorecard_reader import TrxfScorecardReader from tests.rule_induction.trxf.utilities import create_test_ruleset, DATA_DICTIONARY, TEST_MINING_SCHEMA, \ - TEST_PMML_RULESET, DATA_FRAME + TEST_PMML_RULESET, DATA_FRAME, create_test_scorecard, PARTITIONS, DATA_DICTIONARY_SC class TestTrxfReader(TestCase): - def test_read(self): - reader = TrxfReader(DATA_DICTIONARY) + def test_read_ruleset(self): + reader = TrxfRuleSetReader(DATA_DICTIONARY) test_ruleset = create_test_ruleset() test_classifier = RuleSetClassifier([test_ruleset], classifier.RuleSelectionMethod.FIRST_HIT, default_label=0) ruleset_model = RuleSetModel(miningSchema=TEST_MINING_SCHEMA, ruleSet=TEST_PMML_RULESET) @@ -18,6 +20,13 @@ def test_read(self): self.assertEqual(reader.read(test_classifier), expected) def test_load_data_dictionary(self): - reader = TrxfReader() + reader = TrxfRuleSetReader() reader.load_data_dictionary(DATA_FRAME) self.assertEqual(reader.data_dictionary, DATA_DICTIONARY) + + def test_read_scorecard(self): + expected = create_test_scorecard() + scorecard = sc.Scorecard(PARTITIONS, bias=100) + reader = TrxfScorecardReader(DATA_DICTIONARY_SC) + actual = reader.read(scorecard) + self.assertEqual(expected, actual) diff --git a/tests/rule_induction/trxf/utilities.py b/tests/rule_induction/trxf/utilities.py index 2166d2e..4a04bb8 100644 --- a/tests/rule_induction/trxf/utilities.py +++ b/tests/rule_induction/trxf/utilities.py @@ -1,12 +1,12 @@ import numpy as np import pandas as pd +import aix360.algorithms.rule_induction.trxf.scorecard as sc from aix360.algorithms.rule_induction.trxf.core import Predicate, Feature, Relation, Conjunction, DnfRuleSet from aix360.algorithms.rule_induction.trxf.pmml_export import models from aix360.algorithms.rule_induction.trxf.pmml_export.models import DataDictionary, DataField, OpType, DataType, \ MiningSchema, MiningField, MiningFieldUsageType, RuleSet, SimpleRule, CompoundPredicate, SimplePredicate, Operator, \ - BooleanOperator -from aix360.algorithms.rule_induction.trxf.pmml_export.models.data_dictionary import Value + BooleanOperator, Output, OutputField, Attribute, Characteristic, Characteristics, Scorecard DATA_DICTIONARY = DataDictionary( dataFields=[DataField(name='toto0', optype=OpType.continuous, dataType=DataType.double), @@ -109,3 +109,38 @@ def create_test_ruleset(): c4 = Conjunction([p1, p2, p3, p4]) ruleset = DnfRuleSet([c1, c2, c3, c4], then_part=1) return ruleset + + +PARTITIONS = [ + sc.Partition([ + sc.IntervalBin(Feature('feature'), 5, None, 0.0), + sc.IntervalBin(Feature('feature'), -10, 0.0, 1.0), + sc.IntervalBin(Feature('feature'), 10, 1.0, None) + ]) +] +DATA_DICTIONARY_SC = DataDictionary( + dataFields=[DataField(name='feature', optype=OpType.continuous, dataType=DataType.double)] +) + +MINING_SCHEMA_SC = MiningSchema( + [MiningField(name='feature', usageType=MiningFieldUsageType.active)] +) +OUTPUT = Output([OutputField('RawResult', 'predictedValue', DataType.double, OpType.continuous)]) + + +def create_test_scorecard(): + predicate1 = CompoundPredicate([SimplePredicate(operator=Operator.lessThan, value='0.0', field='feature')], + booleanOperator=BooleanOperator.and_) + attribute1 = Attribute('5', predicate1) + predicate2 = CompoundPredicate( + [SimplePredicate(operator=Operator.greaterOrEqual, value='0.0', field='feature'), + SimplePredicate(operator=Operator.lessThan, value='1.0', field='feature')], + booleanOperator=BooleanOperator.and_) + attribute2 = Attribute('-10', predicate2) + predicate3 = CompoundPredicate( + [SimplePredicate(operator=Operator.greaterOrEqual, value='1.0', field='feature')], + booleanOperator=BooleanOperator.and_) + attribute3 = Attribute('10', predicate3) + characteristic = Characteristic('feature', [attribute1, attribute2, attribute3]) + characteristics = Characteristics([characteristic]) + return Scorecard(DATA_DICTIONARY_SC, MINING_SCHEMA_SC, OUTPUT, characteristics, '100') From 316693902cf2ed875d3c6d5ac3b18e9fe685f19e Mon Sep 17 00:00:00 2001 From: Yusik Kim Date: Fri, 4 Nov 2022 14:37:53 +0100 Subject: [PATCH 2/3] Add support for exporting PMML SimpleSetPredicate Signed-off-by: Yusik Kim --- .../trxf/pmml_export/models/__init__.py | 1 + .../trxf/pmml_export/models/attribute.py | 5 ++++- .../trxf/pmml_export/models/predicate.py | 10 ++++++++++ .../trxf/pmml_export/reader/trxf_ruleset_reader.py | 4 +++- .../pmml_export/reader/trxf_scorecard_reader.py | 14 +++++++++++--- .../pmml_export/serializer/nyoka_serializer.py | 5 +++++ .../rule_induction/trxf/pmml_export/utilities.py | 11 ++++++++--- setup.py | 11 ++++++----- tests/rule_induction/trxf/utilities.py | 2 +- 9 files changed, 49 insertions(+), 14 deletions(-) diff --git a/aix360/algorithms/rule_induction/trxf/pmml_export/models/__init__.py b/aix360/algorithms/rule_induction/trxf/pmml_export/models/__init__.py index 066d104..c2e9b98 100644 --- a/aix360/algorithms/rule_induction/trxf/pmml_export/models/__init__.py +++ b/aix360/algorithms/rule_induction/trxf/pmml_export/models/__init__.py @@ -11,6 +11,7 @@ from .predicate import CompoundPredicate from .predicate import Operator from .predicate import SimplePredicate +from .predicate import SimpleSetPredicate from .predicate import TruePredicate from .rule import SimpleRule from .rule import DEFAULT_WEIGHT diff --git a/aix360/algorithms/rule_induction/trxf/pmml_export/models/attribute.py b/aix360/algorithms/rule_induction/trxf/pmml_export/models/attribute.py index 48c3450..423fad0 100644 --- a/aix360/algorithms/rule_induction/trxf/pmml_export/models/attribute.py +++ b/aix360/algorithms/rule_induction/trxf/pmml_export/models/attribute.py @@ -21,4 +21,7 @@ class ComplexPartialScore: @dataclass(frozen=True) class Attribute: score: typing.Union[str, ComplexPartialScore] - predicate: typing.Union[predicate.SimplePredicate, predicate.CompoundPredicate, predicate.TruePredicate] + predicate: typing.Union[predicate.SimplePredicate, + predicate.CompoundPredicate, + predicate.SimpleSetPredicate, + predicate.TruePredicate] diff --git a/aix360/algorithms/rule_induction/trxf/pmml_export/models/predicate.py b/aix360/algorithms/rule_induction/trxf/pmml_export/models/predicate.py index d9dde9a..69e78cb 100644 --- a/aix360/algorithms/rule_induction/trxf/pmml_export/models/predicate.py +++ b/aix360/algorithms/rule_induction/trxf/pmml_export/models/predicate.py @@ -40,3 +40,13 @@ class TruePredicate: class CompoundPredicate: simplePredicates: typing.List[SimplePredicate] = field() booleanOperator: BooleanOperator = field() + + +MembershipOperator = enum.Enum('MembershipOperator', [('isIn', 0), ('isNotIn', 1)]) + + +@dataclass(frozen=True) +class SimpleSetPredicate: + field_: str = field() + membershipOperator: MembershipOperator = field() + values: typing.Set[str] = field() diff --git a/aix360/algorithms/rule_induction/trxf/pmml_export/reader/trxf_ruleset_reader.py b/aix360/algorithms/rule_induction/trxf/pmml_export/reader/trxf_ruleset_reader.py index 1a8e19d..7bf7e37 100644 --- a/aix360/algorithms/rule_induction/trxf/pmml_export/reader/trxf_ruleset_reader.py +++ b/aix360/algorithms/rule_induction/trxf/pmml_export/reader/trxf_ruleset_reader.py @@ -1,3 +1,5 @@ +from typing import Dict + import pandas as pd from aix360.algorithms.rule_induction.trxf.classifier import ruleset_classifier @@ -42,7 +44,7 @@ def load_data_dictionary(self, X: pd.DataFrame, values: Dict = None): @param X: Input dataframe @param values: A dict mapping column name to a list of possible categorical values. It will be inferred from X if not provided. """ - self._data_dictionary = extract_data_dictionary(X) + self._data_dictionary = extract_data_dictionary(X, values) def _convert_to_simple_rules(trxf_rules): diff --git a/aix360/algorithms/rule_induction/trxf/pmml_export/reader/trxf_scorecard_reader.py b/aix360/algorithms/rule_induction/trxf/pmml_export/reader/trxf_scorecard_reader.py index 23b6866..a03e70b 100644 --- a/aix360/algorithms/rule_induction/trxf/pmml_export/reader/trxf_scorecard_reader.py +++ b/aix360/algorithms/rule_induction/trxf/pmml_export/reader/trxf_scorecard_reader.py @@ -2,6 +2,8 @@ from aix360.algorithms.rule_induction.trxf import scorecard from aix360.algorithms.rule_induction.trxf.pmml_export import models +from aix360.algorithms.rule_induction.trxf.pmml_export.models import SimpleSetPredicate +from aix360.algorithms.rule_induction.trxf.pmml_export.models.predicate import MembershipOperator from aix360.algorithms.rule_induction.trxf.pmml_export.reader import AbstractReader from aix360.algorithms.rule_induction.trxf.pmml_export.utilities import extract_data_dictionary, trxf_to_pmml_predicate @@ -55,9 +57,15 @@ def _extract_characteristics(trxf_scorecard): feature_name = partition.feature.variable_names[0] attributes = [] for bin in partition.bins: - assert isinstance(bin, scorecard.IntervalBin), "Scorecard is only supported for continuous bins" - conjunction = bin.to_conjunction() - predicate = trxf_to_pmml_predicate(conjunction) + if isinstance(bin, scorecard.IntervalBin): + conjunction = bin.to_conjunction() + predicate = trxf_to_pmml_predicate(conjunction) + elif isinstance(bin, scorecard.SetBin): + predicate = SimpleSetPredicate(field_=feature_name, + membershipOperator=MembershipOperator.isIn, + values=bin.values) + else: + raise ValueError('Unsupported Bin type {} for feature {}'.format(type(bin), feature_name)) score = str(bin.sub_score) attribute = models.Attribute(score=score, predicate=predicate) attributes.append(attribute) diff --git a/aix360/algorithms/rule_induction/trxf/pmml_export/serializer/nyoka_serializer.py b/aix360/algorithms/rule_induction/trxf/pmml_export/serializer/nyoka_serializer.py index ae817cc..b7cf97a 100644 --- a/aix360/algorithms/rule_induction/trxf/pmml_export/serializer/nyoka_serializer.py +++ b/aix360/algorithms/rule_induction/trxf/pmml_export/serializer/nyoka_serializer.py @@ -158,5 +158,10 @@ def _nyoka_pmml_attributes(self, attribute: models.Attribute) -> nyoka_pmml.Attr SimplePredicate=[ nyoka_pmml.SimplePredicate(field=sp.field, operator=sp.operator.name, value=sp.value) for sp in attribute.predicate.simplePredicates]), + SimpleSetPredicate=None if (attribute.predicate is None or not isinstance( + attribute.predicate, models.SimpleSetPredicate)) else nyoka_pmml.SimpleSetPredicate( + field=attribute.predicate.field_, + booleanOperator=attribute.predicate.membershipOperator.name, + Array=nyoka_pmml.ArrayType(list(attribute.predicate.values))), True_=None if (attribute.predicate is None or not isinstance( attribute.predicate, models.TruePredicate)) else nyoka_pmml.True_()) diff --git a/aix360/algorithms/rule_induction/trxf/pmml_export/utilities.py b/aix360/algorithms/rule_induction/trxf/pmml_export/utilities.py index c907573..be3f4fc 100644 --- a/aix360/algorithms/rule_induction/trxf/pmml_export/utilities.py +++ b/aix360/algorithms/rule_induction/trxf/pmml_export/utilities.py @@ -1,20 +1,23 @@ +from typing import Dict + import numpy as np import pandas as pd from aix360.algorithms.rule_induction.trxf.pmml_export.models import Operator, SimplePredicate, CompoundPredicate, \ - BooleanOperator + BooleanOperator, Value from aix360.algorithms.rule_induction.trxf.core import Conjunction, Relation from aix360.algorithms.rule_induction.trxf.pmml_export import models -def extract_data_dictionary(X: pd.DataFrame): +def extract_data_dictionary(X: pd.DataFrame, values: Dict): """ Extract the data dictionary from a feature dataframe """ dtypes = X.dtypes data_fields = [] for index, value in dtypes.items(): + vals = None if np.issubdtype(value, np.integer): data_type = models.DataType.integer op_type = models.OpType.ordinal @@ -30,7 +33,9 @@ def extract_data_dictionary(X: pd.DataFrame): else: data_type = models.DataType.string op_type = models.OpType.categorical - data_fields.append(models.DataField(name=str(index), optype=op_type, dataType=data_type)) + vals = values[index] if values is not None and index in values else list(X[index].unique()) + wrapped_vals = list(map(lambda v: Value(v), vals)) if vals is not None else vals + data_fields.append(models.DataField(name=str(index), optype=op_type, dataType=data_type, values=wrapped_vals)) return models.DataDictionary(data_fields) diff --git a/setup.py b/setup.py index 45b11da..88e97ef 100644 --- a/setup.py +++ b/setup.py @@ -14,8 +14,8 @@ author_email='aix360@us.ibm.com', packages=setuptools.find_packages(), license='Apache License 2.0', - long_description=open('README.md', 'r', encoding='utf-8').read(), - long_description_content_type='text/markdown', + long_description=open('README.md', 'r', encoding='utf-8').read(), + long_description_content_type='text/markdown', install_requires=[ 'joblib>=0.11', 'scikit-learn>=0.21.2', @@ -31,16 +31,17 @@ 'pandas', 'scipy>=0.17', 'xport', - 'scikit-image', + 'scikit-image', 'requests', - 'xgboost==1.1.0', + 'xgboost==1.1.0', 'bleach>=2.1.0', 'docutils>=0.13.1', 'Pygments', - 'osqp', + 'osqp', 'lime==0.1.1.37', 'shap==0.34.0', 'nyoka==5.2.0', + 'pypmml', 'xmltodict==0.12.0', 'numba', 'tqdm', diff --git a/tests/rule_induction/trxf/utilities.py b/tests/rule_induction/trxf/utilities.py index 4a04bb8..060a00c 100644 --- a/tests/rule_induction/trxf/utilities.py +++ b/tests/rule_induction/trxf/utilities.py @@ -6,7 +6,7 @@ from aix360.algorithms.rule_induction.trxf.pmml_export import models from aix360.algorithms.rule_induction.trxf.pmml_export.models import DataDictionary, DataField, OpType, DataType, \ MiningSchema, MiningField, MiningFieldUsageType, RuleSet, SimpleRule, CompoundPredicate, SimplePredicate, Operator, \ - BooleanOperator, Output, OutputField, Attribute, Characteristic, Characteristics, Scorecard + BooleanOperator, Output, OutputField, Attribute, Characteristic, Characteristics, Scorecard, Value DATA_DICTIONARY = DataDictionary( dataFields=[DataField(name='toto0', optype=OpType.continuous, dataType=DataType.double), From d9c38052f287fc83da2818eac8f2a7e513e05782 Mon Sep 17 00:00:00 2001 From: Yusik Kim Date: Fri, 4 Nov 2022 19:15:32 +0100 Subject: [PATCH 3/3] Fixed call to extract_data_dictionary Signed-off-by: Yusik Kim --- .../trxf/pmml_export/reader/trxf_scorecard_reader.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/aix360/algorithms/rule_induction/trxf/pmml_export/reader/trxf_scorecard_reader.py b/aix360/algorithms/rule_induction/trxf/pmml_export/reader/trxf_scorecard_reader.py index a03e70b..49610a2 100644 --- a/aix360/algorithms/rule_induction/trxf/pmml_export/reader/trxf_scorecard_reader.py +++ b/aix360/algorithms/rule_induction/trxf/pmml_export/reader/trxf_scorecard_reader.py @@ -34,11 +34,11 @@ def read(self, trxf_scorecard: scorecard.Scorecard) -> models.Scorecard: characteristics=characteristics, initialScore=str(trxf_scorecard.bias)) - def load_data_dictionary(self, X: pd.DataFrame): + def load_data_dictionary(self, X: pd.DataFrame, values=None): """ Extract the data dictionary from a feature dataframe, and store it """ - self._data_dictionary = extract_data_dictionary(X) + self._data_dictionary = extract_data_dictionary(X, values) def _extract_mining_schema(scorecard_features):