fix builds by supporting sparse_output renamed parameter in OneHotEncoder in new scikit-learn version update (#2507)
microsoft · Jan 26, 2024 · 66e33cb · 66e33cb
1 parent f1cf49c
commit 66e33cb
Show file tree

Hide file tree

Showing 5 changed files with 44 additions and 7 deletions.
diff --git a/.github/workflows/CI-python.yml b/.github/workflows/CI-python.yml
@@ -58,6 +58,12 @@ jobs:
           pip install -v -e .
         working-directory: ${{ matrix.packageDirectory }}
 
+      - if: ${{ (matrix.packageDirectory == 'erroranalysis') || (matrix.packageDirectory == 'responsibleai') }}
+        name: Install rai_test_utils locally until next version is released
+        run: |
+          pip install -v -e .
+        working-directory: rai_test_utils
+
       - name: Pip freeze
         run: |
           pip freeze > installed-requirements-dev.txt

diff --git a/...-dashboards/erroranalysis-dashboard/erroranalysis-interpretability-dashboard-census.ipynb b/...-dashboards/erroranalysis-dashboard/erroranalysis-interpretability-dashboard-census.ipynb
@@ -118,11 +118,19 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "from packaging import version\n",
+    "import sklearn\n",
     "from sklearn.pipeline import Pipeline\n",
     "from sklearn.impute import SimpleImputer\n",
     "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
     "from sklearn.compose import ColumnTransformer\n",
     "\n",
+    "# for older scikit-learn versions use sparse, for newer sparse_output:\n",
+    "if version.parse(sklearn.__version__) < version.parse('1.2'):\n",
+    "    ohe_params = {\"sparse\": False}\n",
+    "else:\n",
+    "    ohe_params = {\"sparse_output\": False}\n",
+    "\n",
     "def split_label(dataset):\n",
     "    X = dataset.drop(['income'], axis=1)\n",
     "    y = dataset[['income']]\n",
@@ -141,7 +149,7 @@
     "    ])\n",
     "    cat_pipe = Pipeline([\n",
     "        ('cat_imputer', SimpleImputer(strategy='constant', fill_value='?')),\n",
-    "        ('cat_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))\n",
+    "        ('cat_encoder', OneHotEncoder(handle_unknown='ignore', **ohe_params))\n",
     "    ])\n",
     "    feat_pipe = ColumnTransformer([\n",
     "        ('num_pipe', num_pipe, pipe_cfg['num_cols']),\n",

diff --git a/...leaidashboard/tabular/responsibleaidashboard-housing-classification-model-debugging.ipynb b/...leaidashboard/tabular/responsibleaidashboard-housing-classification-model-debugging.ipynb
@@ -70,12 +70,20 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "from packaging import version\n",
     "from raiutils.dataset import fetch_dataset\n",
+    "import sklearn\n",
     "from sklearn.pipeline import Pipeline\n",
     "from sklearn.impute import SimpleImputer\n",
     "from sklearn.preprocessing import OneHotEncoder\n",
     "from sklearn.compose import ColumnTransformer\n",
     "\n",
+    "# for older scikit-learn versions use sparse, for newer sparse_output:\n",
+    "if version.parse(sklearn.__version__) < version.parse('1.2'):\n",
+    "    ohe_params = {\"sparse\": False}\n",
+    "else:\n",
+    "    ohe_params = {\"sparse_output\": False}\n",
+    "\n",
     "def split_label(dataset, target_feature):\n",
     "    X = dataset.drop([target_feature], axis=1)\n",
     "    y = dataset[[target_feature]]\n",
@@ -93,7 +101,7 @@
     "    ])\n",
     "    cat_pipe = Pipeline([\n",
     "        ('cat_imputer', SimpleImputer(strategy='constant', fill_value='?')),\n",
-    "        ('cat_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))\n",
+    "        ('cat_encoder', OneHotEncoder(handle_unknown='ignore', **ohe_params))\n",
     "    ])\n",
     "    feat_pipe = ColumnTransformer([\n",
     "        ('num_pipe', num_pipe, pipe_cfg['num_cols']),\n",
@@ -179,7 +187,7 @@
    "source": [
     "To use Responsible AI Dashboard, initialize a RAIInsights object upon which different components can be loaded.\n",
     "\n",
-    "RAIInsights accepts the model, the full dataset, the test dataset, the target feature string and the task type string as its arguments.",
+    "RAIInsights accepts the model, the full dataset, the test dataset, the target feature string and the task type string as its arguments.\n",
     "\n",
     "You may also create the `FeatureMetadata` container, identify any feature of your choice as the `identity_feature`, specify a list of strings of categorical feature names via the `categorical_features` parameter, and specify dropped features via the `dropped_features` parameter. The `FeatureMetadata` may also be passed into the `RAIInsights`."
    ]

diff --git a/...books/responsibleaidashboard/tabular/responsibleaidashboard-housing-decision-making.ipynb b/...books/responsibleaidashboard/tabular/responsibleaidashboard-housing-decision-making.ipynb
@@ -59,12 +59,20 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "from packaging import version\n",
     "from raiutils.dataset import fetch_dataset\n",
+    "import sklearn\n",
     "from sklearn.pipeline import Pipeline\n",
     "from sklearn.impute import SimpleImputer\n",
     "from sklearn.preprocessing import StandardScaler, OneHotEncoder\n",
     "from sklearn.compose import ColumnTransformer\n",
     "\n",
+    "# for older scikit-learn versions use sparse, for newer sparse_output:\n",
+    "if version.parse(sklearn.__version__) < version.parse('1.2'):\n",
+    "    ohe_params = {\"sparse\": False}\n",
+    "else:\n",
+    "    ohe_params = {\"sparse_output\": False}\n",
+    "\n",
     "def split_label(dataset, target_feature):\n",
     "    X = dataset.drop([target_feature], axis=1)\n",
     "    y = dataset[[target_feature]]\n",
@@ -83,7 +91,7 @@
     "    ])\n",
     "    cat_pipe = Pipeline([\n",
     "        ('cat_imputer', SimpleImputer(strategy='constant', fill_value='?')),\n",
-    "        ('cat_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))\n",
+    "        ('cat_encoder', OneHotEncoder(handle_unknown='ignore', **ohe_params))\n",
     "    ])\n",
     "    feat_pipe = ColumnTransformer([\n",
     "        ('num_pipe', num_pipe, pipe_cfg['num_cols']),\n",
@@ -148,7 +156,7 @@
    "source": [
     "To use Responsible AI Dashboard, initialize a RAIInsights object upon which different components can be loaded.\n",
     "\n",
-    "RAIInsights accepts the model, the full dataset, the test dataset, the target feature string and the task type string as its arguments.",
+    "RAIInsights accepts the model, the full dataset, the test dataset, the target feature string and the task type string as its arguments.\n",
     "\n",
     "You may also create the `FeatureMetadata` container, identify any feature of your choice as the `identity_feature`, specify a list of strings of categorical feature names via the `categorical_features` parameter, and specify dropped features via the `dropped_features` parameter. The `FeatureMetadata` may also be passed into the `RAIInsights`."
    ]

diff --git a/rai_test_utils/rai_test_utils/models/sklearn/sklearn_model_utils.py b/rai_test_utils/rai_test_utils/models/sklearn/sklearn_model_utils.py
@@ -3,6 +3,8 @@
 
 import numpy as np
 import pandas as pd
+import sklearn
+from packaging import version
 from sklearn import svm
 from sklearn.compose import ColumnTransformer
 from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
@@ -127,6 +129,11 @@ def conv(X):
             (conv(np.prod(x, axis=1)).reshape(-1, 1),
                 conv(np.prod(x, axis=1)**2).reshape(-1, 1))
         ))
+    # for older scikit-learn versions use sparse, for newer sparse_output:
+    if version.parse(sklearn.__version__) < version.parse('1.2'):
+        ohe_params = {"sparse": False}
+    else:
+        ohe_params = {"sparse_output": False}
     transformations = ColumnTransformer([
         ("age_fare_1", Pipeline(steps=[
             ('imputer', SimpleImputer(strategy='median')),
@@ -137,8 +144,8 @@ def conv(X):
         ("embarked", Pipeline(steps=[
             ("imputer",
                 SimpleImputer(strategy='constant', fill_value='missing')),
-            ("encoder", OneHotEncoder(sparse=False))]), ["embarked"]),
-        ("sex_pclass", OneHotEncoder(sparse=False), ["sex", "pclass"])
+            ("encoder", OneHotEncoder(**ohe_params))]), ["embarked"]),
+        ("sex_pclass", OneHotEncoder(**ohe_params), ["sex", "pclass"])
     ])
     clf = Pipeline(steps=[('preprocessor', transformations),
                           ('classifier',