From 18b249f9f4fcf311d71d8ac80ce1e8d5671d865b Mon Sep 17 00:00:00 2001
From: Gracechung-sw <hjngy0511@gmail.com>
Date: Wed, 26 Jun 2024 17:10:20 +0900
Subject: [PATCH] add code about clarify fairness bias and explainability of
 sagemaker

---
 .../clip07_fairness_bias_clarify.ipynb        | 1067 +++++++++++++++++
 1 file changed, 1067 insertions(+)
 create mode 100644 aws-sagemaker/clarify_fairness_explainability/clip07_fairness_bias_clarify.ipynb

diff --git a/aws-sagemaker/clarify_fairness_explainability/clip07_fairness_bias_clarify.ipynb b/aws-sagemaker/clarify_fairness_explainability/clip07_fairness_bias_clarify.ipynb
new file mode 100644
index 0000000..c240069
--- /dev/null
+++ b/aws-sagemaker/clarify_fairness_explainability/clip07_fairness_bias_clarify.ipynb
@@ -0,0 +1,1067 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5a5cce68-132e-459a-8b5c-5e16d2d4607f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "!{sys.executable} -m pip uninstall -y sagemaker\n",
+    "!{sys.executable} -m pip install --upgrade pip\n",
+    "!{sys.executable} -m pip install --upgrade boto3 --no-cache-dir\n",
+    "!{sys.executable} -m pip install --upgrade sagemaker==2.123.0 --no-cache-dir"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "713eba58-5769-44f4-bc0e-dda7ebdce879",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sagemaker import Session\n",
+    "from sagemaker import get_execution_role\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import os\n",
+    "import boto3\n",
+    "from datetime import datetime\n",
+    "\n",
+    "\n",
+    "session = Session()\n",
+    "default_bucket = session.default_bucket()\n",
+    "default_prefix = \"sagemaker/fastcampus-sagemaker-clarify-bias-practice\"\n",
+    "region = session.boto_region_name\n",
+    "\n",
+    "role = get_execution_role()\n",
+    "s3_client = boto3.client(\"s3\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "1463073d-f37d-4f07-9d44-238184ca998c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<sagemaker.session.Session object at 0x7f04bb91d3c0>\n",
+      "us-east-2\n",
+      "arn:aws:iam::767397847434:role/service-role/AmazonSageMaker-ExecutionRole-20240111T201222\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Echo the session object, its region, and the execution role, one per line,\n",
+    "# as a quick check of what this notebook is bound to.\n",
+    "print(session, region, role, sep=\"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "fb5ff674-2e9f-489e-af8c-1a705bacc8fa",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "adult.data saved!\n",
+      "adult.test saved!\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Column names applied to the UCI Adult CSV files when they are parsed.\n",
+    "adult_columns = [\n",
+    "    \"Age\",\n",
+    "    \"Workclass\",\n",
+    "    \"fnlwgt\",\n",
+    "    \"Education\",\n",
+    "    \"Education-Num\",\n",
+    "    \"Marital Status\",\n",
+    "    \"Occupation\",\n",
+    "    \"Relationship\",\n",
+    "    \"Ethnic group\",\n",
+    "    \"Sex\",\n",
+    "    \"Capital Gain\",\n",
+    "    \"Capital Loss\",\n",
+    "    \"Hours per week\",\n",
+    "    \"Country\",\n",
+    "    \"Target\",\n",
+    "]\n",
+    "\n",
+    "# Fetch each file from the regional SageMaker example bucket unless a local\n",
+    "# copy already exists, so re-running the cell does not download it again.\n",
+    "example_bucket = f\"sagemaker-example-files-prod-{session.boto_region_name}\"\n",
+    "\n",
+    "for local_file in (\n",
+    "    \"adult.data\",\n",
+    "    \"adult.test\",\n",
+    "):\n",
+    "    if os.path.isfile(local_file):\n",
+    "        print(f\"{local_file} already on disk.\")\n",
+    "        continue\n",
+    "    s3_client.download_file(\n",
+    "        example_bucket,\n",
+    "        f\"datasets/tabular/uci_adult/{local_file}\",\n",
+    "        local_file,\n",
+    "    )\n",
+    "    print(f\"{local_file} saved!\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "d6e8d2da-2fdb-4548-a44e-7e653f1be0e8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "from sagemaker.session import Session\n",
+    "from sagemaker import get_execution_role\n",
+    "from sagemaker.experiments.run import Run\n",
+    "from sagemaker.utils import unique_name_from_base\n",
+    "\n",
+    "# Execution role and a Session handle that is later passed to the experiment Run.\n",
+    "role = get_execution_role()\n",
+    "sagemaker_session = Session()\n",
+    "# The day-month-year-hour-minute-second suffix gives each execution its own experiment name.\n",
+    "experiment_name = f\"fc-bias-fairness-clarify-practice-{datetime.now():%d-%m-%Y-%H-%M-%S}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "73a85060-e223-4ba3-9cb8-8cc058950b82",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Age</th>\n",
+       "      <th>Workclass</th>\n",
+       "      <th>fnlwgt</th>\n",
+       "      <th>Education</th>\n",
+       "      <th>Education-Num</th>\n",
+       "      <th>Marital Status</th>\n",
+       "      <th>Occupation</th>\n",
+       "      <th>Relationship</th>\n",
+       "      <th>Ethnic group</th>\n",
+       "      <th>Sex</th>\n",
+       "      <th>Capital Gain</th>\n",
+       "      <th>Capital Loss</th>\n",
+       "      <th>Hours per week</th>\n",
+       "      <th>Country</th>\n",
+       "      <th>Target</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>39</td>\n",
+       "      <td>State-gov</td>\n",
+       "      <td>77516</td>\n",
+       "      <td>Bachelors</td>\n",
+       "      <td>13</td>\n",
+       "      <td>Never-married</td>\n",
+       "      <td>Adm-clerical</td>\n",
+       "      <td>Not-in-family</td>\n",
+       "      <td>White</td>\n",
+       "      <td>Male</td>\n",
+       "      <td>2174</td>\n",
+       "      <td>0</td>\n",
+       "      <td>40</td>\n",
+       "      <td>United-States</td>\n",
+       "      <td>&lt;=50K</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>50</td>\n",
+       "      <td>Self-emp-not-inc</td>\n",
+       "      <td>83311</td>\n",
+       "      <td>Bachelors</td>\n",
+       "      <td>13</td>\n",
+       "      <td>Married-civ-spouse</td>\n",
+       "      <td>Exec-managerial</td>\n",
+       "      <td>Husband</td>\n",
+       "      <td>White</td>\n",
+       "      <td>Male</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>13</td>\n",
+       "      <td>United-States</td>\n",
+       "      <td>&lt;=50K</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>38</td>\n",
+       "      <td>Private</td>\n",
+       "      <td>215646</td>\n",
+       "      <td>HS-grad</td>\n",
+       "      <td>9</td>\n",
+       "      <td>Divorced</td>\n",
+       "      <td>Handlers-cleaners</td>\n",
+       "      <td>Not-in-family</td>\n",
+       "      <td>White</td>\n",
+       "      <td>Male</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>40</td>\n",
+       "      <td>United-States</td>\n",
+       "      <td>&lt;=50K</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>53</td>\n",
+       "      <td>Private</td>\n",
+       "      <td>234721</td>\n",
+       "      <td>11th</td>\n",
+       "      <td>7</td>\n",
+       "      <td>Married-civ-spouse</td>\n",
+       "      <td>Handlers-cleaners</td>\n",
+       "      <td>Husband</td>\n",
+       "      <td>Black</td>\n",
+       "      <td>Male</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>40</td>\n",
+       "      <td>United-States</td>\n",
+       "      <td>&lt;=50K</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>28</td>\n",
+       "      <td>Private</td>\n",
+       "      <td>338409</td>\n",
+       "      <td>Bachelors</td>\n",
+       "      <td>13</td>\n",
+       "      <td>Married-civ-spouse</td>\n",
+       "      <td>Prof-specialty</td>\n",
+       "      <td>Wife</td>\n",
+       "      <td>Black</td>\n",
+       "      <td>Female</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>40</td>\n",
+       "      <td>Cuba</td>\n",
+       "      <td>&lt;=50K</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   Age         Workclass  fnlwgt  Education  Education-Num  \\\n",
+       "0   39         State-gov   77516  Bachelors             13   \n",
+       "1   50  Self-emp-not-inc   83311  Bachelors             13   \n",
+       "2   38           Private  215646    HS-grad              9   \n",
+       "3   53           Private  234721       11th              7   \n",
+       "4   28           Private  338409  Bachelors             13   \n",
+       "\n",
+       "       Marital Status         Occupation   Relationship Ethnic group     Sex  \\\n",
+       "0       Never-married       Adm-clerical  Not-in-family        White    Male   \n",
+       "1  Married-civ-spouse    Exec-managerial        Husband        White    Male   \n",
+       "2            Divorced  Handlers-cleaners  Not-in-family        White    Male   \n",
+       "3  Married-civ-spouse  Handlers-cleaners        Husband        Black    Male   \n",
+       "4  Married-civ-spouse     Prof-specialty           Wife        Black  Female   \n",
+       "\n",
+       "   Capital Gain  Capital Loss  Hours per week        Country Target  \n",
+       "0          2174             0              40  United-States  <=50K  \n",
+       "1             0             0              13  United-States  <=50K  \n",
+       "2             0             0              40  United-States  <=50K  \n",
+       "3             0             0              40  United-States  <=50K  \n",
+       "4             0             0              40           Cuba  <=50K  "
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Both files share the same parsing options: a regex separator that absorbs whitespace around commas, and ? read as missing.\n",
+    "csv_options = dict(names=adult_columns, sep=r\"\\s*,\\s*\", engine=\"python\", na_values=\"?\")\n",
+    "\n",
+    "training_data = pd.read_csv(\"adult.data\", **csv_options).dropna()\n",
+    "# skiprows=1 drops the first line of adult.test before parsing.\n",
+    "testing_data = pd.read_csv(\"adult.test\", skiprows=1, **csv_options).dropna()\n",
+    "\n",
+    "# Preview the first training rows (rows with missing values are already dropped).\n",
+    "training_data.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "cc962d4a-8d59-4d17-9768-54398fcdc84d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn import preprocessing\n",
+    "\n",
+    "\n",
+    "def number_encode_features(df):\n",
+    "    \"\"\"Label-encode each object-dtype column of a copy of df; return (copy, encoders by column).\"\"\"\n",
+    "    result, encoders = df.copy(), {}\n",
+    "    for column in result.columns:\n",
+    "        if result.dtypes[column] != object:\n",
+    "            continue  # only object-dtype columns are encoded\n",
+    "        encoder = encoders[column] = preprocessing.LabelEncoder()\n",
+    "        result[column] = encoder.fit_transform(result[column].fillna(\"None\"))\n",
+    "    return result, encoders\n",
+    "\n",
+    "\n",
+    "training_data = pd.concat(\n",
+    "    [training_data[\"Target\"], training_data.drop([\"Target\"], axis=1)], axis=1)\n",
+    "training_data, _ = number_encode_features(training_data)\n",
+    "training_data.to_csv(\"train_data.csv\", index=False, header=False)\n",
+    "\n",
+    "testing_data, _ = number_encode_features(testing_data)\n",
+    "test_features = testing_data.drop([\"Target\"], axis=1)\n",
+    "test_target = testing_data[\"Target\"]\n",
+    "test_features.to_csv(\"test_features.csv\", index=False, header=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "bd13da22-a648-442e-bd5b-3d79bdf1b00a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sagemaker.s3 import S3Uploader\n",
+    "from sagemaker.inputs import TrainingInput\n",
+    "\n",
+    "# Both CSVs go under the same S3 prefix; the training file is also wrapped as a text/csv input.\n",
+    "s3_destination = f\"s3://{default_bucket}/{default_prefix}\"\n",
+    "\n",
+    "train_uri = S3Uploader.upload(\"train_data.csv\", s3_destination)\n",
+    "train_input = TrainingInput(train_uri, content_type=\"text/csv\")\n",
+    "test_uri = S3Uploader.upload(\"test_features.csv\", s3_destination)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "a9a1243b-9038-45c6-9416-126b832e241a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.2-1'"
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Estimator 기반의 XGBoost 모델 생성\n",
+    "\n",
+    "from sagemaker.image_uris import retrieve\n",
+    "from sagemaker.estimator import Estimator\n",
+    "\n",
+    "# Creates an XGBoost estimator based on the provided region.\n",
+    "# region (str): The AWS region to retrieve the XGBoost container for.\n",
+    "container = retrieve(\"xgboost\", region, version=\"1.2-1\")\n",
+    "container  # '257758044811.dkr.ecr.us-east-2.amazonaws.com/sagemaker-xgboost:1.2-1' 이렇게 ECR 에서 xgboost SageMaker 이미지를 가져옴"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "1e87f62b-b3da-44ad-8b52-84c69f81b40a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.\n",
+      "INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2024-01-17-10-09-29-060\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "2024-01-17 10:09:29 Starting - Starting the training job..\n",
+      "2024-01-17 10:09:44 Starting - Preparing the instances for training...........\n",
+      "2024-01-17 10:10:45 Downloading - Downloading input data.....\n",
+      "2024-01-17 10:11:15 Downloading - Downloading the training image.....\n",
+      "2024-01-17 10:11:40 Training - Training image download completed. Training in progress.....\n",
+      "2024-01-17 10:12:06 Uploading - Uploading generated training model..\n",
+      "2024-01-17 10:12:22 Completed - Training job completed\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Training image: built-in XGBoost 1.2-1 for the session's region.\n",
+    "container = retrieve(\"xgboost\", region, version=\"1.2-1\")\n",
+    "\n",
+    "# Single ml.m5.xlarge training instance with the profiler switched off.\n",
+    "xgb = Estimator(\n",
+    "    image_uri=container,\n",
+    "    role=role,\n",
+    "    instance_count=1,\n",
+    "    instance_type=\"ml.m5.xlarge\",\n",
+    "    disable_profiler=True,\n",
+    "    sagemaker_session=session,\n",
+    ")\n",
+    "\n",
+    "# Hyperparameters for the binary:logistic objective.\n",
+    "hyperparameters = dict(\n",
+    "    max_depth=5,\n",
+    "    eta=0.2,\n",
+    "    gamma=4,\n",
+    "    min_child_weight=6,\n",
+    "    subsample=0.8,\n",
+    "    objective=\"binary:logistic\",\n",
+    "    num_round=800,\n",
+    ")\n",
+    "xgb.set_hyperparameters(**hyperparameters)\n",
+    "\n",
+    "# Log the training job under an experiment run that holds only the model-training step.\n",
+    "with Run(experiment_name=experiment_name, run_name=\"Lecture-model-train-only\",\n",
+    "         sagemaker_session=sagemaker_session) as run:\n",
+    "    xgb.fit({\"train\": train_input}, logs=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "d17e0f65-21e4-4510-87ee-9987aa812cde",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:sagemaker:Creating model with name: Lecture-DEMO-clarify-bias-model-17-01-2024-10-12-26\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "'Lecture-DEMO-clarify-bias-model-17-01-2024-10-12-26'"
+      ]
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Create a model from the trained estimator and register it in SageMaker\n",
+    "# under a name suffixed with the current day-month-year-hour-minute-second.\n",
+    "model_name = f\"Lecture-DEMO-clarify-bias-model-{datetime.now():%d-%m-%Y-%H-%M-%S}\"\n",
+    "model = xgb.create_model(name=model_name)\n",
+    "container_def = model.prepare_container_def()\n",
+    "# Last expression of the cell, so the value returned by create_model is displayed.\n",
+    "session.create_model(model_name, role, container_def)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "0ebbaa3b-5002-44be-969a-86177809900a",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: 1.0.\n",
+      "INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Clarify 정의해서 Fairness와 Explainability 분석\n",
+    "from sagemaker import clarify\n",
+    "\n",
+    "clarify_processor = clarify.SageMakerClarifyProcessor(\n",
+    "    role=role,\n",
+    "    instance_count=1,\n",
+    "    instance_type=\"ml.m5.xlarge\",\n",
+    "    sagemaker_session=session\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "6e7b68e7-21b1-4990-9bce-0686da69eb1c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Bias reports are written beneath the project prefix in the default bucket.\n",
+    "bias_report_output_path = f\"s3://{default_bucket}/{default_prefix}/clarify-bias\"\n",
+    "\n",
+    "# DataConfig: declares which dataset is used to assess fairness.\n",
+    "# The training CSV was written without a header row, so the column names and\n",
+    "# the label column are supplied explicitly.\n",
+    "bias_data_config = clarify.DataConfig(\n",
+    "    s3_data_input_path=train_uri, s3_output_path=bias_report_output_path,\n",
+    "    label=\"Target\", headers=training_data.columns.to_list(),\n",
+    "    dataset_type=\"text/csv\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "746a05f1-6bf6-41e5-9af3-80fc3267234f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# ModelConfig: 어떤 모델의 Fairness를 판단할지 정의\n",
+    "model_config = clarify.ModelConfig(\n",
+    "    model_name=model_name,\n",
+    "    instance_type=\"ml.m5.xlarge\",\n",
+    "    instance_count=1,\n",
+    "    accept_type=\"text/csv\",\n",
+    "    content_type=\"text/csv\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "1b97f08b-617f-4be3-a84d-94729d6d935c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Prediction을 하는데, 어느 값을 기준으로 probability를 0 or 1로 볼지 정의\n",
+    "predictions_config = clarify.ModelPredictedLabelConfig(\n",
+    "    probability_threshold=0.75)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "45a152c7-b83d-43d9-8b2c-92baebacd885",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# BiasConfig: Bias는 pre-training bias(학습 전 데이터만으로 bias를 판별하는 것), post-training bias(training후 모델의 추론 결과를 통해 bias를 측정하는 것) 를 판별할 수 있는데,\n",
+    "\n",
+    "\n",
+    "bias_config = clarify.BiasConfig(\n",
+    "    label_values_or_threshold=[1],\n",
+    "    facet_name=\"Sex\",\n",
+    "    facet_values_or_threshold=[0],\n",
+    "    group_name=\"Age\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "003e6676-c8af-4ef1-b2c7-57fbce292f52",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:sagemaker.clarify:Analysis Config: {'dataset_type': 'text/csv', 'headers': ['Target', 'Age', 'Workclass', 'fnlwgt', 'Education', 'Education-Num', 'Marital Status', 'Occupation', 'Relationship', 'Ethnic group', 'Sex', 'Capital Gain', 'Capital Loss', 'Hours per week', 'Country'], 'label': 'Target', 'label_values_or_threshold': [1], 'facet': [{'name_or_index': 'Sex', 'value_or_threshold': [0]}], 'group_variable': 'Age', 'methods': {'report': {'name': 'report', 'title': 'Analysis Report'}, 'pre_training_bias': {'methods': 'all'}, 'post_training_bias': {'methods': 'all'}}, 'predictor': {'model_name': 'Lecture-DEMO-clarify-bias-model-17-01-2024-10-12-26', 'instance_type': 'ml.m5.xlarge', 'initial_instance_count': 1, 'accept_type': 'text/csv', 'content_type': 'text/csv'}, 'probability_threshold': 0.75}\n",
+      "INFO:sagemaker:Creating processing-job with name Clarify-Bias-2024-01-17-10-18-43-197\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Job Name:  Clarify-Bias-2024-01-17-10-18-43-197\n",
+      "Inputs:  [{'InputName': 'dataset', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-767397847434/sagemaker/fastcampus-sagemaker-clarify-bias-practice/train_data.csv', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'analysis_config', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-767397847434/sagemaker/fastcampus-sagemaker-clarify-bias-practice/clarify-bias/analysis_config.json', 'LocalPath': '/opt/ml/processing/input/config', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]\n",
+      "Outputs:  [{'OutputName': 'analysis_result', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-2-767397847434/sagemaker/fastcampus-sagemaker-clarify-bias-practice/clarify-bias', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]\n",
+      "..............................\u001b[34m2024-01-17 10:23:34,743 logging.conf not found when configuring logging, using default logging configuration.\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:23:34,743 Starting SageMaker Clarify Processing job\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:23:34,744 Analysis config path: /opt/ml/processing/input/config/analysis_config.json\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:23:34,744 Analysis result path: /opt/ml/processing/output\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:23:34,745 This host is algo-1.\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:23:34,745 This host is the leader.\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:23:34,745 Number of hosts in the cluster is 1.\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:23:35,002 Running Python / Pandas based analyzer.\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:23:35,002 Dataset type: text/csv uri: /opt/ml/processing/input/data\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:23:35,012 Loading dataset...\u001b[0m\n",
+      "\u001b[34m/usr/local/lib/python3.9/site-packages/analyzer/data_loading/csv_data_loader.py:336: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  df = df.append(df_tmp, ignore_index=True)\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:23:35,047 Loaded dataset. Dataset info:\u001b[0m\n",
+      "\u001b[34m<class 'pandas.core.frame.DataFrame'>\u001b[0m\n",
+      "\u001b[34mRangeIndex: 30162 entries, 0 to 30161\u001b[0m\n",
+      "\u001b[34mData columns (total 14 columns):\n",
+      " #   Column          Non-Null Count  Dtype\u001b[0m\n",
+      "\u001b[34m---  ------          --------------  -----\n",
+      " 0   Age             30162 non-null  int64\n",
+      " 1   Workclass       30162 non-null  int64\n",
+      " 2   fnlwgt          30162 non-null  int64\n",
+      " 3   Education       30162 non-null  int64\n",
+      " 4   Education-Num   30162 non-null  int64\n",
+      " 5   Marital Status  30162 non-null  int64\n",
+      " 6   Occupation      30162 non-null  int64\n",
+      " 7   Relationship    30162 non-null  int64\n",
+      " 8   Ethnic group    30162 non-null  int64\n",
+      " 9   Sex             30162 non-null  int64\n",
+      " 10  Capital Gain    30162 non-null  int64\n",
+      " 11  Capital Loss    30162 non-null  int64\n",
+      " 12  Hours per week  30162 non-null  int64\n",
+      " 13  Country         30162 non-null  int64\u001b[0m\n",
+      "\u001b[34mdtypes: int64(14)\u001b[0m\n",
+      "\u001b[34mmemory usage: 3.2 MB\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:23:35,137 Spinning up shadow endpoint\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:23:35,137 Creating endpoint-config with name sm-clarify-config-1705487015-d3ea\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:23:35,501 Creating endpoint: 'sm-clarify-Lecture-DEMO-clarify-bias-model-17-0-1705487015-4ff9'\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:23:35,559 No endpoints ruleset found for service sagemaker-internal, falling back to legacy endpoint routing.\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:23:35,988 Using endpoint name: sm-clarify-Lecture-DEMO-clarify-bias-model-17-0-1705487015-4ff9\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:23:35,988 Waiting for endpoint ...\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:23:35,988 Checking endpoint status:\u001b[0m\n",
+      "\u001b[34mLegend:\u001b[0m\n",
+      "\u001b[34m(OutOfService: x, Creating: -, Updating: -, InService: !, RollingBack: <, Deleting: o, Failed: *)\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:36,488 Endpoint is in service after 180 seconds\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:36,488 Endpoint ready.\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:36,488 ======================================\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:36,488 Calculating post-training bias metrics\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:36,488 ======================================\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:36,488 Getting predictions from the endpoint\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:38,991 We assume a prediction above 0.750 indicates 1 and below or equal indicates 0.\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:38,991 Column Target with data uniqueness fraction 6.630860022544923e-05 is classifed as a CATEGORICAL column\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:38,994 Column Sex with data uniqueness fraction 6.630860022544923e-05 is classifed as a CATEGORICAL column\u001b[0m\n",
+      "\u001b[34m/usr/local/lib/python3.9/site-packages/smclarify/bias/report.py:591: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.\n",
+      "  df = df.drop(facet_column.name, 1)\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:38,997 Column Target with data uniqueness fraction 6.630860022544923e-05 is classifed as a CATEGORICAL column\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:38,999 Column None with data uniqueness fraction 6.630860022544923e-05 is classifed as a CATEGORICAL column\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:40,070 Calculated global analysis with predictor\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:40,070 Stop using endpoint: sm-clarify-Lecture-DEMO-clarify-bias-model-17-0-1705487015-4ff9\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:40,070 Deleting endpoint configuration with name: sm-clarify-config-1705487015-d3ea\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:40,208 Deleting endpoint with name: sm-clarify-Lecture-DEMO-clarify-bias-model-17-0-1705487015-4ff9\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:40,315 Model endpoint delivered 0.55831 requests per second and a total of 2 requests over 4 seconds\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:40,315 =====================================\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:40,315 Calculating pre-training bias metrics\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:40,315 =====================================\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:40,316 Column Target with data uniqueness fraction 6.630860022544923e-05 is classifed as a CATEGORICAL column\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:40,318 Column Sex with data uniqueness fraction 6.630860022544923e-05 is classifed as a CATEGORICAL column\u001b[0m\n",
+      "\u001b[34m/usr/local/lib/python3.9/site-packages/smclarify/bias/report.py:591: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.\n",
+      "  df = df.drop(facet_column.name, 1)\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:40,321 Column Target with data uniqueness fraction 6.630860022544923e-05 is classifed as a CATEGORICAL column\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:40,569 ======================================\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:40,569 Calculating bias statistics for report\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:40,569 ======================================\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:40,569 Column Target with data uniqueness fraction 6.630860022544923e-05 is classifed as a CATEGORICAL column\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:40,571 Column Sex with data uniqueness fraction 6.630860022544923e-05 is classifed as a CATEGORICAL column\u001b[0m\n",
+      "\u001b[34m/usr/local/lib/python3.9/site-packages/smclarify/bias/report.py:591: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.\n",
+      "  df = df.drop(facet_column.name, 1)\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:40,574 Column Target with data uniqueness fraction 6.630860022544923e-05 is classifed as a CATEGORICAL column\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:40,576 Column None with data uniqueness fraction 6.630860022544923e-05 is classifed as a CATEGORICAL column\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:40,584 Column Target with data uniqueness fraction 6.630860022544923e-05 is classifed as a CATEGORICAL column\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:40,586 Column None with data uniqueness fraction 6.630860022544923e-05 is classifed as a CATEGORICAL column\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:40,601 Converting Pandas DataFrame to SparkDataFrame for computing report metadata\u001b[0m\n",
+      "\u001b[34m--!10:26:42.247 [main] WARN  o.a.hadoop.util.NativeCodeLoader - Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\u001b[0m\n",
+      "\u001b[34m#015[Stage 0:>                                                          (0 + 4) / 4]#015#015                                                                                #015#015[Stage 3:>                                                          (0 + 4) / 4]#015#015                                                                                #0152024-01-17 10:26:54,103 Calculated global analysis without predictor\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:54,103 Stop using endpoint: None\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:56,379 ['jupyter', 'nbconvert', '--to', 'html', '--output', '/opt/ml/processing/output/report.html', '/opt/ml/processing/output/report.ipynb', '--template', 'sagemaker-xai']\u001b[0m\n",
+      "\u001b[34m[NbConvertApp] Converting notebook /opt/ml/processing/output/report.ipynb to html\u001b[0m\n",
+      "\u001b[34m[NbConvertApp] Writing 823004 bytes to /opt/ml/processing/output/report.html\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:57,456 ['wkhtmltopdf', '-q', '--enable-local-file-access', '/opt/ml/processing/output/report.html', '/opt/ml/processing/output/report.pdf']\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:58,555 Collected analyses: \u001b[0m\n",
+      "\u001b[34m{\n",
+      "    \"version\": \"1.0\",\n",
+      "    \"post_training_bias_metrics\": {\n",
+      "        \"label\": \"Target\",\n",
+      "        \"facets\": {\n",
+      "            \"Sex\": [\n",
+      "                {\n",
+      "                    \"value_or_threshold\": \"0\",\n",
+      "                    \"metrics\": [\n",
+      "                        {\n",
+      "                            \"name\": \"AD\",\n",
+      "                            \"description\": \"Accuracy Difference (AD)\",\n",
+      "                            \"value\": -0.10472421457047243\n",
+      "                        },\n",
+      "                        {\n",
+      "                            \"name\": \"CDDPL\",\n",
+      "                            \"description\": \"Conditional Demographic Disparity in Predicted Labels (CDDPL)\",\n",
+      "                            \"value\": 0.17524085448088114\n",
+      "                        },\n",
+      "                        {\n",
+      "                            \"name\": \"DAR\",\n",
+      "                            \"description\": \"Difference in Acceptance Rates (DAR)\",\n",
+      "                            \"value\": -0.004519732836182344\n",
+      "                        },\n",
+      "                        {\n",
+      "                            \"name\": \"DCA\",\n",
+      "                            \"description\": \"Difference in Conditional Acceptance (DCA)\",\n",
+      "                            \"value\": -0.15231911077048244\n",
+      "                        },\n",
+      "                        {\n",
+      "                            \"name\": \"DCR\",\n",
+      "                            \"description\": \"Difference in Conditional Rejection (DCR)\",\n",
+      "                            \"value\": 0.12028471242426919\n",
+      "                        },\n",
+      "                        {\n",
+      "                            \"name\": \"DI\",\n",
+      "                            \"description\": \"Disparate Impact (DI)\",\n",
+      "                            \"value\": 0.33622152955999146\n",
+      "                        },\n",
+      "                        {\n",
+      "                            \"name\": \"DPPL\",\n",
+      "                            \"description\": \"Difference in Positive Proportions in Predicted Labels (DPPL)\",\n",
+      "                            \"value\": 0.10575508800386199\n",
+      "                        },\n",
+      "                        {\n",
+      "                            \"name\": \"DRR\",\n",
+      "                            \"description\": \"Difference in Rejection Rates (DRR)\",\n",
+      "                            \"value\": 0.12672179028324737\n",
+      "                        },\n",
+      "                        {\n",
+      "                            \"name\": \"FT\",\n",
+      "                            \"description\": \"Flip Test (FT)\",\n",
+      "                            \"value\": -0.0014312001635657329\n",
+      "                        },\n",
+      "                        {\n",
+      "                            \"name\": \"GE\",\n",
+      "                            \"description\": \"Generalized Entropy (GE)\",\n",
+      "                            \"value\": 0.07818223296686777\n",
+      "                        },\n",
+      "                        {\n",
+      "                            \"name\": \"RD\",\n",
+      "                            \"description\": \"Recall Difference (RD)\",\n",
+      "                            \"value\": 0.0326136839121573\n",
+      "                        },\n",
+      "                        {\n",
+      "                            \"name\": \"SD\",\n",
+      "                            \"description\": \"Specificity Difference (SD)\",\n",
+      "                            \"value\": 0.008260569337440238\n",
+      "                        },\n",
+      "                        {\n",
+      "                            \"name\": \"TE\",\n",
+      "                            \"description\": \"Treatment Equality (TE)\",\n",
+      "                            \"value\": 5.872968091511137\n",
+      "                        }\n",
+      "                    ]\n",
+      "                }\n",
+      "            ]\n",
+      "        },\n",
+      "        \"label_value_or_threshold\": \"1\"\n",
+      "    },\n",
+      "    \"pre_training_bias_metrics\": {\n",
+      "        \"label\": \"Target\",\n",
+      "        \"facets\": {\n",
+      "            \"Sex\": [\n",
+      "                {\n",
+      "                    \"value_or_threshold\": \"0\",\n",
+      "                    \"metrics\": [\n",
+      "                        {\n",
+      "                            \"name\": \"CDDL\",\n",
+      "                            \"description\": \"Conditional Demographic Disparity in Labels (CDDL)\",\n",
+      "                            \"value\": 0.214915908649356\n",
+      "                        },\n",
+      "                        {\n",
+      "                            \"name\": \"CI\",\n",
+      "                            \"description\": \"Class Imbalance (CI)\",\n",
+      "                            \"value\": 0.3513692725946555\n",
+      "                        },\n",
+      "                        {\n",
+      "                            \"name\": \"DPL\",\n",
+      "                            \"description\": \"Difference in Positive Proportions in Labels (DPL)\",\n",
+      "                            \"value\": 0.20015891077100018\n",
+      "                        },\n",
+      "                        {\n",
+      "                            \"name\": \"JS\",\n",
+      "                            \"description\": \"Jensen-Shannon Divergence (JS)\",\n",
+      "                            \"value\": 0.03075614465977302\n",
+      "                        },\n",
+      "                        {\n",
+      "                            \"name\": \"KL\",\n",
+      "                            \"description\": \"Kullback-Liebler Divergence (KL)\",\n",
+      "                            \"value\": 0.14306865156306428\n",
+      "                        },\n",
+      "                        {\n",
+      "                            \"name\": \"KS\",\n",
+      "                            \"description\": \"Kolmogorov-Smirnov Distance (KS)\",\n",
+      "                            \"value\": 0.20015891077100018\n",
+      "                        },\n",
+      "                        {\n",
+      "                            \"name\": \"LP\",\n",
+      "                            \"description\": \"L-p Norm (LP)\",\n",
+      "                            \"value\": 0.2830674462421746\n",
+      "                        },\n",
+      "                        {\n",
+      "                            \"name\": \"TVD\",\n",
+      "                            \"description\": \"Total Variation Distance (TVD)\",\n",
+      "                            \"value\": 0.20015891077100015\n",
+      "                        }\n",
+      "                    ]\n",
+      "                }\n",
+      "            ]\n",
+      "        },\n",
+      "        \"label_value_or_threshold\": \"1\"\n",
+      "    }\u001b[0m\n",
+      "\u001b[34m}\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:26:58,556 exit_message: Completed: SageMaker XAI Analyzer ran successfully\u001b[0m\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "with Run(\n",
+    "    experiment_name=experiment_name,\n",
+    "    run_name=\"bias-only\",  # create an experiment run with only the bias analysis on it\n",
+    "    sagemaker_session=sagemaker_session,\n",
+    ") as run:\n",
+    "    clarify_processor.run_bias(\n",
+    "        data_config=bias_data_config,\n",
+    "        bias_config=bias_config,\n",
+    "        model_config=model_config,\n",
+    "        model_predicted_label_config=predictions_config,\n",
+    "        pre_training_methods=\"all\",\n",
+    "        post_training_methods=\"all\",\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "47890439-ca47-4fea-99ae-501ede19aa3b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "shap_config = clarify.SHAPConfig(\n",
+    "    baseline=[test_features.iloc[0].values.tolist()],\n",
+    "    num_samples=15,\n",
+    "    agg_method=\"mean_abs\",\n",
+    "    save_local_shap_values=True,\n",
+    ")\n",
+    "\n",
+    "explainability_output_path = \"s3://{}/{}/clarify-explainability\".format(\n",
+    "    default_bucket, default_prefix\n",
+    ")\n",
+    "explainability_data_config = clarify.DataConfig(\n",
+    "    s3_data_input_path=train_uri,\n",
+    "    s3_output_path=explainability_output_path,\n",
+    "    label=\"Target\",\n",
+    "    headers=training_data.columns.to_list(),\n",
+    "    dataset_type=\"text/csv\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "4eb95897-a60a-4416-b49d-cd539c043fdd",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "INFO:sagemaker.clarify:Analysis Config: {'dataset_type': 'text/csv', 'headers': ['Target', 'Age', 'Workclass', 'fnlwgt', 'Education', 'Education-Num', 'Marital Status', 'Occupation', 'Relationship', 'Ethnic group', 'Sex', 'Capital Gain', 'Capital Loss', 'Hours per week', 'Country'], 'label': 'Target', 'predictor': {'model_name': 'Lecture-DEMO-clarify-bias-model-17-01-2024-10-12-26', 'instance_type': 'ml.m5.xlarge', 'initial_instance_count': 1, 'accept_type': 'text/csv', 'content_type': 'text/csv'}, 'methods': {'report': {'name': 'report', 'title': 'Analysis Report'}, 'shap': {'use_logit': False, 'save_local_shap_values': True, 'baseline': [[25, 2, 226802, 1, 7, 4, 6, 3, 2, 1, 0, 0, 40, 37]], 'num_samples': 15, 'agg_method': 'mean_abs'}}}\n",
+      "INFO:sagemaker:Creating processing-job with name Clarify-Explainability-2024-01-17-10-27-29-944\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "Job Name:  Clarify-Explainability-2024-01-17-10-27-29-944\n",
+      "Inputs:  [{'InputName': 'dataset', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-767397847434/sagemaker/fastcampus-sagemaker-clarify-bias-practice/train_data.csv', 'LocalPath': '/opt/ml/processing/input/data', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'analysis_config', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-2-767397847434/sagemaker/fastcampus-sagemaker-clarify-bias-practice/clarify-explainability/analysis_config.json', 'LocalPath': '/opt/ml/processing/input/config', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]\n",
+      "Outputs:  [{'OutputName': 'analysis_result', 'AppManaged': False, 'S3Output': {'S3Uri': 's3://sagemaker-us-east-2-767397847434/sagemaker/fastcampus-sagemaker-clarify-bias-practice/clarify-explainability', 'LocalPath': '/opt/ml/processing/output', 'S3UploadMode': 'EndOfJob'}}]\n",
+      ".............................\u001b[34m2024-01-17 10:32:05,324 logging.conf not found when configuring logging, using default logging configuration.\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:32:05,325 Starting SageMaker Clarify Processing job\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:32:05,325 Analysis config path: /opt/ml/processing/input/config/analysis_config.json\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:32:05,325 Analysis result path: /opt/ml/processing/output\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:32:05,326 This host is algo-1.\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:32:05,326 This host is the leader.\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:32:05,326 Number of hosts in the cluster is 1.\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:32:05,594 Running Python / Pandas based analyzer.\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:32:05,594 Dataset type: text/csv uri: /opt/ml/processing/input/data\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:32:05,604 Loading dataset...\u001b[0m\n",
+      "\u001b[34m/usr/local/lib/python3.9/site-packages/analyzer/data_loading/csv_data_loader.py:336: FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.\n",
+      "  df = df.append(df_tmp, ignore_index=True)\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:32:05,639 Loaded dataset. Dataset info:\u001b[0m\n",
+      "\u001b[34m<class 'pandas.core.frame.DataFrame'>\u001b[0m\n",
+      "\u001b[34mRangeIndex: 30162 entries, 0 to 30161\u001b[0m\n",
+      "\u001b[34mData columns (total 14 columns):\n",
+      " #   Column          Non-Null Count  Dtype\u001b[0m\n",
+      "\u001b[34m---  ------          --------------  -----\n",
+      " 0   Age             30162 non-null  int64\n",
+      " 1   Workclass       30162 non-null  int64\n",
+      " 2   fnlwgt          30162 non-null  int64\n",
+      " 3   Education       30162 non-null  int64\n",
+      " 4   Education-Num   30162 non-null  int64\n",
+      " 5   Marital Status  30162 non-null  int64\n",
+      " 6   Occupation      30162 non-null  int64\n",
+      " 7   Relationship    30162 non-null  int64\n",
+      " 8   Ethnic group    30162 non-null  int64\n",
+      " 9   Sex             30162 non-null  int64\n",
+      " 10  Capital Gain    30162 non-null  int64\n",
+      " 11  Capital Loss    30162 non-null  int64\n",
+      " 12  Hours per week  30162 non-null  int64\n",
+      " 13  Country         30162 non-null  int64\u001b[0m\n",
+      "\u001b[34mdtypes: int64(14)\u001b[0m\n",
+      "\u001b[34mmemory usage: 3.2 MB\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:32:05,730 Spinning up shadow endpoint\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:32:05,730 Creating endpoint-config with name sm-clarify-config-1705487525-78e8\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:32:06,100 Creating endpoint: 'sm-clarify-Lecture-DEMO-clarify-bias-model-17-0-1705487526-7abe'\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:32:06,158 No endpoints ruleset found for service sagemaker-internal, falling back to legacy endpoint routing.\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:32:06,621 Using endpoint name: sm-clarify-Lecture-DEMO-clarify-bias-model-17-0-1705487526-7abe\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:32:06,621 Waiting for endpoint ...\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:32:06,621 Checking endpoint status:\u001b[0m\n",
+      "\u001b[34mLegend:\u001b[0m\n",
+      "\u001b[34m(OutOfService: x, Creating: -, Updating: -, InService: !, RollingBack: <, Deleting: o, Failed: *)\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:35:07,138 Endpoint is in service after 181 seconds\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:35:07,138 Endpoint ready.\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:35:07,238 Clarify Kernel SHAP n_coalitions: 15, n_instances: 1, n_features_to_explain: 14, model_output_size: 1\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:35:07,238 =====================================================\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:35:07,238 Shap analyzer: explaining 30162 rows, 14 columns...\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:35:07,238 =====================================================\n",
+      "  0% (0 of 30162) |                      | Elapsed Time: 0:00:00 ETA:  --:--:--\u001b[0m\n",
+      "\u001b[34m 10% (3201 of 30162) |##                 | Elapsed Time: 0:00:30 ETA:   0:04:12\u001b[0m\n",
+      "\u001b[34m 22% (6656 of 30162) |####               | Elapsed Time: 0:01:00 ETA:   0:03:24\u001b[0m\n",
+      "\u001b[34m 33% (10190 of 30162) |######            | Elapsed Time: 0:01:30 ETA:   0:02:49\u001b[0m\n",
+      "\u001b[34m 45% (13642 of 30162) |########          | Elapsed Time: 0:02:00 ETA:   0:02:23\u001b[0m\n",
+      "\u001b[34m 56% (17137 of 30162) |##########        | Elapsed Time: 0:02:30 ETA:   0:01:51\u001b[0m\n",
+      "\u001b[34m 68% (20561 of 30162) |############      | Elapsed Time: 0:03:00 ETA:   0:01:24\u001b[0m\n",
+      "\u001b[34m 79% (23975 of 30162) |##############    | Elapsed Time: 0:03:30 ETA:   0:00:54\u001b[0m\n",
+      "\u001b[34m 90% (27407 of 30162) |################  | Elapsed Time: 0:04:00 ETA:   0:00:24\u001b[0m\n",
+      "\u001b[34m100% (30162 of 30162) |##################| Elapsed Time: 0:04:23 Time:  0:04:23\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:39:31,250 getting explanations took 264.01 seconds.\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:39:31,251 ===================================================\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:39:31,251 Falling back to generic labels: label0, label1, ...\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:39:33,195 converting explanations to tabular took 1.94 seconds.\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:39:33,195 ===================================================\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:39:33,198 Wrote baseline used to compute explanations to: /opt/ml/processing/output/explanations_shap/baseline.csv\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:39:33,880 Wrote 30162 local explanations to: /opt/ml/processing/output/explanations_shap/out.csv\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:39:33,880 writing local explanations took 0.68 seconds.\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:39:33,880 ===================================================\u001b[0m\n",
+      "\u001b[34m/usr/local/lib/python3.9/site-packages/numpy/core/fromnumeric.py:3430: FutureWarning: In a future version, DataFrame.mean(axis=None) will return a scalar mean over the entire DataFrame. To retain the old behavior, use 'frame.mean(axis=0)' or just 'frame.mean()'\n",
+      "  return mean(axis=axis, dtype=dtype, out=out, **kwargs)\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:39:33,884 aggregating local explanations took 0.00 seconds.\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:39:33,884 ===================================================\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:39:33,884 Shap analysis finished.\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:39:33,885 Calculated global analysis with predictor\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:39:33,885 Stop using endpoint: sm-clarify-Lecture-DEMO-clarify-bias-model-17-0-1705487526-7abe\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:39:33,885 Deleting endpoint configuration with name: sm-clarify-config-1705487525-78e8\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:39:34,073 Deleting endpoint with name: sm-clarify-Lecture-DEMO-clarify-bias-model-17-0-1705487526-7abe\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:39:34,191 Model endpoint delivered 113.08124 requests per second and a total of 30164 requests over 267 seconds\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:39:34,191 Calculated global analysis without predictor\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:39:52,900 Stop using endpoint: None\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:40:01,343 ['jupyter', 'nbconvert', '--to', 'html', '--output', '/opt/ml/processing/output/report.html', '/opt/ml/processing/output/report.ipynb', '--template', 'sagemaker-xai']\u001b[0m\n",
+      "\u001b[34m[NbConvertApp] Converting notebook /opt/ml/processing/output/report.ipynb to html\u001b[0m\n",
+      "\u001b[34m[NbConvertApp] Writing 535378 bytes to /opt/ml/processing/output/report.html\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:40:02,325 ['wkhtmltopdf', '-q', '--enable-local-file-access', '/opt/ml/processing/output/report.html', '/opt/ml/processing/output/report.pdf']\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:40:02,991 Collected analyses: \u001b[0m\n",
+      "\u001b[34m{\n",
+      "    \"version\": \"1.0\",\n",
+      "    \"explanations\": {\n",
+      "        \"kernel_shap\": {\n",
+      "            \"label0\": {\n",
+      "                \"global_shap_values\": {\n",
+      "                    \"Age\": 0.03641889311978478,\n",
+      "                    \"Workclass\": 0.018266321647830524,\n",
+      "                    \"fnlwgt\": 0.021253751560726748,\n",
+      "                    \"Education\": 0.018540646066814045,\n",
+      "                    \"Education-Num\": 0.037158876677162146,\n",
+      "                    \"Marital Status\": 0.028987967748786588,\n",
+      "                    \"Occupation\": 0.026774353559974554,\n",
+      "                    \"Relationship\": 0.03638060910768331,\n",
+      "                    \"Ethnic group\": 0.019755784723812633,\n",
+      "                    \"Sex\": 0.017970951336745932,\n",
+      "                    \"Capital Gain\": 0.03342374795175269,\n",
+      "                    \"Capital Loss\": 0.0196018076330827,\n",
+      "                    \"Hours per week\": 0.021397514319347735,\n",
+      "                    \"Country\": 0.04666602078225621\n",
+      "                },\n",
+      "                \"expected_value\": 0.0006380207487381995\n",
+      "            }\n",
+      "        }\n",
+      "    }\u001b[0m\n",
+      "\u001b[34m}\u001b[0m\n",
+      "\u001b[34m2024-01-17 10:40:02,992 exit_message: Completed: SageMaker XAI Analyzer ran successfully\u001b[0m\n",
+      "\u001b[34m--!\u001b[0m\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "with Run(\n",
+    "    experiment_name=experiment_name,\n",
+    "    # create an experiment run with only the model explainability analysis on it\n",
+    "    run_name=\"explainabilit-only\",\n",
+    "    sagemaker_session=sagemaker_session,\n",
+    ") as run:\n",
+    "    clarify_processor.run_explainability(\n",
+    "        data_config=explainability_data_config,\n",
+    "        model_config=model_config,\n",
+    "        explainability_config=shap_config,\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3a9192ba-52f8-4a43-aabd-5ff618fee629",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}