feat(classifier): Add support for excluding list of exact column names (datahub-project#9472)

Co-authored-by: Ethan Cartwright <[email protected]>
acryldata · Jan 17, 2024 · dfb2f7e · dfb2f7e
1 parent acfc315
commit dfb2f7e
Show file tree

Hide file tree

Showing 5 changed files with 79 additions and 2 deletions.
diff --git a/docker/datahub-ingestion-base/base-requirements.txt b/docker/datahub-ingestion-base/base-requirements.txt
@@ -1,5 +1,5 @@
 # Generated requirements file. Run ./regenerate-base-requirements.sh to regenerate.
-acryl-datahub-classify==0.0.8
+acryl-datahub-classify==0.0.9
 acryl-PyHive==0.6.16
 acryl-sqlglot==20.4.1.dev14
 aenum==3.1.15

diff --git a/metadata-ingestion/docs/dev_guides/classification.md b/metadata-ingestion/docs/dev_guides/classification.md
@@ -31,9 +31,11 @@ DataHub Classifier is the default classifier implementation, which uses [acryl-d
 | Field                                                  | Required                                              | Type                                           | Description                                                                                                                                               | Default                                                                                                                                                               |
 | ------------------------------------------------------ | ----------------------------------------------------- | ---------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | confidence_level_threshold                             |                                                       | number                                         |                                                                                                                                                           | 0.68                                                                                                                                                                  |
+| strip_exclusion_formatting                             |                                                       | bool                                           | Controls how names in the exclusion list are matched. If `True`, formatting is stripped before matching (case-insensitive, with punctuation and special characters removed); if `False`, names must match exactly. | True                                                                                                                                                                  |
 | info_types                                             |                                                       | list[string]                                   | List of infotypes to be predicted. By default, all supported infotypes are considered, along with any custom infotypes configured in `info_types_config`. | None                                                                                                                                                                  |
 | info_types_config                                      | Configuration details for infotypes                   | Dict[str, InfoTypeConfig]                      |                                                                                                                                                           | See [reference_input.py](https://github.com/acryldata/datahub-classify/blob/main/datahub-classify/src/datahub_classify/reference_input.py) for default configuration. |
 | info_types_config.`key`.prediction_factors_and_weights | ❓ (required if info_types_config.`key` is set)        | Dict[str,number]                               | Factors and their weights to consider when predicting info types                                                                                          |                                                                                                                                                                       |
+| info_types_config.`key`.exclude_name                   |                                                       | list[string]                                   | Optional list of column names to exclude from classification for this info type. How the names are matched is controlled by `strip_exclusion_formatting`. | None                                                                                                                                                                  |
 | info_types_config.`key`.name                           |                                                       | NameFactorConfig (see below for fields)        |                                                                                                                                                           |                                                                                                                                                                       |
 | info_types_config.`key`.name.regex                     |                                                       | Array of string                                | List of regex patterns the column name follows for the info type                                                                                          | ['.*']                                                                                                                                                                |
 | info_types_config.`key`.description                    |                                                       | DescriptionFactorConfig (see below for fields) |                                                                                                                                                           |                                                                                                                                                                       |

diff --git a/metadata-ingestion/setup.py b/metadata-ingestion/setup.py
@@ -189,7 +189,7 @@
     "pandas",
     "cryptography",
     "msal",
-    "acryl-datahub-classify==0.0.8",
+    "acryl-datahub-classify==0.0.9",
     # spacy version restricted to reduce backtracking, used by acryl-datahub-classify,
     "spacy==3.4.3",
 }

diff --git a/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py b/metadata-ingestion/src/datahub/ingestion/glossary/datahub_classifier.py
@@ -73,6 +73,11 @@ class Config:
         description="Factors and their weights to consider when predicting info types",
         alias="prediction_factors_and_weights",
     )
+    ExcludeName: Optional[List[str]] = Field(
+        default=None,
+        alias="exclude_name",
+        description="List of exact column names to exclude from classification for this info type",
+    )
     Name: Optional[NameFactorConfig] = Field(default=None, alias="name")
 
     Description: Optional[DescriptionFactorConfig] = Field(
@@ -95,6 +100,7 @@ class DataHubClassifierConfig(ConfigModel):
         default=0.68,
         description="The confidence threshold above which the prediction is considered as a proposal",
     )
+    strip_exclusion_formatting: bool = Field(default=True)
     info_types: Optional[List[str]] = Field(
         default=None,
         description="List of infotypes to be predicted. By default, all supported infotypes are considered, along with any custom infotypes configured in `info_types_config`.",

diff --git a/metadata-ingestion/tests/unit/test_classification.py b/metadata-ingestion/tests/unit/test_classification.py
@@ -157,3 +157,72 @@ def test_incorrect_custom_info_type_config():
                 },
             }
         )
+
+
+def test_exclude_name_config():
+    config = DataHubClassifier.create(
+        config_dict={
+            "confidence_level_threshold": 0.7,
+            "info_types_config": {
+                "Email_Address": {
+                    "Prediction_Factors_and_Weights": {
+                        "Name": 1,
+                        "Description": 0,
+                        "Datatype": 0,
+                        "Values": 0,
+                    },
+                    "ExcludeName": ["email_sent", "email_received"],
+                    "Name": {
+                        "regex": [
+                            "^.*mail.*id.*$",
+                            "^.*id.*mail.*$",
+                            "^.*mail.*add.*$",
+                            "^.*add.*mail.*$",
+                            "email",
+                            "mail",
+                        ]
+                    },
+                    "Description": {"regex": []},
+                    "Datatype": {"type": ["str"]},
+                    "Values": {"prediction_type": "regex", "regex": [], "library": []},
+                }
+            },
+        }
+    ).config
+    assert config.info_types_config["Email_Address"].ExcludeName is not None
+    assert config.info_types_config["Email_Address"].ExcludeName == [
+        "email_sent",
+        "email_received",
+    ]
+
+
+def test_no_exclude_name_config():
+    config = DataHubClassifier.create(
+        config_dict={
+            "confidence_level_threshold": 0.7,
+            "info_types_config": {
+                "Email_Address": {
+                    "Prediction_Factors_and_Weights": {
+                        "Name": 1,
+                        "Description": 0,
+                        "Datatype": 0,
+                        "Values": 0,
+                    },
+                    "Name": {
+                        "regex": [
+                            "^.*mail.*id.*$",
+                            "^.*id.*mail.*$",
+                            "^.*mail.*add.*$",
+                            "^.*add.*mail.*$",
+                            "email",
+                            "mail",
+                        ]
+                    },
+                    "Description": {"regex": []},
+                    "Datatype": {"type": ["str"]},
+                    "Values": {"prediction_type": "regex", "regex": [], "library": []},
+                }
+            },
+        }
+    ).config
+    assert config.info_types_config["Email_Address"].ExcludeName is None