Skip to content

Commit

Permalink
Add dataset /presidio-entities endpoint (#2846)
Browse files Browse the repository at this point in the history
* add dataset-presidio-entities-count

* add to graph

* fix tests for graph

* add /presidio-entities endpoint

* child of dataset-split-names

* add openapi

* fix tests

* fix openapi

* Apply suggestions from code review

Co-authored-by: Sylvain Lesage <[email protected]>

---------

Co-authored-by: Sylvain Lesage <[email protected]>
  • Loading branch information
lhoestq and severo authored May 22, 2024
1 parent 9ba61fc commit f69fb2e
Show file tree
Hide file tree
Showing 9 changed files with 705 additions and 6 deletions.
187 changes: 186 additions & 1 deletion docs/source/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -1061,7 +1061,8 @@
"num_opt_out_urls",
"num_urls",
"num_scanned_rows",
"has_urls_columns"
"has_urls_columns",
"full_scan"
],
"properties": {
"urls_columns": {
Expand All @@ -1088,6 +1089,45 @@
"full_scan": { "anyOf": [{ "type": "boolean" }, { "type": "null" }] }
}
},
"PresidioEntitiesCountResponse": {
"type": "object",
"required": [
"scanned_columns",
"num_rows_with_person_entities",
"num_rows_with_phone_number_entities",
"num_rows_with_email_address_entities",
"num_rows_with_sensitive_pii",
"num_scanned_rows",
    "has_scanned_columns",
    "full_scan"
],
"properties": {
"scanned_columns": {
"type": "array",
"items": {
"type": "string"
}
},
"num_rows_with_person_entities": {
"type": "integer"
},
"num_rows_with_phone_number_entities": {
"type": "integer"
},
"num_rows_with_email_address_entities": {
"type": "integer"
},
"num_rows_with_sensitive_pii": {
"type": "integer"
},
"num_scanned_rows": {
"type": "integer"
},
"has_scanned_columns": {
"type": "boolean"
},
"full_scan": { "anyOf": [{ "type": "boolean" }, { "type": "null" }] }
}
},
"ColumnType": {
"type": "string",
"enum": [
Expand Down Expand Up @@ -5449,6 +5489,151 @@
}
}
},
"/presidio-entities": {
"get": {
"summary": "Get the number of rows containing Presidio entities in a dataset.",
      "description": "Based on Presidio, returns the number of rows containing names, emails, phone numbers, or sensitive PII. Only a sample of the rows is scanned, the first 10K rows at the moment.",
"externalDocs": {
"description": "See https://microsoft.github.io/presidio/. The Hub docs are still missing for the endpoint, see https://github.com/huggingface/dataset-viewer/issues/1664.",
"url": "https://huggingface.co/docs/datasets-server/"
},
"operationId": "getPresidioEntities",
"security": [
{},
{
"AuthorizationHuggingFaceApiToken": []
},
{
"AuthorizationHuggingFaceJWT": []
}
],
"parameters": [
{
"$ref": "#/components/parameters/RequiredDataset"
}
],
"responses": {
"200": {
"description": "The number of Presidio entities in the dataset.",
"headers": {
"Cache-Control": {
"$ref": "#/components/headers/Cache-Control"
},
"Access-Control-Allow-Origin": {
"$ref": "#/components/headers/Access-Control-Allow-Origin"
}
},
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/PresidioEntitiesCountResponse"
},
"examples": {
                  "number of entities for a dataset": {
                    "summary": "number of entities for a dataset.",
"description": "Try with https://datasets-server.huggingface.co/presidio-entities?dataset=lhoestq/fake_name_and_ssn",
"value": {
"scanned_columns": ["fake_name", "fake_ssn"],
"num_rows_with_person_entities": 3,
"num_rows_with_phone_number_entities": 0,
"num_rows_with_email_address_entities": 0,
"num_rows_with_sensitive_pii": 2,
"num_scanned_rows": 3,
                      "has_scanned_columns": true,
"full_scan": true
}
},
                  "dataset that has no scanned columns": {
"summary": "no scanned columns: values are zero.",
"description": "Try with https://datasets-server.huggingface.co/presidio-entities?dataset=mnist",
"value": {
"scanned_columns": [],
"num_rows_with_person_entities": 0,
"num_rows_with_phone_number_entities": 0,
"num_rows_with_email_address_entities": 0,
"num_rows_with_sensitive_pii": 0,
"num_scanned_rows": 0,
"has_scanned_columns": false,
"full_scan": false
}
}
}
}
}
},
"401": {
"$ref": "#/components/responses/Common401"
},
"404": {
"$ref": "#/components/responses/DatasetConfigSplit404"
},
"422": {
"$ref": "#/components/responses/Dataset422"
},
"500": {
"description": "The server crashed, the response still hasn't been generated (the process is asynchronous), or the response couldn't be generated successfully due to an error in the dataset itself. The client can retry after a time, in particular in the case of the response still being processed. If the error does not vanish, it's possibly due to a bug in the API software or in the dataset, and should be reported.",
"headers": {
"Cache-Control": {
"$ref": "#/components/headers/Cache-Control"
},
"Access-Control-Allow-Origin": {
"$ref": "#/components/headers/Access-Control-Allow-Origin"
},
"X-Error-Code": {
"$ref": "#/components/headers/X-Error-Code-500"
}
},
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CustomError"
},
"examples": {
"response not ready": {
"$ref": "#/components/examples/ResponseNotReadyError"
},
"unexpected error": {
"$ref": "#/components/examples/UnexpectedJsonError"
}
}
},
"text/plain": {
"schema": {
"$ref": "#/components/schemas/ServerErrorResponse"
},
"examples": {
"internal server error": {
"$ref": "#/components/examples/UnexpectedTextError"
}
}
}
}
},
"501": {
"description": "The server does not implement the feature or Presidio is not enabled on this dataset.",
"headers": {
"Cache-Control": {
"$ref": "#/components/headers/Cache-Control"
},
"Access-Control-Allow-Origin": {
"$ref": "#/components/headers/Access-Control-Allow-Origin"
},
"X-Error-Code": {
"$ref": "#/components/headers/X-Error-Code-501"
}
},
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CustomError"
},
"examples": {}
}
}
}
}
}
},
"/statistics": {
"get": {
"summary": "Descriptive statistics of a split's columns",
Expand Down
9 changes: 9 additions & 0 deletions libs/libcommon/src/libcommon/processing_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,15 @@ def parse_id(id: str) -> tuple[str, str, Optional[str], Optional[str], str]:
"job_runner_version": 1,
"difficulty": 70,
},
"dataset-presidio-entities-count": {
"input_type": "dataset",
"triggered_by": [
"dataset-split-names", # required in case the dataset has no configs (error in previous step)
"split-presidio-scan",
],
"job_runner_version": 1,
"difficulty": 20,
},
"split-duckdb-index": {
"input_type": "split",
"triggered_by": "config-parquet-metadata",
Expand Down
7 changes: 6 additions & 1 deletion libs/libcommon/tests/test_backfill_on_real_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def test_plan_job_creation_and_termination() -> None:
"dataset-modalities,dataset,revision",
"dataset-opt-in-out-urls-count,dataset,revision",
"dataset-parquet,dataset,revision",
"dataset-presidio-entities-count,dataset,revision",
"dataset-size,dataset,revision",
"dataset-split-names,dataset,revision",
"dataset-croissant-crumbs,dataset,revision",
Expand All @@ -68,7 +69,7 @@ def test_plan_job_creation_and_termination() -> None:
# The queue is empty, so no step is in process.
queue_status={"in_process": []},
# The root dataset-level steps, as well as the "fan-in" steps, are ready to be backfilled.
tasks=["CreateJobs,12"],
tasks=["CreateJobs,13"],
)

dataset_backfill_plan.run()
Expand All @@ -94,6 +95,7 @@ def test_plan_job_creation_and_termination() -> None:
"dataset-modalities,dataset,revision",
"dataset-opt-in-out-urls-count,dataset,revision",
"dataset-parquet,dataset,revision",
"dataset-presidio-entities-count,dataset,revision",
"dataset-size,dataset,revision",
"dataset-split-names,dataset,revision",
"dataset-croissant-crumbs,dataset,revision",
Expand All @@ -112,6 +114,7 @@ def test_plan_job_creation_and_termination() -> None:
"dataset-is-valid,dataset,revision",
"dataset-opt-in-out-urls-count,dataset,revision",
"dataset-parquet,dataset,revision",
"dataset-presidio-entities-count,dataset,revision",
"dataset-size,dataset,revision",
"dataset-compatible-libraries,dataset,revision",
"dataset-modalities,dataset,revision",
Expand Down Expand Up @@ -177,6 +180,7 @@ def test_plan_job_creation_and_termination() -> None:
"dataset-modalities,dataset,revision",
"dataset-opt-in-out-urls-count,dataset,revision",
"dataset-parquet,dataset,revision",
"dataset-presidio-entities-count,dataset,revision",
"dataset-size,dataset,revision",
"dataset-split-names,dataset,revision",
"dataset-croissant-crumbs,dataset,revision",
Expand All @@ -194,6 +198,7 @@ def test_plan_job_creation_and_termination() -> None:
"dataset-is-valid,dataset,revision",
"dataset-opt-in-out-urls-count,dataset,revision",
"dataset-parquet,dataset,revision",
"dataset-presidio-entities-count,dataset,revision",
"dataset-size,dataset,revision",
"dataset-compatible-libraries,dataset,revision",
"dataset-modalities,dataset,revision",
Expand Down
21 changes: 19 additions & 2 deletions libs/libcommon/tests/test_processing_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,9 @@ def test_graph() -> None:
),
(
"dataset-split-names",
[],
[
"dataset-presidio-entities-count",
],
[
"dataset-config-names",
"config-split-names",
Expand Down Expand Up @@ -273,7 +275,7 @@ def test_graph() -> None:
),
(
"split-presidio-scan",
[],
["dataset-presidio-entities-count"],
["config-parquet-metadata"],
[
"config-parquet",
Expand All @@ -282,6 +284,21 @@ def test_graph() -> None:
"dataset-config-names",
],
),
(
"dataset-presidio-entities-count",
[],
["dataset-split-names", "split-presidio-scan"],
[
"config-info",
"config-parquet",
"config-parquet-and-info",
"config-parquet-metadata",
"config-split-names",
"dataset-config-names",
"dataset-split-names",
"split-presidio-scan",
],
),
(
"split-duckdb-index",
["config-duckdb-index-size", "split-is-valid"],
Expand Down
3 changes: 3 additions & 0 deletions services/api/src/api/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ class EndpointConfig:
"config": "config-opt-in-out-urls-count",
"split": "split-opt-in-out-urls-count",
},
"/presidio-entities": {
"dataset": "dataset-presidio-entities-count",
},
"/is-valid": {
"dataset": "dataset-is-valid",
"config": "config-is-valid",
Expand Down
15 changes: 13 additions & 2 deletions services/worker/src/worker/dtos.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ class PresidioEntity(TypedDict):
column_name: str


class PresidioEntitiesCountResponse(TypedDict):
class PresidioAllEntitiesCountResponse(TypedDict):
scanned_columns: list[str]
num_in_vehicle_registration_entities: int
num_organization_entities: int
Expand Down Expand Up @@ -145,10 +145,21 @@ class PresidioEntitiesCountResponse(TypedDict):
full_scan: Union[bool, None]


class PresidioEntitiesScanResponse(PresidioEntitiesCountResponse):
class PresidioEntitiesScanResponse(PresidioAllEntitiesCountResponse):
entities: list[PresidioEntity]


class PresidioEntitiesCountResponse(TypedDict):
scanned_columns: list[str]
num_rows_with_person_entities: int
num_rows_with_phone_number_entities: int
num_rows_with_email_address_entities: int
num_rows_with_sensitive_pii: int
num_scanned_rows: int
has_scanned_columns: bool
full_scan: Union[bool, None]


class ImageUrlColumnsResponse(TypedDict):
columns: list[str]

Expand Down
7 changes: 7 additions & 0 deletions services/worker/src/worker/job_runner_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
DatasetOptInOutUrlsCountJobRunner,
)
from worker.job_runners.dataset.parquet import DatasetParquetJobRunner
from worker.job_runners.dataset.presidio_entities_count import DatasetPresidioEntitiesCountJobRunner
from worker.job_runners.dataset.size import DatasetSizeJobRunner
from worker.job_runners.dataset.split_names import DatasetSplitNamesJobRunner
from worker.job_runners.split.descriptive_statistics import (
Expand Down Expand Up @@ -199,6 +200,11 @@ def _create_job_runner(self, job_info: JobInfo) -> JobRunner:
app_config=self.app_config,
hf_datasets_cache=self.hf_datasets_cache,
)
if job_type == DatasetPresidioEntitiesCountJobRunner.get_job_type():
return DatasetPresidioEntitiesCountJobRunner(
job_info=job_info,
app_config=self.app_config,
)
if job_type == SplitDescriptiveStatisticsJobRunner.get_job_type():
return SplitDescriptiveStatisticsJobRunner(
job_info=job_info,
Expand Down Expand Up @@ -264,6 +270,7 @@ def _create_job_runner(self, job_info: JobInfo) -> JobRunner:
ConfigOptInOutUrlsCountJobRunner.get_job_type(),
DatasetOptInOutUrlsCountJobRunner.get_job_type(),
SplitPresidioEntitiesScanJobRunner.get_job_type(),
DatasetPresidioEntitiesCountJobRunner.get_job_type(),
SplitDuckDbIndexJobRunner.get_job_type(),
SplitDescriptiveStatisticsJobRunner.get_job_type(),
ConfigDuckdbIndexSizeJobRunner.get_job_type(),
Expand Down
Loading

0 comments on commit f69fb2e

Please sign in to comment.