Skip to content

Commit

Permalink
Add dataset /presidio-entities endpoint (#2846)
Browse files Browse the repository at this point in the history
* add dataset-presidio-entities-count

* add to graph

* fix tests for graph

* add /presidio-entities endpoint

* child of dataset-split-names

* add openapi

* fix tests

* fix openapi

* Apply suggestions from code review

Co-authored-by: Sylvain Lesage <[email protected]>

---------

Co-authored-by: Sylvain Lesage <[email protected]>
  • Loading branch information
lhoestq and severo authored May 22, 2024
1 parent 9ba61fc commit f69fb2e
Show file tree
Hide file tree
Showing 9 changed files with 705 additions and 6 deletions.
187 changes: 186 additions & 1 deletion docs/source/openapi.json
Original file line number Diff line number Diff line change
Expand Up @@ -1061,7 +1061,8 @@
"num_opt_out_urls",
"num_urls",
"num_scanned_rows",
"has_urls_columns"
"has_urls_columns",
"full_scan"
],
"properties": {
"urls_columns": {
Expand All @@ -1088,6 +1089,45 @@
"full_scan": { "anyOf": [{ "type": "boolean" }, { "type": "null" }] }
}
},
"PresidioEntitiesCountResponse": {
"type": "object",
"required": [
"scanned_columns",
"num_rows_with_person_entities",
"num_rows_with_phone_number_entities",
"num_rows_with_email_address_entities",
"num_rows_with_sensitive_pii",
"num_scanned_rows",
    "has_scanned_columns",
    "full_scan"
],
"properties": {
"scanned_columns": {
"type": "array",
"items": {
"type": "string"
}
},
"num_rows_with_person_entities": {
"type": "integer"
},
"num_rows_with_phone_number_entities": {
"type": "integer"
},
"num_rows_with_email_address_entities": {
"type": "integer"
},
"num_rows_with_sensitive_pii": {
"type": "integer"
},
"num_scanned_rows": {
"type": "integer"
},
"has_scanned_columns": {
"type": "boolean"
},
"full_scan": { "anyOf": [{ "type": "boolean" }, { "type": "null" }] }
}
},
"ColumnType": {
"type": "string",
"enum": [
Expand Down Expand Up @@ -5449,6 +5489,151 @@
}
}
},
"/presidio-entities": {
"get": {
"summary": "Get the number of rows containing Presidio entities in a dataset.",
      "description": "Based on Presidio, returns the number of rows containing names, emails, phone numbers, or sensitive PII. Only a sample of the rows is scanned, the first 10K rows at the moment.",
"externalDocs": {
"description": "See https://microsoft.github.io/presidio/. The Hub docs are still missing for the endpoint, see https://github.com/huggingface/dataset-viewer/issues/1664.",
"url": "https://huggingface.co/docs/datasets-server/"
},
"operationId": "getPresidioEntities",
"security": [
{},
{
"AuthorizationHuggingFaceApiToken": []
},
{
"AuthorizationHuggingFaceJWT": []
}
],
"parameters": [
{
"$ref": "#/components/parameters/RequiredDataset"
}
],
"responses": {
"200": {
"description": "The number of Presidio entities in the dataset.",
"headers": {
"Cache-Control": {
"$ref": "#/components/headers/Cache-Control"
},
"Access-Control-Allow-Origin": {
"$ref": "#/components/headers/Access-Control-Allow-Origin"
}
},
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/PresidioEntitiesCountResponse"
},
"examples": {
                  "number of entities for a dataset": {
                    "summary": "number of entities for a dataset.",
"description": "Try with https://datasets-server.huggingface.co/presidio-entities?dataset=lhoestq/fake_name_and_ssn",
"value": {
"scanned_columns": ["fake_name", "fake_ssn"],
"num_rows_with_person_entities": 3,
"num_rows_with_phone_number_entities": 0,
"num_rows_with_email_address_entities": 0,
"num_rows_with_sensitive_pii": 2,
"num_scanned_rows": 3,
                      "has_scanned_columns": true,
"full_scan": true
}
},
                  "dataset that has no scanned columns": {
"summary": "no scanned columns: values are zero.",
"description": "Try with https://datasets-server.huggingface.co/presidio-entities?dataset=mnist",
"value": {
"scanned_columns": [],
"num_rows_with_person_entities": 0,
"num_rows_with_phone_number_entities": 0,
"num_rows_with_email_address_entities": 0,
"num_rows_with_sensitive_pii": 0,
"num_scanned_rows": 0,
"has_scanned_columns": false,
"full_scan": false
}
}
}
}
}
},
"401": {
"$ref": "#/components/responses/Common401"
},
"404": {
"$ref": "#/components/responses/DatasetConfigSplit404"
},
"422": {
"$ref": "#/components/responses/Dataset422"
},
"500": {
"description": "The server crashed, the response still hasn't been generated (the process is asynchronous), or the response couldn't be generated successfully due to an error in the dataset itself. The client can retry after a time, in particular in the case of the response still being processed. If the error does not vanish, it's possibly due to a bug in the API software or in the dataset, and should be reported.",
"headers": {
"Cache-Control": {
"$ref": "#/components/headers/Cache-Control"
},
"Access-Control-Allow-Origin": {
"$ref": "#/components/headers/Access-Control-Allow-Origin"
},
"X-Error-Code": {
"$ref": "#/components/headers/X-Error-Code-500"
}
},
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CustomError"
},
"examples": {
"response not ready": {
"$ref": "#/components/examples/ResponseNotReadyError"
},
"unexpected error": {
"$ref": "#/components/examples/UnexpectedJsonError"
}
}
},
"text/plain": {
"schema": {
"$ref": "#/components/schemas/ServerErrorResponse"
},
"examples": {
"internal server error": {
"$ref": "#/components/examples/UnexpectedTextError"
}
}
}
}
},
"501": {
"description": "The server does not implement the feature or Presidio is not enabled on this dataset.",
"headers": {
"Cache-Control": {
"$ref": "#/components/headers/Cache-Control"
},
"Access-Control-Allow-Origin": {
"$ref": "#/components/headers/Access-Control-Allow-Origin"
},
"X-Error-Code": {
"$ref": "#/components/headers/X-Error-Code-501"
}
},
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/CustomError"
},
"examples": {}
}
}
}
}
}
},
"/statistics": {
"get": {
"summary": "Descriptive statistics of a split's columns",
Expand Down
9 changes: 9 additions & 0 deletions libs/libcommon/src/libcommon/processing_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,15 @@ def parse_id(id: str) -> tuple[str, str, Optional[str], Optional[str], str]:
"job_runner_version": 1,
"difficulty": 70,
},
"dataset-presidio-entities-count": {
"input_type": "dataset",
"triggered_by": [
"dataset-split-names", # required in case the dataset has no configs (error in previous step)
"split-presidio-scan",
],
"job_runner_version": 1,
"difficulty": 20,
},
"split-duckdb-index": {
"input_type": "split",
"triggered_by": "config-parquet-metadata",
Expand Down
7 changes: 6 additions & 1 deletion libs/libcommon/tests/test_backfill_on_real_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ def test_plan_job_creation_and_termination() -> None:
"dataset-modalities,dataset,revision",
"dataset-opt-in-out-urls-count,dataset,revision",
"dataset-parquet,dataset,revision",
"dataset-presidio-entities-count,dataset,revision",
"dataset-size,dataset,revision",
"dataset-split-names,dataset,revision",
"dataset-croissant-crumbs,dataset,revision",
Expand All @@ -68,7 +69,7 @@ def test_plan_job_creation_and_termination() -> None:
# The queue is empty, so no step is in process.
queue_status={"in_process": []},
# The root dataset-level steps, as well as the "fan-in" steps, are ready to be backfilled.
tasks=["CreateJobs,12"],
tasks=["CreateJobs,13"],
)

dataset_backfill_plan.run()
Expand All @@ -94,6 +95,7 @@ def test_plan_job_creation_and_termination() -> None:
"dataset-modalities,dataset,revision",
"dataset-opt-in-out-urls-count,dataset,revision",
"dataset-parquet,dataset,revision",
"dataset-presidio-entities-count,dataset,revision",
"dataset-size,dataset,revision",
"dataset-split-names,dataset,revision",
"dataset-croissant-crumbs,dataset,revision",
Expand All @@ -112,6 +114,7 @@ def test_plan_job_creation_and_termination() -> None:
"dataset-is-valid,dataset,revision",
"dataset-opt-in-out-urls-count,dataset,revision",
"dataset-parquet,dataset,revision",
"dataset-presidio-entities-count,dataset,revision",
"dataset-size,dataset,revision",
"dataset-compatible-libraries,dataset,revision",
"dataset-modalities,dataset,revision",
Expand Down Expand Up @@ -177,6 +180,7 @@ def test_plan_job_creation_and_termination() -> None:
"dataset-modalities,dataset,revision",
"dataset-opt-in-out-urls-count,dataset,revision",
"dataset-parquet,dataset,revision",
"dataset-presidio-entities-count,dataset,revision",
"dataset-size,dataset,revision",
"dataset-split-names,dataset,revision",
"dataset-croissant-crumbs,dataset,revision",
Expand All @@ -194,6 +198,7 @@ def test_plan_job_creation_and_termination() -> None:
"dataset-is-valid,dataset,revision",
"dataset-opt-in-out-urls-count,dataset,revision",
"dataset-parquet,dataset,revision",
"dataset-presidio-entities-count,dataset,revision",
"dataset-size,dataset,revision",
"dataset-compatible-libraries,dataset,revision",
"dataset-modalities,dataset,revision",
Expand Down
21 changes: 19 additions & 2 deletions libs/libcommon/tests/test_processing_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,9 @@ def test_graph() -> None:
),
(
"dataset-split-names",
[],
[
"dataset-presidio-entities-count",
],
[
"dataset-config-names",
"config-split-names",
Expand Down Expand Up @@ -273,7 +275,7 @@ def test_graph() -> None:
),
(
"split-presidio-scan",
[],
["dataset-presidio-entities-count"],
["config-parquet-metadata"],
[
"config-parquet",
Expand All @@ -282,6 +284,21 @@ def test_graph() -> None:
"dataset-config-names",
],
),
(
"dataset-presidio-entities-count",
[],
["dataset-split-names", "split-presidio-scan"],
[
"config-info",
"config-parquet",
"config-parquet-and-info",
"config-parquet-metadata",
"config-split-names",
"dataset-config-names",
"dataset-split-names",
"split-presidio-scan",
],
),
(
"split-duckdb-index",
["config-duckdb-index-size", "split-is-valid"],
Expand Down
3 changes: 3 additions & 0 deletions services/api/src/api/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,9 @@ class EndpointConfig:
"config": "config-opt-in-out-urls-count",
"split": "split-opt-in-out-urls-count",
},
"/presidio-entities": {
"dataset": "dataset-presidio-entities-count",
},
"/is-valid": {
"dataset": "dataset-is-valid",
"config": "config-is-valid",
Expand Down
15 changes: 13 additions & 2 deletions services/worker/src/worker/dtos.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ class PresidioEntity(TypedDict):
column_name: str


class PresidioEntitiesCountResponse(TypedDict):
class PresidioAllEntitiesCountResponse(TypedDict):
scanned_columns: list[str]
num_in_vehicle_registration_entities: int
num_organization_entities: int
Expand Down Expand Up @@ -145,10 +145,21 @@ class PresidioEntitiesCountResponse(TypedDict):
full_scan: Union[bool, None]


class PresidioEntitiesScanResponse(PresidioEntitiesCountResponse):
class PresidioEntitiesScanResponse(PresidioAllEntitiesCountResponse):
entities: list[PresidioEntity]


class PresidioEntitiesCountResponse(TypedDict):
scanned_columns: list[str]
num_rows_with_person_entities: int
num_rows_with_phone_number_entities: int
num_rows_with_email_address_entities: int
num_rows_with_sensitive_pii: int
num_scanned_rows: int
has_scanned_columns: bool
full_scan: Union[bool, None]


class ImageUrlColumnsResponse(TypedDict):
columns: list[str]

Expand Down
7 changes: 7 additions & 0 deletions services/worker/src/worker/job_runner_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
DatasetOptInOutUrlsCountJobRunner,
)
from worker.job_runners.dataset.parquet import DatasetParquetJobRunner
from worker.job_runners.dataset.presidio_entities_count import DatasetPresidioEntitiesCountJobRunner
from worker.job_runners.dataset.size import DatasetSizeJobRunner
from worker.job_runners.dataset.split_names import DatasetSplitNamesJobRunner
from worker.job_runners.split.descriptive_statistics import (
Expand Down Expand Up @@ -199,6 +200,11 @@ def _create_job_runner(self, job_info: JobInfo) -> JobRunner:
app_config=self.app_config,
hf_datasets_cache=self.hf_datasets_cache,
)
if job_type == DatasetPresidioEntitiesCountJobRunner.get_job_type():
return DatasetPresidioEntitiesCountJobRunner(
job_info=job_info,
app_config=self.app_config,
)
if job_type == SplitDescriptiveStatisticsJobRunner.get_job_type():
return SplitDescriptiveStatisticsJobRunner(
job_info=job_info,
Expand Down Expand Up @@ -264,6 +270,7 @@ def _create_job_runner(self, job_info: JobInfo) -> JobRunner:
ConfigOptInOutUrlsCountJobRunner.get_job_type(),
DatasetOptInOutUrlsCountJobRunner.get_job_type(),
SplitPresidioEntitiesScanJobRunner.get_job_type(),
DatasetPresidioEntitiesCountJobRunner.get_job_type(),
SplitDuckDbIndexJobRunner.get_job_type(),
SplitDescriptiveStatisticsJobRunner.get_job_type(),
ConfigDuckdbIndexSizeJobRunner.get_job_type(),
Expand Down
Loading

0 comments on commit f69fb2e

Please sign in to comment.