"PresidioEntitiesCountResponse": {
  "type": "object",
  "required": [
    "scanned_columns",
    "num_rows_with_person_entities",
    "num_rows_with_phone_number_entities",
    "num_rows_with_email_address_entities",
    "num_rows_with_sensitive_pii",
    "num_scanned_rows",
    "has_scanned_columns"
  ],
  "properties": {
    "scanned_columns": {
      "type": "array",
      "items": {
        "type": "string"
      }
    },
    "num_rows_with_person_entities": {
      "type": "integer"
    },
    "num_rows_with_phone_number_entities": {
      "type": "integer"
    },
    "num_rows_with_email_address_entities": {
      "type": "integer"
    },
    "num_rows_with_sensitive_pii": {
      "type": "integer"
    },
    "num_scanned_rows": {
      "type": "integer"
    },
    "has_scanned_columns": {
      "type": "boolean"
    },
    "full_scan": { "anyOf": [{ "type": "boolean" }, { "type": "null" }] }
  }
},
"/presidio-entities": {
  "get": {
    "summary": "Get the number of rows containing Presidio entities in a dataset.",
    "description": "Based on Presidio, returns the number of rows containing names, emails, phone numbers or sensitive PII. Only a sample of the rows is scanned, the first 10K rows at the moment.",
    "externalDocs": {
      "description": "See https://microsoft.github.io/presidio/. The Hub docs are still missing for the endpoint, see https://github.com/huggingface/dataset-viewer/issues/1664.",
      "url": "https://huggingface.co/docs/datasets-server/"
    },
    "operationId": "getPresidioEntities",
    "security": [
      {},
      {
        "AuthorizationHuggingFaceApiToken": []
      },
      {
        "AuthorizationHuggingFaceJWT": []
      }
    ],
    "parameters": [
      {
        "$ref": "#/components/parameters/RequiredDataset"
      }
    ],
    "responses": {
      "200": {
        "description": "The number of Presidio entities in the dataset.",
        "headers": {
          "Cache-Control": {
            "$ref": "#/components/headers/Cache-Control"
          },
          "Access-Control-Allow-Origin": {
            "$ref": "#/components/headers/Access-Control-Allow-Origin"
          }
        },
        "content": {
          "application/json": {
            "schema": {
              "$ref": "#/components/schemas/PresidioEntitiesCountResponse"
            },
            "examples": {
              "number of entities for a dataset": {
                "summary": "number of entities for a dataset.",
                "description": "Try with https://datasets-server.huggingface.co/presidio-entities?dataset=lhoestq/fake_name_and_ssn",
                "value": {
                  "scanned_columns": ["fake_name", "fake_ssn"],
                  "num_rows_with_person_entities": 3,
                  "num_rows_with_phone_number_entities": 0,
                  "num_rows_with_email_address_entities": 0,
                  "num_rows_with_sensitive_pii": 2,
                  "num_scanned_rows": 3,
                  "has_scanned_columns": true,
                  "full_scan": true
                }
              },
              "dataset that has no scanned columns": {
                "summary": "no scanned columns: values are zero.",
                "description": "Try with https://datasets-server.huggingface.co/presidio-entities?dataset=mnist",
                "value": {
                  "scanned_columns": [],
                  "num_rows_with_person_entities": 0,
                  "num_rows_with_phone_number_entities": 0,
                  "num_rows_with_email_address_entities": 0,
                  "num_rows_with_sensitive_pii": 0,
                  "num_scanned_rows": 0,
                  "has_scanned_columns": false,
                  "full_scan": false
                }
              }
            }
          }
        }
      },
      "401": {
        "$ref": "#/components/responses/Common401"
      },
      "404": {
        "$ref": "#/components/responses/DatasetConfigSplit404"
      },
      "422": {
        "$ref": "#/components/responses/Dataset422"
      },
      "500": {
        "description": "The server crashed, the response still hasn't been generated (the process is asynchronous), or the response couldn't be generated successfully due to an error in the dataset itself. The client can retry after a time, in particular in the case of the response still being processed. If the error does not vanish, it's possibly due to a bug in the API software or in the dataset, and should be reported.",
        "headers": {
          "Cache-Control": {
            "$ref": "#/components/headers/Cache-Control"
          },
          "Access-Control-Allow-Origin": {
            "$ref": "#/components/headers/Access-Control-Allow-Origin"
          },
          "X-Error-Code": {
            "$ref": "#/components/headers/X-Error-Code-500"
          }
        },
        "content": {
          "application/json": {
            "schema": {
              "$ref": "#/components/schemas/CustomError"
            },
            "examples": {
              "response not ready": {
                "$ref": "#/components/examples/ResponseNotReadyError"
              },
              "unexpected error": {
                "$ref": "#/components/examples/UnexpectedJsonError"
              }
            }
          },
          "text/plain": {
            "schema": {
              "$ref": "#/components/schemas/ServerErrorResponse"
            },
            "examples": {
              "internal server error": {
                "$ref": "#/components/examples/UnexpectedTextError"
              }
            }
          }
        }
      },
      "501": {
        "description": "The server does not implement the feature or Presidio is not enabled on this dataset.",
        "headers": {
          "Cache-Control": {
            "$ref": "#/components/headers/Cache-Control"
          },
          "Access-Control-Allow-Origin": {
            "$ref": "#/components/headers/Access-Control-Allow-Origin"
          },
          "X-Error-Code": {
            "$ref": "#/components/headers/X-Error-Code-501"
          }
        },
        "content": {
          "application/json": {
            "schema": {
              "$ref": "#/components/schemas/CustomError"
            },
            "examples": {}
          }
        }
      }
    }
  }
},
index dd683418a1..4cef264a77 100644 --- a/libs/libcommon/src/libcommon/processing_graph.py +++ b/libs/libcommon/src/libcommon/processing_graph.py @@ -655,6 +655,15 @@ def parse_id(id: str) -> tuple[str, str, Optional[str], Optional[str], str]: "job_runner_version": 1, "difficulty": 70, }, + "dataset-presidio-entities-count": { + "input_type": "dataset", + "triggered_by": [ + "dataset-split-names", # required in case the dataset has no configs (error in previous step) + "split-presidio-scan", + ], + "job_runner_version": 1, + "difficulty": 20, + }, "split-duckdb-index": { "input_type": "split", "triggered_by": "config-parquet-metadata", diff --git a/libs/libcommon/tests/test_backfill_on_real_graph.py b/libs/libcommon/tests/test_backfill_on_real_graph.py index 786ff67a73..8d534a71fa 100644 --- a/libs/libcommon/tests/test_backfill_on_real_graph.py +++ b/libs/libcommon/tests/test_backfill_on_real_graph.py @@ -57,6 +57,7 @@ def test_plan_job_creation_and_termination() -> None: "dataset-modalities,dataset,revision", "dataset-opt-in-out-urls-count,dataset,revision", "dataset-parquet,dataset,revision", + "dataset-presidio-entities-count,dataset,revision", "dataset-size,dataset,revision", "dataset-split-names,dataset,revision", "dataset-croissant-crumbs,dataset,revision", @@ -68,7 +69,7 @@ def test_plan_job_creation_and_termination() -> None: # The queue is empty, so no step is in process. queue_status={"in_process": []}, # The root dataset-level steps, as well as the "fan-in" steps, are ready to be backfilled. 
- tasks=["CreateJobs,12"], + tasks=["CreateJobs,13"], ) dataset_backfill_plan.run() @@ -94,6 +95,7 @@ def test_plan_job_creation_and_termination() -> None: "dataset-modalities,dataset,revision", "dataset-opt-in-out-urls-count,dataset,revision", "dataset-parquet,dataset,revision", + "dataset-presidio-entities-count,dataset,revision", "dataset-size,dataset,revision", "dataset-split-names,dataset,revision", "dataset-croissant-crumbs,dataset,revision", @@ -112,6 +114,7 @@ def test_plan_job_creation_and_termination() -> None: "dataset-is-valid,dataset,revision", "dataset-opt-in-out-urls-count,dataset,revision", "dataset-parquet,dataset,revision", + "dataset-presidio-entities-count,dataset,revision", "dataset-size,dataset,revision", "dataset-compatible-libraries,dataset,revision", "dataset-modalities,dataset,revision", @@ -177,6 +180,7 @@ def test_plan_job_creation_and_termination() -> None: "dataset-modalities,dataset,revision", "dataset-opt-in-out-urls-count,dataset,revision", "dataset-parquet,dataset,revision", + "dataset-presidio-entities-count,dataset,revision", "dataset-size,dataset,revision", "dataset-split-names,dataset,revision", "dataset-croissant-crumbs,dataset,revision", @@ -194,6 +198,7 @@ def test_plan_job_creation_and_termination() -> None: "dataset-is-valid,dataset,revision", "dataset-opt-in-out-urls-count,dataset,revision", "dataset-parquet,dataset,revision", + "dataset-presidio-entities-count,dataset,revision", "dataset-size,dataset,revision", "dataset-compatible-libraries,dataset,revision", "dataset-modalities,dataset,revision", diff --git a/libs/libcommon/tests/test_processing_graph.py b/libs/libcommon/tests/test_processing_graph.py index 7aa7bd6665..ac9b729b48 100644 --- a/libs/libcommon/tests/test_processing_graph.py +++ b/libs/libcommon/tests/test_processing_graph.py @@ -90,7 +90,9 @@ def test_graph() -> None: ), ( "dataset-split-names", - [], + [ + "dataset-presidio-entities-count", + ], [ "dataset-config-names", "config-split-names", @@ -273,7 
+275,7 @@ def test_graph() -> None: ), ( "split-presidio-scan", - [], + ["dataset-presidio-entities-count"], ["config-parquet-metadata"], [ "config-parquet", @@ -282,6 +284,21 @@ def test_graph() -> None: "dataset-config-names", ], ), + ( + "dataset-presidio-entities-count", + [], + ["dataset-split-names", "split-presidio-scan"], + [ + "config-info", + "config-parquet", + "config-parquet-and-info", + "config-parquet-metadata", + "config-split-names", + "dataset-config-names", + "dataset-split-names", + "split-presidio-scan", + ], + ), ( "split-duckdb-index", ["config-duckdb-index-size", "split-is-valid"], diff --git a/services/api/src/api/config.py b/services/api/src/api/config.py index 1a22087831..f8d37f403d 100644 --- a/services/api/src/api/config.py +++ b/services/api/src/api/config.py @@ -82,6 +82,9 @@ class EndpointConfig: "config": "config-opt-in-out-urls-count", "split": "split-opt-in-out-urls-count", }, + "/presidio-entities": { + "dataset": "dataset-presidio-entities-count", + }, "/is-valid": { "dataset": "dataset-is-valid", "config": "config-is-valid", diff --git a/services/worker/src/worker/dtos.py b/services/worker/src/worker/dtos.py index 2768388e48..d608069ac4 100644 --- a/services/worker/src/worker/dtos.py +++ b/services/worker/src/worker/dtos.py @@ -78,7 +78,7 @@ class PresidioEntity(TypedDict): column_name: str -class PresidioEntitiesCountResponse(TypedDict): +class PresidioAllEntitiesCountResponse(TypedDict): scanned_columns: list[str] num_in_vehicle_registration_entities: int num_organization_entities: int @@ -145,10 +145,21 @@ class PresidioEntitiesCountResponse(TypedDict): full_scan: Union[bool, None] -class PresidioEntitiesScanResponse(PresidioEntitiesCountResponse): +class PresidioEntitiesScanResponse(PresidioAllEntitiesCountResponse): entities: list[PresidioEntity] +class PresidioEntitiesCountResponse(TypedDict): + scanned_columns: list[str] + num_rows_with_person_entities: int + num_rows_with_phone_number_entities: int + 
num_rows_with_email_address_entities: int + num_rows_with_sensitive_pii: int + num_scanned_rows: int + has_scanned_columns: bool + full_scan: Union[bool, None] + + class ImageUrlColumnsResponse(TypedDict): columns: list[str] diff --git a/services/worker/src/worker/job_runner_factory.py b/services/worker/src/worker/job_runner_factory.py index dc7da63acf..4da653d4ca 100644 --- a/services/worker/src/worker/job_runner_factory.py +++ b/services/worker/src/worker/job_runner_factory.py @@ -38,6 +38,7 @@ DatasetOptInOutUrlsCountJobRunner, ) from worker.job_runners.dataset.parquet import DatasetParquetJobRunner +from worker.job_runners.dataset.presidio_entities_count import DatasetPresidioEntitiesCountJobRunner from worker.job_runners.dataset.size import DatasetSizeJobRunner from worker.job_runners.dataset.split_names import DatasetSplitNamesJobRunner from worker.job_runners.split.descriptive_statistics import ( @@ -199,6 +200,11 @@ def _create_job_runner(self, job_info: JobInfo) -> JobRunner: app_config=self.app_config, hf_datasets_cache=self.hf_datasets_cache, ) + if job_type == DatasetPresidioEntitiesCountJobRunner.get_job_type(): + return DatasetPresidioEntitiesCountJobRunner( + job_info=job_info, + app_config=self.app_config, + ) if job_type == SplitDescriptiveStatisticsJobRunner.get_job_type(): return SplitDescriptiveStatisticsJobRunner( job_info=job_info, @@ -264,6 +270,7 @@ def _create_job_runner(self, job_info: JobInfo) -> JobRunner: ConfigOptInOutUrlsCountJobRunner.get_job_type(), DatasetOptInOutUrlsCountJobRunner.get_job_type(), SplitPresidioEntitiesScanJobRunner.get_job_type(), + DatasetPresidioEntitiesCountJobRunner.get_job_type(), SplitDuckDbIndexJobRunner.get_job_type(), SplitDescriptiveStatisticsJobRunner.get_job_type(), ConfigDuckdbIndexSizeJobRunner.get_job_type(), diff --git a/services/worker/src/worker/job_runners/dataset/presidio_entities_count.py b/services/worker/src/worker/job_runners/dataset/presidio_entities_count.py new file mode 100644 index 
# SPDX-License-Identifier: Apache-2.0
# Copyright 2024 The HuggingFace Authors.

import logging
from http import HTTPStatus

from libcommon.exceptions import PreviousStepFormatError
from libcommon.simple_cache import (
    CachedArtifactNotFoundError,
    get_previous_step_or_raise,
    get_response,
)

from worker.dtos import JobResult, PresidioEntitiesCountResponse
from worker.job_runners.dataset.dataset_job_runner import DatasetJobRunner

# Split-level row counters that are summed one-to-one into the dataset-level response.
_DIRECT_ROW_COUNT_KEYS = (
    "num_rows_with_person_entities",
    "num_rows_with_phone_number_entities",
    "num_rows_with_email_address_entities",
)

# Split-level row counters that are folded together into the aggregated
# "num_rows_with_sensitive_pii" counter.
_SENSITIVE_PII_ROW_COUNT_KEYS = (
    "num_rows_with_credit_card_entities",
    "num_rows_with_us_ssn_entities",
    "num_rows_with_us_passport_entities",
    "num_rows_with_iban_code_entities",
)


def compute_presidio_entities_count_response(dataset: str) -> tuple[PresidioEntitiesCountResponse, float]:
    """Aggregate the cached 'split-presidio-scan' responses of all splits of a dataset.

    Args:
        dataset: the dataset name.

    Returns:
        A tuple of:
        - the aggregated counts (scanned columns, per-entity row counts, scanned rows),
        - the progress, i.e. the fraction of splits whose 'split-presidio-scan'
          response is already available in the cache (1.0 if the dataset has no split).

    Raises:
        libcommon.simple_cache.CachedArtifactError: if the 'dataset-split-names'
            response is an error.
        libcommon.exceptions.PreviousStepFormatError: if a previous step response
            does not have the expected format.
    """
    logging.info(f"compute 'dataset-presidio-entities-count' for {dataset=}")

    split_names_response = get_previous_step_or_raise(kind="dataset-split-names", dataset=dataset)
    content = split_names_response["content"]
    if "splits" not in content:
        raise PreviousStepFormatError("Previous step did not return the expected content: 'splits'.")

    scanned_columns: set[str] = set()
    presidio_entities_count_response = PresidioEntitiesCountResponse(
        {
            "scanned_columns": [],
            "num_rows_with_person_entities": 0,
            "num_rows_with_phone_number_entities": 0,
            "num_rows_with_email_address_entities": 0,
            "num_rows_with_sensitive_pii": 0,
            "num_scanned_rows": 0,
            "has_scanned_columns": False,
            # stays True unless at least one successful split scan was partial
            "full_scan": True,
        }
    )
    try:
        total = 0  # number of splits in the dataset
        pending = 0  # splits without a cached 'split-presidio-scan' response yet
        for split_item in content["splits"]:
            config = split_item["config"]
            split = split_item["split"]
            total += 1
            try:
                response = get_response(kind="split-presidio-scan", dataset=dataset, config=config, split=split)
            except CachedArtifactNotFoundError:
                logging.debug("No response found in previous step for this dataset: 'split-presidio-scan'.")
                pending += 1
                continue
            if response["http_status"] != HTTPStatus.OK:
                # NOTE(review): failed splits are neither counted as pending nor do
                # they reset "full_scan", so "full_scan" may overstate coverage —
                # confirm this matches the opt-in-out-urls-count convention.
                logging.debug(f"Previous step gave an error: {response['http_status']}.")
                continue
            split_presidio_scan_content = response["content"]
            scanned_columns.update(split_presidio_scan_content["scanned_columns"])
            if not split_presidio_scan_content["full_scan"]:
                presidio_entities_count_response["full_scan"] = False
            for key in _DIRECT_ROW_COUNT_KEYS:
                presidio_entities_count_response[key] += split_presidio_scan_content[key]  # type: ignore[literal-required]
            # NOTE(review): a row holding several sensitive entity types is counted
            # once per type here, so this is an upper bound on the number of rows.
            for key in _SENSITIVE_PII_ROW_COUNT_KEYS:
                presidio_entities_count_response["num_rows_with_sensitive_pii"] += split_presidio_scan_content[key]
            presidio_entities_count_response["num_scanned_rows"] += split_presidio_scan_content["num_scanned_rows"]
    except Exception as e:
        raise PreviousStepFormatError("Previous step did not return the expected content.", e) from e

    presidio_entities_count_response["scanned_columns"] = sorted(scanned_columns)
    presidio_entities_count_response["has_scanned_columns"] = (
        len(presidio_entities_count_response["scanned_columns"]) > 0
    )
    progress = (total - pending) / total if total else 1.0

    return (presidio_entities_count_response, progress)


class DatasetPresidioEntitiesCountJobRunner(DatasetJobRunner):
    """Dataset-level job runner aggregating the per-split Presidio scan results."""

    @staticmethod
    def get_job_type() -> str:
        return "dataset-presidio-entities-count"

    def compute(self) -> JobResult:
        # Progress < 1.0 signals that some split scans are still pending in the cache.
        response_content, progress = compute_presidio_entities_count_response(dataset=self.dataset)
        return JobResult(response_content, progress=progress)
# SPDX-License-Identifier: Apache-2.0
# Copyright 2024 The HuggingFace Authors.

from collections.abc import Callable
from http import HTTPStatus
from typing import Any, cast

import pytest
from libcommon.dtos import Priority
from libcommon.resources import CacheMongoResource, QueueMongoResource
from libcommon.simple_cache import CachedArtifactNotFoundError, upsert_response

from worker.config import AppConfig
from worker.dtos import PresidioEntitiesScanResponse
from worker.job_runners.dataset.presidio_entities_count import (
    DatasetPresidioEntitiesCountJobRunner,
)

from ..utils import REVISION_NAME


@pytest.fixture(autouse=True)
def prepare_and_clean_mongo(app_config: AppConfig) -> None:
    # prepare the database before each test, and clean it afterwards
    pass


GetJobRunner = Callable[[str, AppConfig], DatasetPresidioEntitiesCountJobRunner]


@pytest.fixture
def get_job_runner(
    cache_mongo_resource: CacheMongoResource,
    queue_mongo_resource: QueueMongoResource,
) -> GetJobRunner:
    """Build a DatasetPresidioEntitiesCountJobRunner for a given dataset name."""

    def _get_job_runner(
        dataset: str,
        app_config: AppConfig,
    ) -> DatasetPresidioEntitiesCountJobRunner:
        return DatasetPresidioEntitiesCountJobRunner(
            job_info={
                "type": DatasetPresidioEntitiesCountJobRunner.get_job_type(),
                "params": {
                    "dataset": dataset,
                    "revision": REVISION_NAME,
                    "config": None,
                    "split": None,
                },
                "job_id": "job_id",
                "priority": Priority.NORMAL,
                "difficulty": 50,
            },
            app_config=app_config,
        )

    return _get_job_runner


# All entity types reported by 'split-presidio-scan'; used to build the sample
# responses without spelling out dozens of zero-valued counters twice.
_PRESIDIO_ENTITY_TYPES = [
    "in_vehicle_registration",
    "organization",
    "sg_nric_fin",
    "person",
    "credit_card",
    "medical_license",
    "nrp",
    "us_ssn",
    "crypto",
    "date_time",
    "location",
    "us_driver_license",
    "phone_number",
    "url",
    "us_passport",
    "age",
    "au_acn",
    "email_address",
    "in_pan",
    "ip_address",
    "id",
    "us_bank_number",
    "in_aadhaar",
    "us_itin",
    "au_medicare",
    "iban_code",
    "au_tfn",
    "uk_nhs",
    "email",
    "au_abn",
]


def _build_scan_response(
    num_scanned_rows: int,
    full_scan: bool,
    entities: list[dict[str, Any]],
    **overrides: int,
) -> PresidioEntitiesScanResponse:
    """Build a 'split-presidio-scan' sample response: every counter is zero
    except the ones passed as keyword overrides."""
    content: dict[str, Any] = {"scanned_columns": ["col"]}
    for entity_type in _PRESIDIO_ENTITY_TYPES:
        content[f"num_{entity_type}_entities"] = 0
        content[f"num_rows_with_{entity_type}_entities"] = 0
    content.update(overrides)
    content["num_scanned_rows"] = num_scanned_rows
    content["has_scanned_columns"] = True
    content["full_scan"] = full_scan
    content["entities"] = entities
    return cast(PresidioEntitiesScanResponse, content)


SAMPLE_RESPONSE = _build_scan_response(
    num_scanned_rows=6,
    full_scan=True,
    entities=[
        {"column_name": "col", "row_idx": 0, "text": "Gi****** Gi*****", "type": "PERSON"},
        {"column_name": "col", "row_idx": 1, "text": "Gi*****", "type": "PERSON"},
        {"column_name": "col", "row_idx": 2, "text": "19*.***.*.*", "type": "IP_ADDRESS"},
        {"column_name": "col", "row_idx": 3, "text": "34*-**-****", "type": "US_SSN"},
        {
            "column_name": "col",
            "row_idx": 4,
            "text": "gi******.*******@********.***",
            "type": "EMAIL_ADDRESS",
        },
    ],
    num_person_entities=2,
    num_us_ssn_entities=1,
    num_email_address_entities=1,
    num_ip_address_entities=1,
    num_rows_with_person_entities=2,
    num_rows_with_us_ssn_entities=1,
    num_rows_with_email_address_entities=1,
    num_rows_with_ip_address_entities=1,
)

SAMPLE_RESPONSE_NOT_FULL_SCAN = _build_scan_response(
    num_scanned_rows=3,
    full_scan=False,
    entities=[
        {"column_name": "col", "row_idx": 0, "text": "Gi****** Gi*****", "type": "PERSON"},
        {"column_name": "col", "row_idx": 1, "text": "Gi*****", "type": "PERSON"},
        {"column_name": "col", "row_idx": 2, "text": "19*.***.*.*", "type": "IP_ADDRESS"},
    ],
    num_person_entities=2,
    num_ip_address_entities=1,
    num_rows_with_person_entities=2,
    num_rows_with_ip_address_entities=1,
)


@pytest.mark.parametrize(
    "dataset,split_names_status,split_names_content,split_upstream_status"
    + ",split_upstream_content,expected_error_code,expected_content,should_raise",
    [
        (
            "dataset_ok_full_scan",
            HTTPStatus.OK,
            {
                "splits": [
                    {"dataset": "dataset_ok_full_scan", "config": "config1", "split": "split1"},
                    {"dataset": "dataset_ok_full_scan", "config": "config1", "split": "split2"},
                    {"dataset": "dataset_ok_full_scan", "config": "config2", "split": "split3"},
                ]
            },
            [HTTPStatus.OK] * 3,
            [SAMPLE_RESPONSE] * 3,
            None,
            {
                "scanned_columns": SAMPLE_RESPONSE["scanned_columns"],
                "num_rows_with_person_entities": SAMPLE_RESPONSE["num_rows_with_person_entities"] * 3,
                "num_rows_with_phone_number_entities": SAMPLE_RESPONSE["num_rows_with_phone_number_entities"] * 3,
                "num_rows_with_email_address_entities": SAMPLE_RESPONSE["num_rows_with_email_address_entities"] * 3,
                "num_rows_with_sensitive_pii": (
                    SAMPLE_RESPONSE["num_rows_with_credit_card_entities"]
                    + SAMPLE_RESPONSE["num_rows_with_us_ssn_entities"]
                    + SAMPLE_RESPONSE["num_rows_with_us_passport_entities"]
                    + SAMPLE_RESPONSE["num_rows_with_iban_code_entities"]
                )
                * 3,
                "num_scanned_rows": SAMPLE_RESPONSE["num_scanned_rows"] * 3,
                "has_scanned_columns": True,
                "full_scan": True,
            },
            False,
        ),
        (
            "dataset_ok_not_full_scan",
            HTTPStatus.OK,
            {
                "splits": [
                    {"dataset": "dataset_ok_not_full_scan", "config": "config1", "split": "split1"},
                    {"dataset": "dataset_ok_not_full_scan", "config": "config1", "split": "split2"},
                    {"dataset": "dataset_ok_not_full_scan", "config": "config2", "split": "split3"},
                ]
            },
            [HTTPStatus.OK] * 3,
            [SAMPLE_RESPONSE_NOT_FULL_SCAN] * 3,
            None,
            {
                "scanned_columns": SAMPLE_RESPONSE_NOT_FULL_SCAN["scanned_columns"],
                "num_rows_with_person_entities": SAMPLE_RESPONSE_NOT_FULL_SCAN["num_rows_with_person_entities"] * 3,
                "num_rows_with_phone_number_entities": SAMPLE_RESPONSE_NOT_FULL_SCAN[
                    "num_rows_with_phone_number_entities"
                ]
                * 3,
                "num_rows_with_email_address_entities": SAMPLE_RESPONSE_NOT_FULL_SCAN[
                    "num_rows_with_email_address_entities"
                ]
                * 3,
                "num_rows_with_sensitive_pii": (
                    SAMPLE_RESPONSE_NOT_FULL_SCAN["num_rows_with_credit_card_entities"]
                    + SAMPLE_RESPONSE_NOT_FULL_SCAN["num_rows_with_us_ssn_entities"]
                    + SAMPLE_RESPONSE_NOT_FULL_SCAN["num_rows_with_us_passport_entities"]
                    + SAMPLE_RESPONSE_NOT_FULL_SCAN["num_rows_with_iban_code_entities"]
                )
                * 3,
                "num_scanned_rows": SAMPLE_RESPONSE_NOT_FULL_SCAN["num_scanned_rows"] * 3,
                "has_scanned_columns": True,
                "full_scan": False,
            },
            False,
        ),
        (
            "previous_step_error",
            HTTPStatus.INTERNAL_SERVER_ERROR,
            {},
            [],
            [],
            "CachedArtifactError",
            None,
            True,
        ),
        (
            "previous_step_format_error",
            HTTPStatus.OK,
            {
                "splits": [
                    {"dataset": "previous_step_format_error", "config": "config1", "split": "split1"},
                    {"dataset": "previous_step_format_error", "config": "config1", "split": "split2"},
                    {"dataset": "previous_step_format_error", "config": "config2", "split": "split3"},
                ]
            },
            [HTTPStatus.OK],
            [{"wrong_format": None}],
            "PreviousStepFormatError",
            None,
            True,
        ),
    ],
)
def test_compute(
    app_config: AppConfig,
    get_job_runner: GetJobRunner,
    dataset: str,
    split_names_status: HTTPStatus,
    split_names_content: Any,
    split_upstream_status: list[HTTPStatus],
    split_upstream_content: list[Any],
    expected_error_code: str,
    expected_content: Any,
    should_raise: bool,
) -> None:
    """Check that the dataset-level counts aggregate the split-level scans."""
    upsert_response(
        kind="dataset-split-names",
        dataset=dataset,
        dataset_git_revision=REVISION_NAME,
        content=split_names_content,
        http_status=split_names_status,
    )

    if split_names_status == HTTPStatus.OK:
        # only the first len(split_upstream_status) splits get a cached scan response
        for split_item, status, content in zip(
            split_names_content["splits"], split_upstream_status, split_upstream_content
        ):
            upsert_response(
                kind="split-presidio-scan",
                dataset=dataset,
                dataset_git_revision=REVISION_NAME,
                config=split_item["config"],
                split=split_item["split"],
                content=content,
                http_status=status,
            )

    job_runner = get_job_runner(dataset, app_config)
    if should_raise:
        with pytest.raises(Exception) as e:
            job_runner.compute()
        assert e.typename == expected_error_code
    else:
        assert job_runner.compute().content == expected_content


def test_doesnotexist(app_config: AppConfig, get_job_runner: GetJobRunner) -> None:
    """A dataset with no 'dataset-split-names' cache entry raises."""
    dataset = "doesnotexist"
    job_runner = get_job_runner(dataset, app_config)
    with pytest.raises(CachedArtifactNotFoundError):
        job_runner.compute()