diff --git a/src/dapla_pseudo/v1/depseudo.py b/src/dapla_pseudo/v1/depseudo.py
index 7c219b70..022506ca 100644
--- a/src/dapla_pseudo/v1/depseudo.py
+++ b/src/dapla_pseudo/v1/depseudo.py
@@ -4,6 +4,7 @@
 import typing as t
 from concurrent.futures import ThreadPoolExecutor
 from concurrent.futures import as_completed
+from datetime import date
 from typing import Optional
 
 import pandas as pd
@@ -14,10 +15,12 @@
 from dapla_pseudo.constants import PredefinedKeys
 from dapla_pseudo.constants import PseudoFunctionTypes
 from dapla_pseudo.types import FileLikeDatasetDecl
+from dapla_pseudo.utils import convert_to_date
 from dapla_pseudo.v1.api_models import DaeadKeywordArgs
 from dapla_pseudo.v1.api_models import DepseudonymizeFileRequest
 from dapla_pseudo.v1.api_models import FF31KeywordArgs
 from dapla_pseudo.v1.api_models import KeyWrapper
+from dapla_pseudo.v1.api_models import MapSidKeywordArgs
 from dapla_pseudo.v1.api_models import Mimetypes
 from dapla_pseudo.v1.api_models import PseudoConfig
 from dapla_pseudo.v1.api_models import PseudoFunction
@@ -231,6 +234,39 @@ def __init__(
         self._fields = fields
         self._existing_rules = [] if rules is None else rules
 
+    def with_stable_id(
+        self,
+        sid_snapshot_date: Optional[str | date] = None,
+        custom_key: Optional[str] = None,
+    ) -> "Depseudonymize._Depseudonymizer":
+        """Depseudonymize the selected fields with the stable-ID mapping function (MAP_SID).
+
+        1) Decrypt stable-id
+        2) Then map decrypted stable-id to fnr and return original fnr.
+
+        Args:
+            sid_snapshot_date (Optional[str | date], optional): Date representing SID-catalogue version to use.
+                Latest if unspecified. Format: YYYY-MM-DD
+            custom_key (Optional[str], optional): Override the key to use for depseudonymization.
+                Must be one of the keys defined in PredefinedKeys.
+                If not defined, uses the default key for this function (papis-common-key-1)
+
+        Returns:
+            Self: The object configured to be mapped to fnr
+        """
+        snapshot_date = convert_to_date(sid_snapshot_date)
+        # Only pass key_id when a custom key is given; otherwise the keyword
+        # arguments are built without it, leaving the key to the model's default.
+        kwargs = (
+            MapSidKeywordArgs(key_id=custom_key, snapshot_date=snapshot_date)
+            if custom_key
+            else MapSidKeywordArgs(snapshot_date=snapshot_date)
+        )
+        function = PseudoFunction(
+            function_type=PseudoFunctionTypes.MAP_SID, kwargs=kwargs
+        )
+        return self._rule_constructor(function)
+
     def with_default_encryption(
         self, custom_key: Optional[PredefinedKeys | str] = None
     ) -> "Depseudonymize._Depseudonymizer":
diff --git a/tests/conftest.py b/tests/conftest.py
index b5126420..9a28344f 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -70,6 +70,38 @@ def df_personer_fnr_daead_encrypted() -> pl.DataFrame:
     )
 
 
+@pytest.fixture
+def df_personer_pseudo_stable_id() -> pl.DataFrame:
+    """Person records read from person_3_sid_deid.json; input to the stable-ID depseudonymize test."""
+    JSON_FILE = "tests/data/person_3_sid_deid.json"
+    return pl.read_json(
+        JSON_FILE,
+        schema={
+            "fnr": pl.String,
+            "fornavn": pl.String,
+            "etternavn": pl.String,
+            "kjonn": pl.String,
+            "fodselsdato": pl.String,
+        },
+    )
+
+
+@pytest.fixture
+def df_personer_depseudo_stable_id() -> pl.DataFrame:
+    """Person records read from person_3_sid.json; expected result of the stable-ID depseudonymize test."""
+    JSON_FILE = "tests/data/person_3_sid.json"
+    return pl.read_json(
+        JSON_FILE,
+        schema={
+            "fnr": pl.String,
+            "fornavn": pl.String,
+            "etternavn": pl.String,
+            "kjonn": pl.String,
+            "fodselsdato": pl.String,
+        },
+    )
+
+
 @pytest.fixture
 def df_pandas_personer_fnr_daead_encrypted() -> pd.DataFrame:
     JSON_FILE = "tests/data/personer_pseudonymized_default_encryption.json"
diff --git a/tests/data/person_3_sid.json b/tests/data/person_3_sid.json
new file mode 100644
index 00000000..77882da5
--- /dev/null
+++ b/tests/data/person_3_sid.json
@@ -0,0 +1,23 @@
+[
+    {
+        "fnr": "11854898347",
+        "fornavn": "Mathias",
+        "etternavn": "Holm",
+        "kjonn": "M",
+        "fodselsdato": "020995"
+    },
+    {
+        "fnr": "01839899544",
+        "fornavn": "Gunnar",
+        "etternavn": "Jørgensen",
+        "kjonn": "M",
+        "fodselsdato": "060970"
+    },
+    {
+        "fnr": "02812289295",
+        "fornavn": "Kristoffer",
+        "etternavn": "Pedersen",
+        "kjonn": "M",
+        "fodselsdato": "180999"
+    }
+]
diff --git a/tests/data/person_3_sid_deid.json b/tests/data/person_3_sid_deid.json
new file mode 100644
index 00000000..ba919696
--- /dev/null
+++ b/tests/data/person_3_sid_deid.json
@@ -0,0 +1,23 @@
+[
+    {
+        "fnr": "jJuuj0i",
+        "fornavn": "Mathias",
+        "etternavn": "Holm",
+        "kjonn": "M",
+        "fodselsdato": "020995"
+    },
+    {
+        "fnr": "ylc9488",
+        "fornavn": "Gunnar",
+        "etternavn": "Jørgensen",
+        "kjonn": "M",
+        "fodselsdato": "060970"
+    },
+    {
+        "fnr": "mprMeNQ",
+        "fornavn": "Kristoffer",
+        "etternavn": "Pedersen",
+        "kjonn": "M",
+        "fodselsdato": "180999"
+    }
+]
diff --git a/tests/integration/test_integration_deseudonymize.py b/tests/integration/test_integration_deseudonymize.py
index 17a313b9..ac679be3 100644
--- a/tests/integration/test_integration_deseudonymize.py
+++ b/tests/integration/test_integration_deseudonymize.py
@@ -21,3 +21,20 @@ def test_depseudonymize_default_encryption(
         .to_polars()
     )
     assert result.equals(df_personer)
+
+
+@integration_test()
+def test_depseudonymize_sid(
+    setup: Generator[None, None, None],
+    df_personer_pseudo_stable_id: pl.DataFrame,
+    df_personer_depseudo_stable_id: pl.DataFrame,
+) -> None:
+    """with_stable_id() on the pseudonymized fnr column should yield the depseudonymized fixture."""
+    result = (
+        Depseudonymize.from_polars(df_personer_pseudo_stable_id)
+        .on_fields("fnr")
+        .with_stable_id()
+        .run()
+        .to_polars()
+    )
+    assert result.equals(df_personer_depseudo_stable_id)
diff --git a/tests/v1/test_depseudo.py b/tests/v1/test_depseudo.py
index 12a869f0..66417c99 100644
--- a/tests/v1/test_depseudo.py
+++ b/tests/v1/test_depseudo.py
@@ -116,6 +116,26 @@ def test_builder_fields_selector_multiple_fields(
     ]
 
 
+@patch(f"{PKG}.pseudonymize_operation_field")
+def test_builder_depseudo_function_selector_with_sid(
+    patch_depseudonymize_operation_field: MagicMock, df_personer: pl.DataFrame
+) -> None:
+    """with_stable_id() without arguments should request MAP_SID with default keyword arguments."""
+    mock_return_pseudonymize_operation_field(patch_depseudonymize_operation_field)
+    Depseudonymize.from_polars(df_personer).on_fields("fnr").with_stable_id().run()
+    patch_depseudonymize_operation_field.assert_called_once_with(
+        path="depseudonymize/field",
+        values=df_personer["fnr"].to_list(),
+        field_name="fnr",
+        pseudo_func=PseudoFunction(
+            function_type=PseudoFunctionTypes.MAP_SID, kwargs=MapSidKeywordArgs()
+        ),
+        timeout=TIMEOUT_DEFAULT,
+        pseudo_client=ANY,
+        keyset=None,
+    )
+
+
 @patch(f"{PKG}.pseudo_operation_file")
 def test_builder_file_default(
     patched_pseudo_operation_file: MagicMock, personer_pseudonymized_file_path: str