
Add integration test for the combinations of input/output datatypes (#352)

* Add integration test for the combinations of input/output datatypes

* Run pre-commit
mallport authored Mar 6, 2024
1 parent 9bc51ab commit 1062213
Showing 11 changed files with 192 additions and 181 deletions.
2 changes: 1 addition & 1 deletion src/dapla_pseudo/v1/supported_file_format.py
@@ -88,7 +88,7 @@ def write_from_df(
case SupportedOutputFileFormat.CSV:
df.write_csv(file=file_like, **kwargs)
case SupportedOutputFileFormat.JSON:
df.write_json(file=file_like, **kwargs)
df.write_json(file=file_like, row_oriented=True, **kwargs)
case SupportedOutputFileFormat.XML:
df.to_pandas().to_xml(file_like, **kwargs)
case SupportedOutputFileFormat.PARQUET:
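For context, a minimal sketch of what the row_oriented flag changes (assuming a pre-1.0 Polars release where DataFrame.write_json still accepts it): the default output is column-oriented, while row_oriented=True emits a list of records, which is why the test fixture below changes shape.

import polars as pl

# Sketch only, assuming Polars < 1.0 where write_json accepts row_oriented.
df = pl.DataFrame({"fnr": ["123"], "fornavn": ["Donald"]})

col_json = df.write_json()                   # {"columns": [{"name": "fnr", ...}, ...]}
row_json = df.write_json(row_oriented=True)  # [{"fnr": "123", "fornavn": "Donald"}]
print(row_json)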

This file was deleted.

@@ -0,0 +1,29 @@
{
  "document_version": "0.0.1",
  "datadoc": null,
  "pseudonymization": {
    "document_version": "0.1.0",
    "pseudo_dataset": null,
    "pseudo_variables": [
      {
        "short_name": "fnr",
        "data_element_path": "fnr",
        "data_element_pattern": "**",
        "stable_identifier_type": null,
        "stable_identifier_version": null,
        "encryption_algorithm": "TINK-FPE",
        "encryption_key_reference": "papis-common-key-1",
        "encryption_algorithm_parameters": [
          {
            "keyId": "papis-common-key-1"
          },
          {
            "strategy": "skip"
          }
        ],
        "source_variable": null,
        "source_variable_datatype": null
      }
    ]
  }
}
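The fixture above describes a single pseudonymized variable (fnr, TINK-FPE with the papis-common-key-1 key reference). A minimal sketch of loading and inspecting such a fixture with the standard library only; the file path is hypothetical and used purely for illustration:

import json
from pathlib import Path

# Hypothetical path; the real fixture name is not shown in this diff.
doc = json.loads(Path("tests/data/expected_metadata.json").read_text())
for var in doc["pseudonymization"]["pseudo_variables"]:
    print(var["short_name"], var["encryption_algorithm"], var["encryption_key_reference"])
# -> fnr TINK-FPE papis-common-key-1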
File renamed without changes.
77 changes: 23 additions & 54 deletions tests/data/personer_pseudonymized_default_encryption.json
@@ -1,54 +1,23 @@
{
"columns": [
{
"name": "fnr",
"datatype": "String",
"bit_settings": "",
"values": [
"AWIRfKLSNfR0ID+wBzogEcUT7JQPayk7Gosij6SXr8s=",
"AWIRfKKLagk0LqYCKpiC4xfPkHqIWGVfc3wg5gUwRNE=",
"AWIRfKIzL1T9iZqt+pLjNbHMsLa0aKSszsRrLiLSAAg="
]
},
{
"name": "fornavn",
"datatype": "String",
"bit_settings": "",
"values": [
"AWIRfKKWWRC1hURqsYw4S/h/NitvuP6bO/R7",
"AWIRfKJuYBaBQIXIprRO9UFDXNLd4YXcHtY=",
"AWIRfKKsIDQgWLnpsSln38z1RSfHjjL8FS4="
]
},
{
"name": "etternavn",
"datatype": "String",
"bit_settings": "",
"values": [
"AWIRfKIKAGiRoGTd/Cid5gxsIDx4H1ya6w==",
"AWIRfKLzOfzOw+Bdo9zIa4savOOeAiEr",
"AWIRfKIKAGiRoGTd/Cid5gxsIDx4H1ya6w=="
]
},
{
"name": "kjonn",
"datatype": "String",
"bit_settings": "",
"values": [
"AWIRfKJuDy4LnWA7y/9fGHhJg3hZ0Q==",
"AWIRfKJuDy4LnWA7y/9fGHhJg3hZ0Q==",
"AWIRfKJuDy4LnWA7y/9fGHhJg3hZ0Q=="
]
},
{
"name": "fodselsdato",
"datatype": "String",
"bit_settings": "",
"values": [
"AWIRfKJ0wod0IfL/dSrzF7pPLIgDSyNCofor",
"AWIRfKJNjb7vLdKwCLHuJPIQUUqdP8/nDYU0",
"AWIRfKKqsp5+sjjRDpFwUfMdo16j6URwkFuE"
]
}
]
}
[
{
"fnr": "AWIRfKLSNfR0ID+wBzogEcUT7JQPayk7Gosij6SXr8s=",
"fornavn": "Donald",
"etternavn": "Duck",
"kjonn": "M",
"fodselsdato": "020995"
},
{
"fnr": "AWIRfKKLagk0LqYCKpiC4xfPkHqIWGVfc3wg5gUwRNE=",
"fornavn": "Mikke",
"etternavn": "Mus",
"kjonn": "M",
"fodselsdato": "060970"
},
{
"fnr": "AWIRfKIzL1T9iZqt+pLjNbHMsLa0aKSszsRrLiLSAAg=",
"fornavn": "Anton",
"etternavn": "Duck",
"kjonn": "M",
"fodselsdato": "180999"
}
]
@@ -1,20 +1,20 @@
[
{
"fnr": "AQ24fCAIRx0TiPVD1awZchhvbvm7VzViIGThGdU99iY=",
"fnr": "KsJu12NHdoZ",
"fornavn": "Donald",
"etternavn": "Duck",
"kjonn": "M",
"fodselsdato": "020995"
},
{
"fnr": "AQ24fCDfDoLR6AZpcsRyLUvh2jzbYfFUYRkcUb4eUoo=",
"fnr": "QqaTeUtXvjk",
"fornavn": "Mikke",
"etternavn": "Mus",
"kjonn": "M",
"fodselsdato": "060970"
},
{
"fnr": "AQ24fCBTARXudzT6ZXBdwUvvhnhg5DAHUXbXy2Cogy8=",
"fnr": "rAHb6rOyFtA",
"fornavn": "Anton",
"etternavn": "Duck",
"kjonn": "M",
2 changes: 1 addition & 1 deletion tests/integration/test_integration_deseudonymize.py
@@ -17,7 +17,7 @@ def test_depseudonymize_default_encryption(
) -> None:
result = (
Depseudonymize.from_polars(df_personer_fnr_daead_encrypted)
.on_fields("fnr", "fornavn", "etternavn", "kjonn", "fodselsdato")
.on_fields("fnr")
.with_default_encryption()
.run()
.to_polars()
52 changes: 23 additions & 29 deletions tests/integration/test_integration_pseudonymize.py
@@ -1,10 +1,18 @@
import inspect
import json
import typing as t
from collections.abc import Generator
from functools import partial
from pathlib import Path
from unittest.mock import ANY

import pandas as pd
import polars as pl
import pytest

from dapla_pseudo import Pseudonymize
from tests.integration.utils import df_personer
from tests.integration.utils import df_personer_fnr_daead_encrypted
from tests.integration.utils import get_calling_function_name
from tests.integration.utils import get_expected_datadoc_metadata_container
from tests.integration.utils import integration_test
@@ -41,46 +49,32 @@ def test_pseudonymize_default_encryption(


@integration_test()
def test_pseudonymize_default_encryption_all_fields(
def test_pseudonymize_papis_compatible_encryption(
setup: Generator[None, None, None], df_personer: pl.DataFrame
) -> None:
expected_result_df = pl.read_json(
"tests/data/personer_pseudonymized_default_encryption.json"
expected_result_fnr_df = pl.DataFrame(
{
"fnr": [
"KsJu12NHdoZ",
"QqaTeUtXvjk",
"rAHb6rOyFtA",
]
}
)
expected_result_df = df_personer.clone().update(expected_result_fnr_df)
result = (
Pseudonymize.from_polars(df_personer)
.on_fields(*df_personer.columns)
.with_default_encryption()
.on_fields("fnr")
.with_papis_compatible_encryption()
.run()
)
current_function_name = get_calling_function_name()
expected_metadata_container = get_expected_datadoc_metadata_container(
current_function_name
)
# When comparing the expected metadata with the metadata generated by the API,
# we need the pseudo_variables to be in the same order. This is not guaranteed by the API,
# so `pseudo_variables` are sorted before we make the comparison.
# In the context of this test we can guarantee that the `short_name` is unique, which allows us to sort by it.
if (result._datadoc.pseudonymization is not None) and (
expected_metadata_container.pseudonymization is not None
):
result._datadoc.pseudonymization.pseudo_variables = sorted(
result._datadoc.pseudonymization.pseudo_variables, # type: ignore
key=lambda pseudo_var: (
pseudo_var.short_name if isinstance(pseudo_var.short_name, str) else ""
),
)
expected_metadata_container.pseudonymization.pseudo_variables = sorted(
expected_metadata_container.pseudonymization.pseudo_variables, # type: ignore
key=lambda pseudo_var: (
pseudo_var.short_name if isinstance(pseudo_var.short_name, str) else ""
),
)

assert result._datadoc == expected_metadata_container
assert result.to_polars().equals(expected_result_df)
else:
raise AssertionError("MetadataContainer's pseudonymization object is None")

assert result.datadoc == expected_metadata_container.model_dump_json()
assert result.to_polars().equals(expected_result_df)


@integration_test()
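A side note on how the expected frame is built in the test above: Polars DataFrame.update overwrites the overlapping column (fnr) row by row and leaves the remaining columns untouched. A small sketch, assuming standard Polars update semantics:

import polars as pl

# update() replaces values in matching columns and keeps the rest unchanged.
df = pl.DataFrame({"fnr": ["11111111111"], "fornavn": ["Donald"]})
pseudo_fnr = pl.DataFrame({"fnr": ["KsJu12NHdoZ"]})

expected = df.clone().update(pseudo_fnr)
print(expected)  # fnr is now "KsJu12NHdoZ"; fornavn is still "Donald"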
70 changes: 70 additions & 0 deletions tests/integration/test_integration_result.py
@@ -0,0 +1,70 @@
import json
import typing as t
from collections.abc import Generator
from pathlib import Path

import pandas as pd
import polars as pl
import pytest

from dapla_pseudo import Pseudonymize
from tests.integration.utils import df_pandas_personer_fnr_daead_encrypted
from tests.integration.utils import df_personer
from tests.integration.utils import df_personer_fnr_daead_encrypted
from tests.integration.utils import df_personer_pandas
from tests.integration.utils import integration_test
from tests.integration.utils import personer_file_path
from tests.integration.utils import setup


@pytest.mark.parametrize(
    "output_func",
    [("file"), ("pandas"), ("polars")],
)
@pytest.mark.parametrize(
    "input_func",
    [("file"), ("pandas"), ("polars")],
)
@integration_test()
def test_pseudonymize_input_output_funcs(
    setup: Generator[None, None, None],
    input_func: t.Literal["file", "pandas", "polars"],
    output_func: t.Literal["file", "pandas", "polars"],
    tmp_path: Path,
    personer_file_path: str,
    df_personer_pandas: pd.DataFrame,
    df_personer: pl.DataFrame,
    df_personer_fnr_daead_encrypted: pl.DataFrame,
    df_pandas_personer_fnr_daead_encrypted: pd.DataFrame,
) -> None:
    """This test runs several times, once for every combination of the possible input and output datatypes.
    It is intended as an end-to-end test of the conversion between data types, e.g. Polars DataFrame -> File.
    """
    match input_func:
        case "file":
            pseudonymizer = Pseudonymize.from_file(personer_file_path)
        case "pandas":
            pseudonymizer = Pseudonymize.from_pandas(df_personer_pandas)
        case "polars":
            pseudonymizer = Pseudonymize.from_polars(df_personer)

    result = pseudonymizer.on_fields("fnr").with_default_encryption().run()

    match output_func:
        case "file":
            file_path = tmp_path / "personer_pseudo.json"
            result.to_file(str(file_path))

            expected = json.loads(
                open("tests/data/personer_pseudonymized_default_encryption.json").read()
            )
            actual = json.loads(file_path.open().read())

            assert expected == actual
        case "pandas":
            df_pandas = result.to_pandas()
            assert df_pandas_personer_fnr_daead_encrypted.equals(df_pandas)
        case "polars":
            df_polars = result.to_polars()
            assert df_personer_fnr_daead_encrypted.equals(df_polars)
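Because the two parametrize decorators are stacked, pytest runs the test for the full cross product of input and output kinds (3 × 3 = 9 cases). A minimal standalone sketch of the same pattern:

import pytest

# Stacked parametrize decorators yield the Cartesian product: 9 test runs
# with ids such as "polars-file".
@pytest.mark.parametrize("output_kind", ["file", "pandas", "polars"])
@pytest.mark.parametrize("input_kind", ["file", "pandas", "polars"])
def test_combo(input_kind: str, output_kind: str) -> None:
    assert {input_kind, output_kind} <= {"file", "pandas", "polars"}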