
Add integration test for the combinations of input/output datatypes (#352)

* Add integration test for the combinations of input/output datatypes

* Run pre-commit
mallport authored Mar 6, 2024
1 parent 9bc51ab commit 1062213
Showing 11 changed files with 192 additions and 181 deletions.
2 changes: 1 addition & 1 deletion src/dapla_pseudo/v1/supported_file_format.py
@@ -88,7 +88,7 @@ def write_from_df(
case SupportedOutputFileFormat.CSV:
df.write_csv(file=file_like, **kwargs)
case SupportedOutputFileFormat.JSON:
df.write_json(file=file_like, **kwargs)
df.write_json(file=file_like, row_oriented=True, **kwargs)
case SupportedOutputFileFormat.XML:
df.to_pandas().to_xml(file_like, **kwargs)
case SupportedOutputFileFormat.PARQUET:
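For context, a minimal sketch of what the row_oriented flag changes (assuming a pre-1.0 Polars release where DataFrame.write_json still accepts it): the default output is column-oriented, while row_oriented=True emits a list of records, which is why the test fixture below changes shape.

import polars as pl

# Sketch only, assuming Polars < 1.0 where write_json accepts row_oriented.
df = pl.DataFrame({"fnr": ["123"], "fornavn": ["Donald"]})

col_json = df.write_json()                   # {"columns": [{"name": "fnr", ...}, ...]}
row_json = df.write_json(row_oriented=True)  # [{"fnr": "123", "fornavn": "Donald"}]
print(row_json)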

This file was deleted.

@@ -0,0 +1,29 @@
{
  "document_version": "0.0.1",
  "datadoc": null,
  "pseudonymization": {
    "document_version": "0.1.0",
    "pseudo_dataset": null,
    "pseudo_variables": [
      {
        "short_name": "fnr",
        "data_element_path": "fnr",
        "data_element_pattern": "**",
        "stable_identifier_type": null,
        "stable_identifier_version": null,
        "encryption_algorithm": "TINK-FPE",
        "encryption_key_reference": "papis-common-key-1",
        "encryption_algorithm_parameters": [
          {
            "keyId": "papis-common-key-1"
          },
          {
            "strategy": "skip"
          }
        ],
        "source_variable": null,
        "source_variable_datatype": null
      }
    ]
  }
}
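The fixture above describes a single pseudonymized variable (fnr, TINK-FPE with the papis-common-key-1 key reference). A minimal sketch of loading and inspecting such a fixture with the standard library only; the file path is hypothetical and used purely for illustration:

import json
from pathlib import Path

# Hypothetical path; the real fixture name is not shown in this diff.
doc = json.loads(Path("tests/data/expected_metadata.json").read_text())
for var in doc["pseudonymization"]["pseudo_variables"]:
    print(var["short_name"], var["encryption_algorithm"], var["encryption_key_reference"])
# -> fnr TINK-FPE papis-common-key-1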
File renamed without changes.
77 changes: 23 additions & 54 deletions tests/data/personer_pseudonymized_default_encryption.json
@@ -1,54 +1,23 @@
{
"columns": [
{
"name": "fnr",
"datatype": "String",
"bit_settings": "",
"values": [
"AWIRfKLSNfR0ID+wBzogEcUT7JQPayk7Gosij6SXr8s=",
"AWIRfKKLagk0LqYCKpiC4xfPkHqIWGVfc3wg5gUwRNE=",
"AWIRfKIzL1T9iZqt+pLjNbHMsLa0aKSszsRrLiLSAAg="
]
},
{
"name": "fornavn",
"datatype": "String",
"bit_settings": "",
"values": [
"AWIRfKKWWRC1hURqsYw4S/h/NitvuP6bO/R7",
"AWIRfKJuYBaBQIXIprRO9UFDXNLd4YXcHtY=",
"AWIRfKKsIDQgWLnpsSln38z1RSfHjjL8FS4="
]
},
{
"name": "etternavn",
"datatype": "String",
"bit_settings": "",
"values": [
"AWIRfKIKAGiRoGTd/Cid5gxsIDx4H1ya6w==",
"AWIRfKLzOfzOw+Bdo9zIa4savOOeAiEr",
"AWIRfKIKAGiRoGTd/Cid5gxsIDx4H1ya6w=="
]
},
{
"name": "kjonn",
"datatype": "String",
"bit_settings": "",
"values": [
"AWIRfKJuDy4LnWA7y/9fGHhJg3hZ0Q==",
"AWIRfKJuDy4LnWA7y/9fGHhJg3hZ0Q==",
"AWIRfKJuDy4LnWA7y/9fGHhJg3hZ0Q=="
]
},
{
"name": "fodselsdato",
"datatype": "String",
"bit_settings": "",
"values": [
"AWIRfKJ0wod0IfL/dSrzF7pPLIgDSyNCofor",
"AWIRfKJNjb7vLdKwCLHuJPIQUUqdP8/nDYU0",
"AWIRfKKqsp5+sjjRDpFwUfMdo16j6URwkFuE"
]
}
]
}
[
{
"fnr": "AWIRfKLSNfR0ID+wBzogEcUT7JQPayk7Gosij6SXr8s=",
"fornavn": "Donald",
"etternavn": "Duck",
"kjonn": "M",
"fodselsdato": "020995"
},
{
"fnr": "AWIRfKKLagk0LqYCKpiC4xfPkHqIWGVfc3wg5gUwRNE=",
"fornavn": "Mikke",
"etternavn": "Mus",
"kjonn": "M",
"fodselsdato": "060970"
},
{
"fnr": "AWIRfKIzL1T9iZqt+pLjNbHMsLa0aKSszsRrLiLSAAg=",
"fornavn": "Anton",
"etternavn": "Duck",
"kjonn": "M",
"fodselsdato": "180999"
}
]
@@ -1,20 +1,20 @@
[
{
"fnr": "AQ24fCAIRx0TiPVD1awZchhvbvm7VzViIGThGdU99iY=",
"fnr": "KsJu12NHdoZ",
"fornavn": "Donald",
"etternavn": "Duck",
"kjonn": "M",
"fodselsdato": "020995"
},
{
"fnr": "AQ24fCDfDoLR6AZpcsRyLUvh2jzbYfFUYRkcUb4eUoo=",
"fnr": "QqaTeUtXvjk",
"fornavn": "Mikke",
"etternavn": "Mus",
"kjonn": "M",
"fodselsdato": "060970"
},
{
"fnr": "AQ24fCBTARXudzT6ZXBdwUvvhnhg5DAHUXbXy2Cogy8=",
"fnr": "rAHb6rOyFtA",
"fornavn": "Anton",
"etternavn": "Duck",
"kjonn": "M",
2 changes: 1 addition & 1 deletion tests/integration/test_integration_deseudonymize.py
@@ -17,7 +17,7 @@ def test_depseudonymize_default_encryption(
) -> None:
result = (
Depseudonymize.from_polars(df_personer_fnr_daead_encrypted)
.on_fields("fnr", "fornavn", "etternavn", "kjonn", "fodselsdato")
.on_fields("fnr")
.with_default_encryption()
.run()
.to_polars()
52 changes: 23 additions & 29 deletions tests/integration/test_integration_pseudonymize.py
@@ -1,10 +1,18 @@
import inspect
import json
import typing as t
from collections.abc import Generator
from functools import partial
from pathlib import Path
from unittest.mock import ANY

import pandas as pd
import polars as pl
import pytest

from dapla_pseudo import Pseudonymize
from tests.integration.utils import df_personer
from tests.integration.utils import df_personer_fnr_daead_encrypted
from tests.integration.utils import get_calling_function_name
from tests.integration.utils import get_expected_datadoc_metadata_container
from tests.integration.utils import integration_test
@@ -41,46 +49,32 @@ def test_pseudonymize_default_encryption(


@integration_test()
def test_pseudonymize_default_encryption_all_fields(
def test_pseudonymize_papis_compatible_encryption(
setup: Generator[None, None, None], df_personer: pl.DataFrame
) -> None:
expected_result_df = pl.read_json(
"tests/data/personer_pseudonymized_default_encryption.json"
expected_result_fnr_df = pl.DataFrame(
{
"fnr": [
"KsJu12NHdoZ",
"QqaTeUtXvjk",
"rAHb6rOyFtA",
]
}
)
expected_result_df = df_personer.clone().update(expected_result_fnr_df)
result = (
Pseudonymize.from_polars(df_personer)
.on_fields(*df_personer.columns)
.with_default_encryption()
.on_fields("fnr")
.with_papis_compatible_encryption()
.run()
)
current_function_name = get_calling_function_name()
expected_metadata_container = get_expected_datadoc_metadata_container(
current_function_name
)
# When comparing the expected metadata with the metadata generated by the API,
# we need the pseudo_variables to be in the same order. This is not guaranteed by the API,
# so `pseudo_variables` are sorted before we make the comparison.
# In the context of this test we can guarantee that the `short_name` is unique, which allows us to sort by it.
if (result._datadoc.pseudonymization is not None) and (
expected_metadata_container.pseudonymization is not None
):
result._datadoc.pseudonymization.pseudo_variables = sorted(
result._datadoc.pseudonymization.pseudo_variables, # type: ignore
key=lambda pseudo_var: (
pseudo_var.short_name if isinstance(pseudo_var.short_name, str) else ""
),
)
expected_metadata_container.pseudonymization.pseudo_variables = sorted(
expected_metadata_container.pseudonymization.pseudo_variables, # type: ignore
key=lambda pseudo_var: (
pseudo_var.short_name if isinstance(pseudo_var.short_name, str) else ""
),
)

assert result._datadoc == expected_metadata_container
assert result.to_polars().equals(expected_result_df)
else:
raise AssertionError("MetadataContainer's pseudonymization object is None")

assert result.datadoc == expected_metadata_container.model_dump_json()
assert result.to_polars().equals(expected_result_df)


@integration_test()
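A side note on how the expected frame is built in the test above: Polars DataFrame.update overwrites the overlapping column (fnr) row by row and leaves the remaining columns untouched. A small sketch, assuming standard Polars update semantics:

import polars as pl

# update() replaces values in matching columns and keeps the rest unchanged.
df = pl.DataFrame({"fnr": ["11111111111"], "fornavn": ["Donald"]})
pseudo_fnr = pl.DataFrame({"fnr": ["KsJu12NHdoZ"]})

expected = df.clone().update(pseudo_fnr)
print(expected)  # fnr is now "KsJu12NHdoZ"; fornavn is still "Donald"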
70 changes: 70 additions & 0 deletions tests/integration/test_integration_result.py
@@ -0,0 +1,70 @@
import json
import typing as t
from collections.abc import Generator
from pathlib import Path

import pandas as pd
import polars as pl
import pytest

from dapla_pseudo import Pseudonymize
from tests.integration.utils import df_pandas_personer_fnr_daead_encrypted
from tests.integration.utils import df_personer
from tests.integration.utils import df_personer_fnr_daead_encrypted
from tests.integration.utils import df_personer_pandas
from tests.integration.utils import integration_test
from tests.integration.utils import personer_file_path
from tests.integration.utils import setup


@pytest.mark.parametrize(
    "output_func",
    [("file"), ("pandas"), ("polars")],
)
@pytest.mark.parametrize(
    "input_func",
    [("file"), ("pandas"), ("polars")],
)
@integration_test()
def test_pseudonymize_input_output_funcs(
    setup: Generator[None, None, None],
    input_func: t.Literal["file", "pandas", "polars"],
    output_func: t.Literal["file", "pandas", "polars"],
    tmp_path: Path,
    personer_file_path: str,
    df_personer_pandas: pd.DataFrame,
    df_personer: pl.DataFrame,
    df_personer_fnr_daead_encrypted: pl.DataFrame,
    df_pandas_personer_fnr_daead_encrypted: pd.DataFrame,
) -> None:
    """This test runs several times, once for every combination of the possible input and output datatypes.
    It is intended as an end-to-end test of the conversion between data types, e.g. Polars DataFrame -> File.
    """
    match input_func:
        case "file":
            pseudonymizer = Pseudonymize.from_file(personer_file_path)
        case "pandas":
            pseudonymizer = Pseudonymize.from_pandas(df_personer_pandas)
        case "polars":
            pseudonymizer = Pseudonymize.from_polars(df_personer)

    result = pseudonymizer.on_fields("fnr").with_default_encryption().run()

    match output_func:
        case "file":
            file_path = tmp_path / "personer_pseudo.json"
            result.to_file(str(file_path))

            expected = json.loads(
                open("tests/data/personer_pseudonymized_default_encryption.json").read()
            )
            actual = json.loads(file_path.open().read())

            assert expected == actual
        case "pandas":
            df_pandas = result.to_pandas()
            assert df_pandas_personer_fnr_daead_encrypted.equals(df_pandas)
        case "polars":
            df_polars = result.to_polars()
            assert df_personer_fnr_daead_encrypted.equals(df_polars)
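Because the two parametrize decorators are stacked, pytest runs the test for the full cross product of input and output kinds (3 × 3 = 9 cases). A minimal standalone sketch of the same pattern:

import pytest

# Stacked parametrize decorators yield the Cartesian product: 9 test runs
# with ids such as "polars-file".
@pytest.mark.parametrize("output_kind", ["file", "pandas", "polars"])
@pytest.mark.parametrize("input_kind", ["file", "pandas", "polars"])
def test_combo(input_kind: str, output_kind: str) -> None:
    assert {input_kind, output_kind} <= {"file", "pandas", "polars"}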