Read files with polars (#244)

* Read csv and parquet with polars
* Add example of how to override service url and token
* Revert to reading csv with pandas and fix tests
* Fix typing issue for PARQUET enum
* Map filetypes to their corresponding reader functions
statisticsnorway · Sep 13, 2023 · db8db61 · db8db61
1 parent e115965
commit db8db61
Show file tree

Hide file tree

Showing 5 changed files with 53 additions and 16 deletions.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -85,6 +85,15 @@ $ nox --session=tests
 Unit tests are located in the _tests_ directory,
 and are written using the [pytest] testing framework.
 
+### Local Testing
+
+When testing against a local instance of [dapla-pseudo-service](https://github.com/statisticsnorway/dapla-dlp-pseudo-service), you can configure the URL and authentication token by providing the following environment variables:
+
+```console
+PSEUDO_SERVICE_URL=http://localhost:<PORT>
+PSEUDO_SERVICE_AUTH_TOKEN=<KEYCLOAK_TOKEN>
+```
+
 [pytest]: https://pytest.readthedocs.io/
 
 ## How to submit changes

diff --git a/src/dapla_pseudo/v1/builder.py b/src/dapla_pseudo/v1/builder.py
@@ -18,6 +18,7 @@
 from dapla_pseudo.v1.ops import _client
 from dapla_pseudo.v1.supported_file_format import NoFileExtensionError
 from dapla_pseudo.v1.supported_file_format import SupportedFileFormat
+from dapla_pseudo.v1.supported_file_format import read_to_df
 
 
 class PseudonymizationResult:
@@ -104,8 +105,7 @@ def from_file(file_path_str: str, **kwargs: Any) -> "PseudoData._FieldSelector":
 
         file_format = SupportedFileFormat(file_extension)
 
-        pandas_function = getattr(pd, file_format.get_pandas_function_name())
-        return PseudoData._FieldSelector(pandas_function(file_path_str, **kwargs))
+        return PseudoData._FieldSelector(read_to_df(file_format, file_path_str, **kwargs))
 
     class _FieldSelector:
         """Select one or multiple fields to be pseudonymized."""

diff --git a/src/dapla_pseudo/v1/supported_file_format.py b/src/dapla_pseudo/v1/supported_file_format.py
@@ -1,5 +1,11 @@
 """Classes used to support reading of dataframes from file."""
 from enum import Enum
+from typing import Any
+from typing import Dict
+from typing import Union
+
+import pandas as pd
+import polars as pl
 
 
 class SupportedFileFormat(Enum):
@@ -10,9 +16,21 @@ class SupportedFileFormat(Enum):
     XML = "xml"
     PARQUET = "parquet"
 
-    def get_pandas_function_name(self) -> str:
-        """Return the pandas function name for the file format."""
-        return f"read_{self.value}"
+
+# Maps each supported file format to the function used to read it.
+# Only parquet is read with polars; csv, json and xml are read with pandas.
+FORMAT_TO_READER_FUNCTION = {
+    SupportedFileFormat.CSV: pd.read_csv,
+    SupportedFileFormat.JSON: pd.read_json,
+    SupportedFileFormat.XML: pd.read_xml,
+    SupportedFileFormat.PARQUET: pl.read_parquet,
+}
+
+
+def read_to_df(
+    supported_format: SupportedFileFormat, file_path: str, **kwargs: Any
+) -> Union[pl.DataFrame, pd.DataFrame]:
+    """Read a file with a supported file format into a DataFrame.
+
+    Args:
+        supported_format: Format of the file; selects the reader function
+            from FORMAT_TO_READER_FUNCTION.
+        file_path: Path of the file, passed to the reader as its first argument.
+        **kwargs: Extra keyword arguments forwarded unchanged to the reader function.
+
+    Returns:
+        Whatever the selected reader returns: normally a polars DataFrame for
+        parquet and a pandas DataFrame for the other formats in the mapping.
+
+    Raises:
+        KeyError: If no reader function is registered for ``supported_format``.
+    """
+    # The annotation on ``**kwargs`` describes each keyword *value*. Reader
+    # options are arbitrary values (str, bool, dict, ...), hence ``Any``
+    # rather than ``Dict[str, Any]``.
+    reader_function = FORMAT_TO_READER_FUNCTION[supported_format]
+    return reader_function(file_path, **kwargs)
 
 
 class NoFileExtensionError(Exception):

diff --git a/tests/v1/test_builder.py b/tests/v1/test_builder.py
@@ -211,8 +211,8 @@ def test_builder_from_file_no_file_extension() -> None:
         PseudoData.from_file(path)
 
 
-@patch(f"{PKG}.pd.read_csv")
-def test_builder_from_file_with_storage_options(pandas_form_csv: Mock) -> None:
+@patch(f"{PKG}.read_to_df")
+def test_builder_from_file_with_storage_options(_mock_read_to_df: Mock) -> None:
     # This should not raise a FileNotFoundError
     # since the file is not on the local filesystem
     try:
@@ -225,7 +225,7 @@ def test_builder_from_file_with_storage_options(pandas_form_csv: Mock) -> None:
 
 @pytest.mark.parametrize(
     "file_format,expected_error",
-    [("json", "ValueError"), ("csv", "EmptyDataError"), ("xml", "XMLSyntaxError"), ("parquet", "ArrowInvalid")],
+    [("json", "ValueError"), ("csv", "EmptyDataError"), ("xml", "XMLSyntaxError"), ("parquet", "ArrowErrorException")],
 )
 @patch("pathlib.Path.suffix")
 def test_builder_from_file_empty_file(mock_path_suffix: Mock, file_format: str, expected_error: str) -> None:

diff --git a/tests/v1/test_supported_file_format.py b/tests/v1/test_supported_file_format.py
@@ -1,21 +1,31 @@
-import pandas as pd
+import polars as pl
 import pytest
 
 from dapla_pseudo.v1.supported_file_format import SupportedFileFormat
+from dapla_pseudo.v1.supported_file_format import read_to_df
 
 
 PKG = "dapla_pseudo.v1.supported_file_format"
-
-
-@pytest.mark.parametrize("file_format", ["json", "csv", "xml", "parquet"])
-def test_get_pandas_function_name(file_format: str) -> None:
-    # Checks that a pandas function exists for all supported file formats.
-    supported_file_format = SupportedFileFormat(file_format)
-    assert getattr(pd, supported_file_format.get_pandas_function_name())
+TEST_FILE_PATH = "tests/v1/test_files"
 
 
 def test_get_pandas_function_name_unsupported_format() -> None:
     # Checks that an unsupported file extension raises a ValueError.
     unsupported_format = "notsupported"
     with pytest.raises(ValueError):
         SupportedFileFormat(unsupported_format)
+
+
+@pytest.mark.parametrize(
+    "file_format, read_with_polars",
+    [
+        ("json", False),
+        ("csv", False),
+        ("xml", False),
+        ("parquet", True),
+    ],
+)
+def test_supported_files_read_with_polars(file_format: str, read_with_polars: bool) -> None:
+    """Check which formats come back from read_to_df as a polars DataFrame."""
+    sample_path = f"{TEST_FILE_PATH}/test.{file_format}"
+    loaded = read_to_df(SupportedFileFormat(file_format), sample_path)
+    # Compare against the expectation for this format.
+    is_polars_frame = isinstance(loaded, pl.DataFrame)
+    assert is_polars_frame is read_with_polars