Skip to content

Commit

Permalink
Read files with polars (#244)
Browse files Browse the repository at this point in the history
* Read csv and parquet with polars

* Add example of how to override service url and token

* Revert to reading csv with pandas and fix tests

* Fix typing issue for PARQUET enum

* Map filetypes to their corresponding reader functions
  • Loading branch information
Andilun authored Sep 13, 2023
1 parent e115965 commit db8db61
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 16 deletions.
9 changes: 9 additions & 0 deletions CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,15 @@ $ nox --session=tests
Unit tests are located in the _tests_ directory,
and are written using the [pytest] testing framework.

### Local Testing

When testing against a local instance of [dapla-pseudo-service](https://github.com/statisticsnorway/dapla-dlp-pseudo-service), you can configure the URL and authentication token by providing the following environment variables:

```console
PSEUDO_SERVICE_URL=http://localhost:<PORT>
PSEUDO_SERVICE_AUTH_TOKEN=<KEYCLOAK_TOKEN>
```

[pytest]: https://pytest.readthedocs.io/

## How to submit changes
Expand Down
4 changes: 2 additions & 2 deletions src/dapla_pseudo/v1/builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from dapla_pseudo.v1.ops import _client
from dapla_pseudo.v1.supported_file_format import NoFileExtensionError
from dapla_pseudo.v1.supported_file_format import SupportedFileFormat
from dapla_pseudo.v1.supported_file_format import read_to_df


class PseudonymizationResult:
Expand Down Expand Up @@ -104,8 +105,7 @@ def from_file(file_path_str: str, **kwargs: Any) -> "PseudoData._FieldSelector":

file_format = SupportedFileFormat(file_extension)

pandas_function = getattr(pd, file_format.get_pandas_function_name())
return PseudoData._FieldSelector(pandas_function(file_path_str, **kwargs))
return PseudoData._FieldSelector(read_to_df(file_format, file_path_str, **kwargs))

class _FieldSelector:
"""Select one or multiple fields to be pseudonymized."""
Expand Down
24 changes: 21 additions & 3 deletions src/dapla_pseudo/v1/supported_file_format.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
"""Classes used to support reading of dataframes from file."""
from enum import Enum
from typing import Any
from typing import Dict
from typing import Union

import pandas as pd
import polars as pl


class SupportedFileFormat(Enum):
Expand All @@ -10,9 +16,21 @@ class SupportedFileFormat(Enum):
XML = "xml"
PARQUET = "parquet"

def get_pandas_function_name(self) -> str:
"""Return the pandas function name for the file format."""
return f"read_{self.value}"

# Lookup table from each supported file format to the function used to read it.
# Parquet files are read with polars; csv, json and xml files are read with pandas,
# so the resulting dataframe type depends on the format.
FORMAT_TO_READER_FUNCTION = {
SupportedFileFormat.CSV: pd.read_csv,
SupportedFileFormat.JSON: pd.read_json,
SupportedFileFormat.XML: pd.read_xml,
SupportedFileFormat.PARQUET: pl.read_parquet,
}


def read_to_df(
    supported_format: SupportedFileFormat, file_path: str, **kwargs: Any
) -> Union[pl.DataFrame, pd.DataFrame]:
    """Read a file with a supported file format into a dataframe.

    Args:
        supported_format: The format of the file, used to look up the reader
            function in ``FORMAT_TO_READER_FUNCTION``.
        file_path: Path of the file to read, passed as the first argument to
            the reader function.
        **kwargs: Extra keyword arguments forwarded unchanged to the reader
            function.

    Returns:
        Whatever the selected reader function returns: a polars dataframe for
        formats mapped to a polars reader, otherwise a pandas dataframe.

    Raises:
        KeyError: If ``supported_format`` has no entry in
            ``FORMAT_TO_READER_FUNCTION``.
    """
    # NOTE: ``**kwargs`` is annotated with the type of each individual value
    # (``Any``), not with the type of the collected mapping.
    reader_function = FORMAT_TO_READER_FUNCTION[supported_format]
    return reader_function(file_path, **kwargs)


class NoFileExtensionError(Exception):
Expand Down
6 changes: 3 additions & 3 deletions tests/v1/test_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,8 +211,8 @@ def test_builder_from_file_no_file_extension() -> None:
PseudoData.from_file(path)


@patch(f"{PKG}.pd.read_csv")
def test_builder_from_file_with_storage_options(pandas_form_csv: Mock) -> None:
@patch(f"{PKG}.read_to_df")
def test_builder_from_file_with_storage_options(_mock_read_to_df: Mock) -> None:
# This should not raise a FileNotFoundError
# since the file is not on the local filesystem
try:
Expand All @@ -225,7 +225,7 @@ def test_builder_from_file_with_storage_options(pandas_form_csv: Mock) -> None:

@pytest.mark.parametrize(
"file_format,expected_error",
[("json", "ValueError"), ("csv", "EmptyDataError"), ("xml", "XMLSyntaxError"), ("parquet", "ArrowInvalid")],
[("json", "ValueError"), ("csv", "EmptyDataError"), ("xml", "XMLSyntaxError"), ("parquet", "ArrowErrorException")],
)
@patch("pathlib.Path.suffix")
def test_builder_from_file_empty_file(mock_path_suffix: Mock, file_format: str, expected_error: str) -> None:
Expand Down
26 changes: 18 additions & 8 deletions tests/v1/test_supported_file_format.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,31 @@
import pandas as pd
import polars as pl
import pytest

from dapla_pseudo.v1.supported_file_format import SupportedFileFormat
from dapla_pseudo.v1.supported_file_format import read_to_df


PKG = "dapla_pseudo.v1.supported_file_format"


@pytest.mark.parametrize("file_format", ["json", "csv", "xml", "parquet"])
def test_get_pandas_function_name(file_format: str) -> None:
# Checks that a pandas function exists for all supported file formats.
supported_file_format = SupportedFileFormat(file_format)
assert getattr(pd, supported_file_format.get_pandas_function_name())
TEST_FILE_PATH = "tests/v1/test_files"


def test_get_pandas_function_name_unsupported_format() -> None:
    """An unsupported file extension must be rejected with a ValueError."""
    with pytest.raises(ValueError):
        SupportedFileFormat("notsupported")


@pytest.mark.parametrize(
    "file_format, read_with_polars",
    [("json", False), ("csv", False), ("xml", False), ("parquet", True)],
)
def test_supported_files_read_with_polars(file_format: str, read_with_polars: bool) -> None:
    """Check, per file format, whether ``read_to_df`` yields a polars dataframe."""
    sample_file = f"{TEST_FILE_PATH}/test.{file_format}"
    loaded = read_to_df(SupportedFileFormat(file_format), sample_file)
    is_polars_frame = isinstance(loaded, pl.DataFrame)
    assert is_polars_frame is read_with_polars

0 comments on commit db8db61

Please sign in to comment.