diff --git a/src/nested_pandas/__init__.py b/src/nested_pandas/__init__.py
index 613f651..e63bd1c 100644
--- a/src/nested_pandas/__init__.py
+++ b/src/nested_pandas/__init__.py
@@ -1,8 +1,9 @@
 from .example_module import greetings, meaning
 from .nestedframe import NestedFrame
+from .nestedframe.io import read_parquet  # Import for registering
 from .series.accessor import NestSeriesAccessor  # noqa: F401
 from .series.dtype import NestedDtype
 
-__all__ = ["greetings", "meaning", "NestedDtype", "NestedFrame"]
+__all__ = ["greetings", "meaning", "NestedDtype", "NestedFrame", "read_parquet"]
diff --git a/src/nested_pandas/nestedframe/__init__.py b/src/nested_pandas/nestedframe/__init__.py
index 54af689..a656cf3 100644
--- a/src/nested_pandas/nestedframe/__init__.py
+++ b/src/nested_pandas/nestedframe/__init__.py
@@ -1 +1,2 @@
 from .core import NestedFrame  # noqa
+from .io import read_parquet  # noqa
diff --git a/src/nested_pandas/nestedframe/io.py b/src/nested_pandas/nestedframe/io.py
new file mode 100644
index 0000000..e0f773f
--- /dev/null
+++ b/src/nested_pandas/nestedframe/io.py
@@ -0,0 +1,79 @@
+# typing.Self and "|" union syntax don't exist in Python 3.9
+from __future__ import annotations
+
+import pandas as pd
+from pandas._libs import lib
+from pandas._typing import (
+    DtypeBackend,
+    FilePath,
+    ReadBuffer,
+)
+
+from .core import NestedFrame
+
+
+def read_parquet(
+    data: FilePath | ReadBuffer[bytes],
+    to_pack: dict,
+    columns: list[str] | None = None,
+    pack_columns: dict | None = None,
+    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+) -> NestedFrame:
+    """
+    Load a parquet object from a file path and load a set of other
+    parquet objects to pack into the resulting NestedFrame.
+
+    Docstring based on the Pandas equivalent.
+
+    # TODO after MVP: Include full kwarg-set
+    # TODO: Switch dtype backend default?
+
+    Parameters
+    ----------
+    data : str, path object or file-like object
+        String, path object (implementing ``os.PathLike[str]``), or file-like
+        object implementing a binary ``read()`` function.
+        The string could be a URL. Valid URL schemes include http, ftp, s3,
+        gs, and file. For file URLs, a host is expected. A local file could be:
+        ``file://localhost/path/to/table.parquet``.
+        A file URL can also be a path to a directory that contains multiple
+        partitioned parquet files. Both pyarrow and fastparquet support
+        paths to directories as well as file URLs. A directory path could be:
+        ``file://localhost/path/to/tables`` or ``s3://bucket/partition_dir``.
+    to_pack : dict
+        A dictionary of parquet data paths (same criteria as `data`), where
+        each key reflects the desired column name to pack the data into and
+        each value reflects the parquet data to pack.
+    columns : list, default=None
+        If not None, only these columns will be read from the file.
+    pack_columns : dict, default=None
+        If not None, selects a set of columns from each keyed nested parquet
+        object to read from the nested files.
+    dtype_backend : {'numpy_nullable', 'pyarrow'}, default 'numpy_nullable'
+        Back-end data type applied to the resultant :class:`DataFrame`
+        (still experimental). Behaviour is as follows:
+
+        * ``"numpy_nullable"``: returns nullable-dtype-backed :class:`DataFrame`
+          (default).
+        * ``"pyarrow"``: returns pyarrow-backed nullable :class:`ArrowDtype`
+          DataFrame.
+
+    Returns
+    -------
+    NestedFrame
+        The base parquet data with each `to_pack` dataset packed into its
+        own nested column.
+    """
+    # Read the base frame, then promote it to a NestedFrame.
+    df = NestedFrame(pd.read_parquet(data, engine="pyarrow", columns=columns, dtype_backend=dtype_backend))
+
+    # Read each nested dataset (optionally restricted to a column subset)
+    # and pack it into its own nested column of the base frame.
+    for pack_key in to_pack:
+        col_subset = pack_columns.get(pack_key, None) if pack_columns is not None else None
+        packed = pd.read_parquet(
+            to_pack[pack_key], engine="pyarrow", columns=col_subset, dtype_backend=dtype_backend
+        )
+        df = df.add_nested(packed, pack_key)
+
+    return df
diff --git a/tests/nested_pandas/nestedframe/test_io.py b/tests/nested_pandas/nestedframe/test_io.py
new file mode 100644
index 0000000..8037755
--- /dev/null
+++ b/tests/nested_pandas/nestedframe/test_io.py
@@ -0,0 +1,56 @@
+import os
+
+import pandas as pd
+import pytest
+from nested_pandas import read_parquet
+
+
+@pytest.mark.parametrize("columns", [["a"], None])
+@pytest.mark.parametrize("pack_columns", [{"nested1": ["c"], "nested2": ["e"]}, {"nested1": ["d"]}, None])
+def test_read_parquet(tmp_path, columns, pack_columns):
+    """Test nested parquet loading"""
+    # Generate some test data
+    base = pd.DataFrame(data={"a": [1, 2, 3], "b": [2, 4, 6]}, index=[0, 1, 2])
+
+    nested1 = pd.DataFrame(
+        data={"c": [0, 2, 4, 1, 4, 3, 1, 4, 1], "d": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
+        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
+    )
+
+    nested2 = pd.DataFrame(
+        data={"e": [0, 2, 4, 1, 4, 3, 1, 4, 1], "f": [5, 4, 7, 5, 3, 1, 9, 3, 4]},
+        index=[0, 0, 0, 1, 1, 1, 2, 2, 2],
+    )
+
+    # Save to parquet in the pytest-provided temporary directory
+    base.to_parquet(os.path.join(tmp_path, "base.parquet"))
+    nested1.to_parquet(os.path.join(tmp_path, "nested1.parquet"))
+    nested2.to_parquet(os.path.join(tmp_path, "nested2.parquet"))
+
+    # Read from parquet
+    nf = read_parquet(
+        data=os.path.join(tmp_path, "base.parquet"),
+        to_pack={
+            "nested1": os.path.join(tmp_path, "nested1.parquet"),
+            "nested2": os.path.join(tmp_path, "nested2.parquet"),
+        },
+        columns=columns,
+        pack_columns=pack_columns,
+    )
+
+    # Check base columns: requested subset (or all) plus the packed columns
+    if columns is not None:
+        assert nf.columns.tolist() == columns + ["nested1", "nested2"]
+    else:
+        assert nf.columns.tolist() == base.columns.tolist() + ["nested1", "nested2"]
+
+    # Check nested columns: requested subset per nested frame, or all fields
+    if pack_columns is not None:
+        for nested_col in pack_columns:
+            assert nf[nested_col].nest.fields == pack_columns[nested_col]
+    else:
+        for nested_col in nf.nested_columns:
+            if nested_col == "nested1":
+                assert nf[nested_col].nest.fields == nested1.columns.tolist()
+            elif nested_col == "nested2":
+                assert nf[nested_col].nest.fields == nested2.columns.tolist()