diff --git a/src/nested_dask/__init__.py b/src/nested_dask/__init__.py
index 4b83f63..c84f36d 100644
--- a/src/nested_dask/__init__.py
+++ b/src/nested_dask/__init__.py
@@ -2,4 +2,5 @@
 from .core import NestedFrame  # noqa
 from .io import read_parquet  # noqa
 from .datasets import generate_data  # noqa
+from .utils import count_nested  # noqa
 from ._version import __version__  # noqa
diff --git a/src/nested_dask/utils/__init__.py b/src/nested_dask/utils/__init__.py
new file mode 100644
index 0000000..ed5d0c5
--- /dev/null
+++ b/src/nested_dask/utils/__init__.py
@@ -0,0 +1 @@
+from .utils import *  # noqa
diff --git a/src/nested_dask/utils/utils.py b/src/nested_dask/utils/utils.py
new file mode 100644
index 0000000..f115cac
--- /dev/null
+++ b/src/nested_dask/utils/utils.py
@@ -0,0 +1,62 @@
+import nested_pandas as npd
+import pandas as pd
+from nested_pandas import utils as npd_utils
+
+from ..core import NestedFrame
+
+
+def count_nested(df, nested, by=None, join=True) -> NestedFrame:
+    """Count the rows of a nested column for each row of a NestedFrame.
+
+    Wraps Nested-Pandas count_nested, applying it to every partition.
+
+    Parameters
+    ----------
+    df: NestedFrame
+        A NestedFrame that contains the desired `nested` series
+        to count.
+    nested: str
+        The label of the nested series to count.
+    by: str, optional
+        Specifies a column within nested to count by, returning
+        a count for each unique value in `by`. The unique values are
+        computed up front because they determine the output columns.
+    join: bool, optional
+        Join the output count columns to df and return df, otherwise
+        just return a NestedFrame containing only the count columns.
+
+    Returns
+    -------
+    NestedFrame
+        The count columns, joined onto the columns of `df` when `join`
+        is True.
+    """
+
+    # The meta varies depending on the parameters
+
+    # first depending on by
+    if by is not None:
+        # will have one column per unique value of the specified column
+        # requires some computation to determine these values
+        # TODO: Requires modification of nested-pandas to always produce
+        # sorted output columns for meta
+        by_cols = sorted(df[nested].nest.to_flat()[by].unique())
+        out_cols = [f"n_{nested}_{col}" for col in by_cols]
+    else:
+        # otherwise just have a single column output
+        out_cols = [f"n_{nested}"]
+
+    # Use the metadata already attached to df; df.head(0) would trigger
+    # a computation (Dask's head computes by default) for an empty frame.
+    df_meta = df._meta
+
+    # add dtypes, built on the index of df's meta so both metas align
+    meta = npd.NestedFrame({col: 0 for col in out_cols}, index=df_meta.index)
+
+    # and second depending on join
+    if join:
+        # adds the count columns onto the existing meta; axis=1 appends
+        # columns, whereas the default axis=0 would stack rows instead
+        meta = pd.concat([df_meta, meta], axis=1)
+
+    return df.map_partitions(lambda x: npd_utils.count_nested(x, nested, by=by, join=join), meta=meta)
diff --git a/tests/nested_dask/test_utils.py b/tests/nested_dask/test_utils.py
new file mode 100644
index 0000000..d23bc8f
--- /dev/null
+++ b/tests/nested_dask/test_utils.py
@@ -0,0 +1,17 @@
+import nested_dask as nd
+import pytest
+from nested_pandas.utils import count_nested
+
+
+@pytest.mark.parametrize("join", [True, False])
+@pytest.mark.parametrize("by", [None, "band"])
+def test_count_nested(test_dataset, join, by):
+    """test the count_nested wrapper"""
+
+    # count_nested functionality is tested on the nested-pandas side
+    # let's just make sure the behavior here is identical.
+
+    result_dsk = nd.utils.count_nested(test_dataset, "nested", join=join, by=by).compute()
+    result_pd = count_nested(test_dataset.compute(), "nested", join=join, by=by)
+
+    assert result_dsk.equals(result_pd)