Skip to content

Commit

Permalink
wrapper implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
dougbrn committed Jun 11, 2024
1 parent cb844e6 commit be70681
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/nested_dask/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
from .core import NestedFrame # noqa
from .io import read_parquet # noqa
from .datasets import generate_data # noqa
from .utils import count_nested # noqa
from ._version import __version__ # noqa
1 change: 1 addition & 0 deletions src/nested_dask/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .utils import * # noqa
54 changes: 54 additions & 0 deletions src/nested_dask/utils/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import nested_pandas as npd
import pandas as pd
from nested_pandas import utils as npd_utils

from ..core import NestedFrame


def count_nested(df, nested, by=None, join=True) -> NestedFrame:
    """Count the rows of a nested column, one partition at a time.

    Wraps Nested-Pandas ``count_nested`` by applying it to every partition
    of `df` through ``map_partitions``.

    Parameters
    ----------
    df: NestedFrame
        A NestedFrame that contains the desired `nested` series
        to count.
    nested: 'str'
        The label of the nested series to count.
    by: 'str', optional
        Specifies a column within nested to count by, returning
        a count for each unique value in `by`.
    join: bool, optional
        Join the output count columns to df and return df, otherwise
        just return a NestedFrame containing only the count columns.

    Returns
    -------
    NestedFrame
        The result of ``df.map_partitions`` over the Nested-Pandas
        ``count_nested`` function.

    Notes
    -----
    When `by` is given, the unique values of that column are evaluated
    up front, because they determine the names of the output columns
    that have to be declared in the metadata.
    """

    # The meta (empty frame describing the output schema) varies with the
    # parameters: first with `by`, then with `join`.
    if by is not None:
        # One output column per unique value of the `by` column; finding
        # those values requires some computation.
        # TODO: Requires modification of nested-pandas to always produce
        # sorted output columns for meta
        by_cols = sorted(df[nested].nest.to_flat()[by].unique())
        out_cols = [f"n_{nested}_{col}" for col in by_cols]
    else:
        # Otherwise there is just a single count column.
        out_cols = [f"n_{nested}"]

    # The scalar 0 placeholder gives each count column its dtype, while the
    # empty index keeps the meta frame free of rows.
    meta = npd.NestedFrame(dict.fromkeys(out_cols, 0), index=[])

    if join:
        # Prepend the schema of the input frame. The collection's own
        # metadata is used instead of `df.head(0)`, which would execute
        # part of the task graph only to obtain an empty frame.
        # NOTE(review): pd.concat stacks along axis=0 by default; with two
        # empty frames this yields the union of their columns -- confirm
        # that the resulting dtypes match the computed output.
        meta = pd.concat([df._meta, meta])

    return df.map_partitions(
        lambda x: npd_utils.count_nested(x, nested, by=by, join=join),
        meta=meta,
    )
17 changes: 17 additions & 0 deletions tests/nested_dask/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import nested_dask as nd
import pytest
from nested_pandas.utils import count_nested


@pytest.mark.parametrize("join", [True, False])
@pytest.mark.parametrize("by", [None, "band"])
def test_count_nested(test_dataset, join, by):
    """Check that the Dask count_nested wrapper agrees with nested-pandas."""

    # The counting logic itself is exercised by the nested-pandas test
    # suite; here we only verify that the wrapper reproduces the result
    # obtained on the fully materialized frame.
    options = {"join": join, "by": by}

    expected = count_nested(test_dataset.compute(), "nested", **options)

    lazy_counts = nd.utils.count_nested(test_dataset, "nested", **options)
    observed = lazy_counts.compute()

    assert observed.equals(expected)

0 comments on commit be70681

Please sign in to comment.