Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wrapper for count_nested #31

Merged
merged 5 commits into from
Jul 16, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ classifiers = [
dynamic = ["version"]
requires-python = ">=3.9"
dependencies = [
'nested-pandas==0.1.1',
'nested-pandas==0.1.2',
'numpy',
'dask>=2024.3.0',
'dask[distributed]>=2024.3.0',
Expand Down
1 change: 1 addition & 0 deletions src/nested_dask/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,5 @@
from .core import NestedFrame # noqa
from .io import read_parquet # noqa
from .datasets import generate_data # noqa
from .utils import count_nested # noqa
from ._version import __version__ # noqa
1 change: 1 addition & 0 deletions src/nested_dask/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .utils import * # noqa
54 changes: 54 additions & 0 deletions src/nested_dask/utils/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import nested_pandas as npd
import pandas as pd
from nested_pandas import utils as npd_utils

from ..core import NestedFrame


def count_nested(df, nested, by=None, join=True) -> NestedFrame:
    """Count the rows of a nested column for each row of a NestedFrame.

    Wraps Nested-Pandas ``count_nested``, applying it to every partition
    of `df` via ``map_partitions``.

    Parameters
    ----------
    df: NestedFrame
        A NestedFrame that contains the desired `nested` series
        to count.
    nested: 'str'
        The label of the nested series to count.
    by: 'str', optional
        Specifies a column within nested to count by, returning
        a count for each unique value in `by`. The unique values of
        this column are evaluated up front, because they determine
        the names of the output columns.
    join: bool, optional
        Join the output count columns to df and return df, otherwise
        just return a NestedFrame containing only the count columns.

    Returns
    -------
    NestedFrame
        The count columns, named ``n_{nested}`` or, when `by` is given,
        ``n_{nested}_{value}`` for each unique value; preceded by the
        columns of `df` when `join` is True.
    """

    # The output schema (meta) depends on both `by` and `join`.
    if by is None:
        # a single count column
        out_cols = [f"n_{nested}"]
    else:
        # One column per unique value of `by`; finding those values
        # requires some computation.
        # TODO: Requires modification of nested-pandas to always produce
        # sorted output columns for meta
        by_cols = sorted(df[nested].nest.to_flat()[by].unique())
        out_cols = [f"n_{nested}_{col}" for col in by_cols]

    # Empty integer columns, built on the input's own (empty) meta index so
    # that the index dtype and name carry over to the output meta.
    meta = npd.NestedFrame({col: 0 for col in out_cols}, index=df._meta.index)

    if join:
        # Append the count columns to the existing meta column-wise.
        # A row-wise concat (the pandas default, axis=0) of frames with
        # disjoint columns pads each frame with missing columns, which can
        # upcast dtypes (e.g. int -> float) and misdescribe the real output.
        meta = pd.concat([df._meta, meta], axis=1)

    return df.map_partitions(lambda x: npd_utils.count_nested(x, nested, by=by, join=join), meta=meta)
17 changes: 17 additions & 0 deletions tests/nested_dask/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import nested_dask as nd
import pytest
from nested_pandas.utils import count_nested


@pytest.mark.parametrize("join", [True, False])
@pytest.mark.parametrize("by", [None, "band"])
def test_count_nested(test_dataset, join, by):
    """Check that the Dask wrapper matches nested-pandas count_nested."""

    # The counting logic itself is exercised by the nested-pandas tests;
    # here we only verify that the wrapper produces an identical result
    # for every combination of `join` and `by`.
    options = {"join": join, "by": by}

    actual = nd.utils.count_nested(test_dataset, "nested", **options).compute()
    expected = count_nested(test_dataset.compute(), "nested", **options)

    assert actual.equals(expected)