-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
73 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .utils import * # noqa |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
import nested_pandas as npd | ||
import pandas as pd | ||
from nested_pandas import utils as npd_utils | ||
|
||
from ..core import NestedFrame | ||
|
||
|
||
def count_nested(df, nested, by=None, join=True) -> NestedFrame:
    """Count the rows held in a nested column of a NestedFrame.

    Thin wrapper that applies the nested-pandas ``count_nested`` utility to
    each partition of `df` through ``map_partitions``.

    Parameters
    ----------
    df: NestedFrame
        The frame holding the `nested` column to be counted.
    nested: 'str'
        Label of the nested column whose rows are counted.
    by: 'str', optional
        A column inside `nested`; when given, one count column is produced
        for each unique value found in that column.
    join: bool, optional
        When True the count columns are joined onto `df` and that frame is
        returned; when False only the count columns are returned.

    Returns
    -------
    NestedFrame
    """
    # The output schema (meta) must be declared up front, and its columns
    # depend on both `by` and `join`.
    if by is None:
        # a single count column covering the whole nested series
        count_labels = [f"n_{nested}"]
    else:
        # One count column per distinct `by` value; finding those values
        # requires some computation over the flattened nested data.
        # TODO: Requires modification of nested-pandas to always produce
        # sorted output columns for meta
        flat = df[nested].nest.to_flat()
        distinct_values = sorted(flat[by].unique())
        count_labels = [f"n_{nested}_{value}" for value in distinct_values]

    # Empty frame; the scalar 0 is there to give each count column a dtype.
    meta = npd.NestedFrame(dict.fromkeys(count_labels, 0), index=[])

    if join:
        # the joined output also carries the columns of the input frame
        meta = pd.concat([df.head(0), meta])

    return df.map_partitions(
        lambda part: npd_utils.count_nested(part, nested, by=by, join=join),
        meta=meta,
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
import nested_dask as nd | ||
import pytest | ||
from nested_pandas.utils import count_nested | ||
|
||
|
||
@pytest.mark.parametrize("join", [True, False])
@pytest.mark.parametrize("by", [None, "band"])
def test_count_nested(test_dataset, join, by):
    """Check that the dask count_nested wrapper matches nested-pandas."""

    # The counting logic itself is covered by the nested-pandas test suite;
    # here we only confirm that the wrapper reproduces its output.
    options = {"join": join, "by": by}

    # wrapper applied to the lazy dataset, then materialised
    from_wrapper = nd.utils.count_nested(test_dataset, "nested", **options).compute()

    # reference: materialise first, then call nested-pandas directly
    materialised = test_dataset.compute()
    from_reference = count_nested(materialised, "nested", **options)

    assert from_wrapper.equals(from_reference)