Skip to content

Commit

Permalink
add from_flat
Browse files Browse the repository at this point in the history
  • Loading branch information
dougbrn committed Jul 16, 2024
1 parent 3cd3baa commit e32efc2
Show file tree
Hide file tree
Showing 2 changed files with 74 additions and 0 deletions.
50 changes: 50 additions & 0 deletions src/nested_pandas/nestedframe/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,56 @@ def add_nested(
new_df[label] = packed
return new_df

@classmethod
def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nested"):
    """Create a NestedFrame with base and nested columns from a flat dataframe.

    Parameters
    ----------
    df: pd.DataFrame or NestedFrame
        A flat dataframe.
    base_columns: list-like
        The columns that should be used as base (flat) columns in the
        output dataframe.
    nested_columns: list-like, or None
        The columns that should be packed into a nested column. All columns
        in the list will attempt to be packed into a single nested column
        with the name provided in `name`. If None, is defined as all
        columns not in `base_columns`.
    index: str, or None
        The name of a column to use as the new index. If not provided the
        current index will be used. If the chosen column is also listed in
        `base_columns` or `nested_columns` it is removed from them, since
        it no longer exists as a column once it becomes the index.
    name:
        The name of the output column the `nested_columns` are packed into.

    Returns
    -------
    NestedFrame
        A NestedFrame with the specified nesting structure. It has one row
        per unique index value (the first occurrence is kept for the base
        columns).

    Examples
    --------
    >>> nf = NestedFrame({"a":[1,1,1,2,2], "b":[2,2,2,4,4], "c":[1,2,3,4,5], "d":[2,4,6,8,10]}, index=[0,0,0,1,1])
    >>> NestedFrame.from_flat(nf, base_columns=["a","b"])
    """
    # Materialise as a list so that any list-like (tuple, Index, ...) is
    # treated as a multi-column selection rather than a single key.
    base_columns = list(base_columns)

    # Resolve new index
    if index is not None:
        # The index column stops being a regular column after set_index,
        # so it must not be selected as a base or nested column.
        base_columns = [col for col in base_columns if col != index]
        if nested_columns is not None:
            nested_columns = [col for col in nested_columns if col != index]
        df = df.set_index(index)

    # Base layer: keep only the first row for each index value
    out_df = df[base_columns][~df.index.duplicated(keep="first")]

    # A plain pandas DataFrame has no `add_nested`; promote it so that the
    # documented `pd.DataFrame` input actually works.
    if not isinstance(out_df, cls):
        out_df = cls(out_df)

    # Nested layer: honour an explicit selection, otherwise default to
    # every remaining non-base column.
    if nested_columns is None:
        nested_columns = [col for col in df.columns if col not in base_columns]
    else:
        nested_columns = list(nested_columns)

    return out_df.add_nested(df[nested_columns], name=name)

def _split_query(self, expr) -> dict:
"""Splits a pandas query into multiple subqueries for nested and base layers"""
# Ensure query has needed spacing for upcoming split
Expand Down
24 changes: 24 additions & 0 deletions tests/nested_pandas/nestedframe/test_nestedframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,30 @@ def test_add_nested_for_empty_df():
assert_frame_equal(new_base.nested.nest.to_flat(), nested.astype(pd.ArrowDtype(pa.float64())))


@pytest.mark.parametrize("index", [None, "a", "c"])
def test_from_flat(index):
    """Check that NestedFrame.from_flat builds the expected base/nested layout.

    The same flat frame is nested three ways: keeping the existing index,
    re-indexing on a base column ("a"), and re-indexing on a non-base
    column ("c").
    """
    # Expected (output columns, nested fields, row count) per `index` choice.
    expectations = {
        None: (["a", "b", "new_nested"], ["c", "d"], 2),
        # the index column is dropped from the base columns
        "a": (["b", "new_nested"], ["c", "d"], 2),
        # not what a user likely wants, but should still work
        "c": (["a", "b", "new_nested"], ["d"], 5),
    }

    flat = NestedFrame(
        {"a": [1, 1, 1, 2, 2], "b": [2, 2, 2, 4, 4], "c": [1, 2, 3, 4, 5], "d": [2, 4, 6, 8, 10]},
        index=[0, 0, 0, 1, 1],
    )

    result = NestedFrame.from_flat(flat, base_columns=["a", "b"], index=index, name="new_nested")

    want_columns, want_fields, want_rows = expectations[index]
    assert list(result.columns) == want_columns
    assert list(result.new_nested.nest.fields) == want_fields
    assert len(result) == want_rows


def test_query():
"""Test that NestedFrame.query handles nested queries correctly"""

Expand Down

0 comments on commit e32efc2

Please sign in to comment.