From e32efc20bc3bb08a4718df3893d0ecb94c0d16f2 Mon Sep 17 00:00:00 2001
From: Doug Branton
Date: Tue, 16 Jul 2024 14:57:27 -0700
Subject: [PATCH] add from_flat

---
Review notes (this section is not part of the commit message):

* from_flat now honors an explicit `nested_columns`; previously the
  argument was always overwritten with "every non-base column".
* A plain pd.DataFrame input is promoted to `cls` before add_nested is
  called, matching the documented "pd.DataFrame or NestedFrame" input.
* A column chosen as `index` is dropped from `nested_columns` as well as
  `base_columns`, since set_index removes it from df.columns.
* Docstring referred to a non-existent `nested_name` parameter; it is `name`.
* The post-image blob ids on the `index` lines were not regenerated after
  these edits; only the pre-image ids are meaningful here.

 src/nested_pandas/nestedframe/core.py         | 59 +++++++++++++++++++
 .../nestedframe/test_nestedframe.py           | 50 ++++++++++++++++
 2 files changed, 109 insertions(+)

diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
index f0169b5..4295369 100644
--- a/src/nested_pandas/nestedframe/core.py
+++ b/src/nested_pandas/nestedframe/core.py
@@ -103,6 +103,65 @@ def add_nested(
         new_df[label] = packed
         return new_df
 
+    @classmethod
+    def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nested"):
+        """Creates a NestedFrame with base and nested columns from a flat
+        dataframe.
+
+        Parameters
+        ----------
+        df: pd.DataFrame or NestedFrame
+            A flat dataframe.
+        base_columns: list-like
+            The columns that should be used as base (flat) columns in the
+            output dataframe.
+        nested_columns: list-like, or None
+            The columns that should be packed into a nested column. All columns
+            in the list will attempt to be packed into a single nested column
+            with the name provided in `name`. If None, is defined as all
+            columns not in `base_columns`.
+        index: str, or None
+            The name of a column to use as the new index. If not provided the
+            current index will be used. A column chosen as the index is
+            removed from `base_columns` and `nested_columns`.
+        name: str
+            The name of the output column the `nested_columns` are packed into.
+
+        Returns
+        -------
+        NestedFrame
+            A NestedFrame with the specified nesting structure.
+
+        Examples
+        --------
+
+        >>> nf = NestedFrame({"a":[1,1,1,2,2], "b":[2,2,2,4,4], "c":[1,2,3,4,5], "d":[2,4,6,8,10]}, index=[0,0,0,1,1])
+
+        >>> NestedFrame.from_flat(nf, base_columns=["a","b"])
+        """
+
+        # Resolve new index
+        if index is not None:
+            # the index column leaves df.columns after set_index, so drop it
+            # from any column selection that names it
+            if index in base_columns:
+                base_columns = [col for col in base_columns if col != index]
+            if nested_columns is not None and index in nested_columns:
+                nested_columns = [col for col in nested_columns if col != index]
+            df = df.set_index(index)
+
+        # drop duplicates on index
+        out_df = df[base_columns][~df.index.duplicated(keep="first")]
+
+        # a plain pd.DataFrame has no add_nested, so promote it first
+        if not isinstance(out_df, cls):
+            out_df = cls(out_df)
+
+        # add nested, defaulting to every column that is not a base column
+        if nested_columns is None:
+            nested_columns = [col for col in df.columns if col not in base_columns]
+        return out_df.add_nested(df[nested_columns], name=name)
+
     def _split_query(self, expr) -> dict:
         """Splits a pandas query into multiple subqueries for nested and base layers"""
         # Ensure query has needed spacing for upcoming split
diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py
index 7590fce..6dc52b2 100644
--- a/tests/nested_pandas/nestedframe/test_nestedframe.py
+++ b/tests/nested_pandas/nestedframe/test_nestedframe.py
@@ -145,6 +145,56 @@ def test_add_nested_for_empty_df():
     assert_frame_equal(new_base.nested.nest.to_flat(), nested.astype(pd.ArrowDtype(pa.float64())))
 
 
+@pytest.mark.parametrize("index", [None, "a", "c"])
+def test_from_flat(index):
+    """Test the NestedFrame.from_flat functionality"""
+    nf = NestedFrame(
+        {"a": [1, 1, 1, 2, 2], "b": [2, 2, 2, 4, 4], "c": [1, 2, 3, 4, 5], "d": [2, 4, 6, 8, 10]},
+        index=[0, 0, 0, 1, 1],
+    )
+
+    out_nf = NestedFrame.from_flat(nf, base_columns=["a", "b"], index=index, name="new_nested")
+
+    if index is None:
+        assert list(out_nf.columns) == ["a", "b", "new_nested"]
+        assert list(out_nf.new_nested.nest.fields) == ["c", "d"]
+        assert len(out_nf) == 2
+    elif index == "a":
+        assert list(out_nf.columns) == ["b", "new_nested"]
+        assert list(out_nf.new_nested.nest.fields) == ["c", "d"]
+        assert len(out_nf) == 2
+    elif index == "c":  # not what a user likely wants, but should still work
+        assert list(out_nf.columns) == ["a", "b", "new_nested"]
+        assert list(out_nf.new_nested.nest.fields) == ["d"]
+        assert len(out_nf) == 5
+
+
+def test_from_flat_nested_columns():
+    """Test that from_flat packs only the explicitly requested nested_columns"""
+    nf = NestedFrame(
+        {"a": [1, 1, 1, 2, 2], "b": [2, 2, 2, 4, 4], "c": [1, 2, 3, 4, 5], "d": [2, 4, 6, 8, 10]},
+        index=[0, 0, 0, 1, 1],
+    )
+
+    out_nf = NestedFrame.from_flat(nf, base_columns=["a", "b"], nested_columns=["c"])
+
+    assert list(out_nf.columns) == ["a", "b", "nested"]
+    assert list(out_nf.nested.nest.fields) == ["c"]
+    assert len(out_nf) == 2
+
+
+def test_from_flat_plain_dataframe():
+    """Test that from_flat accepts a plain pandas DataFrame as input"""
+    df = pd.DataFrame({"a": [1, 1, 2], "b": [1, 2, 3]}, index=[0, 0, 1])
+
+    out_nf = NestedFrame.from_flat(df, base_columns=["a"])
+
+    assert isinstance(out_nf, NestedFrame)
+    assert list(out_nf.columns) == ["a", "nested"]
+    assert list(out_nf.nested.nest.fields) == ["b"]
+    assert len(out_nf) == 2
+
+
 def test_query():
     """Test that NestedFrame.query handles nested queries correctly"""