diff --git a/src/nested_pandas/nestedframe/core.py b/src/nested_pandas/nestedframe/core.py
index a7ec7dc..cb743fc 100644
--- a/src/nested_pandas/nestedframe/core.py
+++ b/src/nested_pandas/nestedframe/core.py
@@ -214,6 +214,78 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest
             nested_columns = [col for col in df.columns if col not in base_columns]
         return out_df.add_nested(df[nested_columns], name=name)
 
+    @classmethod
+    def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"):
+        """Creates a NestedFrame with base and nested columns from a dataframe
+        with list-valued columns.
+
+        Parameters
+        ----------
+        df: pd.DataFrame or NestedFrame
+            A dataframe with list columns.
+        base_columns: list-like, or None
+            Any columns that have non-list values in the input df. These will
+            simply be kept as identical columns in the result. If None, is
+            defined as all columns not in `list_columns`; if `list_columns`
+            is also None, the result has no base columns.
+        list_columns: list-like, or None
+            The list-value columns that should be packed into a nested column.
+            All columns in the list will attempt to be packed into a single
+            nested column with the name provided in `name`. If None, is
+            defined as all columns not in `base_columns`.
+        name: str
+            The name of the output column the `list_columns` are packed into.
+
+        Returns
+        -------
+        NestedFrame
+            A NestedFrame with the specified nesting structure.
+
+        Raises
+        ------
+        ValueError
+            If no columns end up assigned as list columns.
+
+        Examples
+        --------
+
+        >>> nf = NestedFrame({"c":[1,2,3], "d":[2,4,6],
+        ...                   "e":[[1,2,3], [4,5,6], [7,8,9]]},
+        ...                  index=[0,1,2])
+
+        >>> res = NestedFrame.from_lists(nf, base_columns=["c","d"])
+        >>> list(res.columns)
+        ['c', 'd', 'nested']
+        """
+
+        # Resolve base and list columns
+        if list_columns is None:
+            if base_columns is None:
+                # with no inputs, assume all columns are list-valued
+                list_columns = df.columns
+            else:
+                # with defined base_columns, assume everything else is list
+                list_columns = [col for col in df.columns if col not in base_columns]
+        elif base_columns is None:
+            # if list_columns are defined, assume everything else is base
+            base_columns = [col for col in df.columns if col not in list_columns]
+
+        # len() rather than truthiness: list_columns may be a pandas Index
+        if len(list_columns) == 0:
+            raise ValueError("No columns were assigned as list columns.")
+
+        # Pack list columns into a nested column
+        packed = packer.pack_lists(df[list_columns])
+        packed.name = name
+
+        # With no base columns at all, the packed column is the whole frame
+        if base_columns is None:
+            return cls(packed.to_frame())
+
+        # Otherwise join the nested column to the base columns; wrapping in
+        # `cls` guarantees a NestedFrame even when `df` is a plain DataFrame
+        return cls(df[base_columns].join(packed))
+
     def _split_query(self, expr) -> dict:
         """Splits a pandas query into multiple subqueries for nested and base layers"""
         # Ensure query has needed spacing for upcoming split
diff --git a/tests/nested_pandas/nestedframe/test_nestedframe.py b/tests/nested_pandas/nestedframe/test_nestedframe.py
index 3dbf34b..7c4d2fc 100644
--- a/tests/nested_pandas/nestedframe/test_nestedframe.py
+++ b/tests/nested_pandas/nestedframe/test_nestedframe.py
@@ -320,6 +320,62 @@ def test_recover_from_flat():
     assert nf2.equals(nf)
 
 
+def test_from_lists():
+    """Test NestedFrame.from_lists behavior"""
+    nf = NestedFrame(
+        {"c": [1, 2, 3], "d": [2, 4, 6], "e": [[1, 2, 3], [4, 5, 6], [7, 8, 9]]}, index=[0, 1, 2]
+    )
+
+    # Test a few combinations
+    res = NestedFrame.from_lists(nf, base_columns=["c", "d"], name="nested_e")
+    assert isinstance(res, NestedFrame)
+    assert list(res.columns) == ["c", "d", "nested_e"]
+    assert list(res.nested_columns) == ["nested_e"]
+
+    res = NestedFrame.from_lists(nf, base_columns=["c", "d"], list_columns=["e"])
+    assert list(res.columns) == ["c", "d", "nested"]
+    assert list(res.nested_columns) == ["nested"]
+
+    res = NestedFrame.from_lists(nf, list_columns=["e"])
+    assert list(res.columns) == ["c", "d", "nested"]
+    assert list(res.nested_columns) == ["nested"]
+
+    # Check for the no list columns error
+    with pytest.raises(ValueError):
+        res = NestedFrame.from_lists(nf, base_columns=["c", "d", "e"])
+
+    # Multiple list columns (of uneven length)
+    nf2 = NestedFrame(
+        {
+            "c": [1, 2, 3],
+            "d": [2, 4, 6],
+            "e": [[1, 2, 3], [4, 5, 6, 7], [8, 9]],
+            "f": [[10, 20, 30], [40, 50, 60, 70], [80, 90]],
+        },
+        index=[0, 1, 2],
+    )
+
+    res = NestedFrame.from_lists(nf2, list_columns=["e", "f"])
+    assert list(res.columns) == ["c", "d", "nested"]
+    assert list(res.nested_columns) == ["nested"]
+    assert list(res.nested.nest.fields) == ["e", "f"]
+
+    # Check for subsetting
+    res = NestedFrame.from_lists(nf, base_columns=["c"], list_columns=["e"])
+    assert list(res.columns) == ["c", "nested"]
+    assert list(res.nested_columns) == ["nested"]
+
+    res = NestedFrame.from_lists(nf, base_columns=[], list_columns=["e"])
+    assert isinstance(res, NestedFrame)
+    assert list(res.columns) == ["nested"]
+    assert list(res.nested_columns) == ["nested"]
+
+    res = NestedFrame.from_lists(nf[["e"]], base_columns=None, list_columns=None)
+    assert isinstance(res, NestedFrame)
+    assert list(res.columns) == ["nested"]
+    assert list(res.nested_columns) == ["nested"]
+
+
 def test_query():
     """Test that NestedFrame.query handles nested queries correctly"""
 