Merge pull request #168 from lincc-frameworks/non-uniq-idx
Handle non-unique index
hombit authored Nov 6, 2024
2 parents 1a2a1d2 + b652f7a commit 96503cd
Showing 6 changed files with 313 additions and 54 deletions.
95 changes: 56 additions & 39 deletions src/nested_pandas/nestedframe/core.py
@@ -12,11 +12,9 @@
from pandas.api.extensions import no_default
from pandas.core.computation.expr import PARSERS, PandasExprVisitor

from nested_pandas.series import packer
from nested_pandas.nestedframe.utils import extract_nest_names
from nested_pandas.series.dtype import NestedDtype

from ..series.packer import pack_sorted_df_into_struct
from .utils import extract_nest_names
from nested_pandas.series.packer import pack, pack_lists, pack_sorted_df_into_struct


class NestedPandasExprVisitor(PandasExprVisitor):
@@ -219,10 +217,8 @@ def __setitem__(self, key, value):
"." in key and key.split(".")[0] in self.nested_columns
):
nested, col = key.split(".")
new_flat = self[nested].nest.to_flat()
new_flat[col] = value
packed = packer.pack(new_flat)
return super().__setitem__(nested, packed)
new_nested_series = self[nested].nest.with_flat_field(col, value)
return super().__setitem__(nested, new_nested_series)

# Adding a new nested structure from a column
# Allows statements like ndf["new_nested.t"] = ndf["nested.t"] - 5
@@ -231,8 +227,9 @@ def __setitem__(self, key, value):
if isinstance(value, pd.Series):
value.name = col
value = value.to_frame()
packed = packer.pack(value)
return super().__setitem__(new_nested, packed)
new_df = self.add_nested(value, name=new_nested)
self._update_inplace(new_df)
return None

return super().__setitem__(key, value)
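A minimal sketch of the two assignment paths this hunk rewires, reusing the ndf / nested.t names from the comment above:

# Overwrite a field of an existing nested column: now routed through
# Series.nest.with_flat_field instead of a manual to_flat()/pack() round trip.
ndf["nested.t"] = ndf["nested.t"] - 5

# Create a new nested column from a flat series: now delegated to add_nested,
# so it follows the same packing and index-alignment rules.
ndf["new_nested.t"] = ndf["nested.t"] - 5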

@@ -242,6 +239,7 @@ def add_nested(
name: str,
*,
how: str = "left",
on: None | str | list[str] = None,
dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
) -> Self: # type: ignore[name-defined] # noqa: F821
"""Packs input object to a nested column and adds it to the NestedFrame
@@ -272,6 +270,8 @@
index, and sort it lexicographically.
- inner: form intersection of calling frame's index with other
frame's index, preserving the order of the calling index.
on : str, default: None
A column to join on, in place of the index. The input object is packed
by this column and the packed result is joined back to the calling frame
on the same column. Currently only a single column is supported.
dtype : dtype or None
NestedDtype to use for the nested column; pd.ArrowDtype or
pa.DataType can also be used to specify the nested dtype. If None,
@@ -282,13 +282,16 @@
NestedFrame
A new NestedFrame with the added nested column.
"""
if on is not None and not isinstance(on, str):
raise ValueError("Currently we only support a single column for 'on'")
# Add sources to objects
packed = packer.pack(obj, name=name, dtype=dtype)
packed = pack(obj, name=name, on=on, dtype=dtype)
new_df = self.copy()
return new_df.join(packed, how=how)
res = new_df.join(packed, how=how, on=on)
return res
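A hedged usage sketch of the new on argument; the frames and the id / a / t / flux column names are illustrative, not taken from this diff:

import pandas as pd

from nested_pandas import NestedFrame

base = NestedFrame({"id": [1, 2], "a": [0.1, 0.2]})
flat = pd.DataFrame({"id": [1, 1, 2], "t": [5.0, 6.0, 7.0], "flux": [0.3, 0.4, 0.5]})

# Pack `flat` grouped by its "id" column (rather than by its index) and join
# the packed "nested" column back onto `base` via that same column.
ndf = base.add_nested(flat, "nested", on="id")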

@classmethod
def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nested"):
def from_flat(cls, df, base_columns, nested_columns=None, on: str | None = None, name="nested"):
"""Creates a NestedFrame with base and nested columns from a flat
dataframe.
@@ -304,7 +307,7 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest
in the list will attempt to be packed into a single nested column
with the name provided in `nested_name`. If None, is defined as all
columns not in `base_columns`.
index: str, or None
on: str or None
The name of a column to use as the new index. Typically, the index
should have a unique value per row for base columns, and should
repeat for nested columns. For example, a dataframe with two
@@ -330,11 +333,11 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest
"""

# Resolve new index
if index is not None:
if on is not None:
# if a base column is chosen remove it
if index in base_columns:
base_columns = [col for col in base_columns if col != index]
df = df.set_index(index)
if on in base_columns:
base_columns = [col for col in base_columns if col != on]
df = df.set_index(on)

# drop duplicates on index
out_df = df[base_columns][~df.index.duplicated(keep="first")]
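A small usage sketch of from_flat with the renamed on argument; the id / a / t columns are invented for illustration:

import pandas as pd

from nested_pandas import NestedFrame

flat = pd.DataFrame({"id": [1, 1, 2, 2], "a": [10, 10, 20, 20], "t": [1.0, 2.0, 3.0, 4.0]})

# "id" becomes the index, "a" stays in the base layer, and the remaining
# column "t" is packed into the nested column named "nested".
ndf = NestedFrame.from_flat(flat, base_columns=["a"], on="id", name="nested")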
@@ -401,7 +404,7 @@ def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"):
raise ValueError("No columns were assigned as list columns.")

# Pack list columns into a nested column
packed_df = packer.pack_lists(df[list_columns])
packed_df = pack_lists(df[list_columns])
packed_df.name = name

# join the nested column to the base_column df
@@ -519,17 +522,33 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame |
# since it operated on the base attributes.
if isinstance(result, _SeriesFromNest):
nest_name, flat_nest = result.nest_name, result.flat_nest
new_flat_nest = flat_nest.loc[result]
result = self.copy()
result[nest_name] = pack_sorted_df_into_struct(new_flat_nest)
# Reset index to "ordinal" like [0, 0, 0, 1, 1, 2, 2, 2]
list_index = self[nest_name].array.get_list_index()
flat_nest = flat_nest.set_index(list_index)
query_result = result.set_axis(list_index)
# Selecting flat values matching the query result
new_flat_nest = flat_nest[query_result]
new_df = self._set_filtered_flat_df(nest_name, new_flat_nest)
else:
result = self.loc[result]
new_df = self.loc[result]

if inplace:
self._update_inplace(result)
self._update_inplace(new_df)
return None
else:
return result
return new_df

def _set_filtered_flat_df(self, nest_name, flat_df):
"""Set a filtered flat dataframe for a nested column
Here we assume that flat_df has a filtered "ordinal" index,
e.g. flat_df.index == [0, 2, 2, 2], while self.index
is arbitrary (e.g. ["a", "b", "a"]),
and self[nest_name].array.get_list_index() is [0, 0, 1, 1, 1, 2, 2, 2, 2].
"""
new_df = self.reset_index(drop=True)
new_df[nest_name] = pack_sorted_df_into_struct(flat_df, name=nest_name)
return new_df.set_index(self.index)
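A hedged sketch of what the ordinal list index buys: nested-level queries now work when the base index has repeated labels. All names below are invented:

import pandas as pd

from nested_pandas import NestedFrame

base = NestedFrame({"a": [1, 2, 3]}, index=["x", "y", "x"])  # non-unique index
flat = pd.DataFrame(
    {"t": [1.0, 6.0, 2.0, 7.0, 3.0, 8.0]},
    index=["x", "x", "y", "y", "x", "x"],
)
ndf = base.add_nested(flat, "nested")

# The boolean result of the nested expression is mapped back to the flat rows
# through the ordinal list index, so repeated index labels no longer break
# the selection.
filtered = ndf.query("nested.t > 5")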

def _resolve_dropna_target(self, on_nested, subset):
"""resolves the target layer for a given set of dropna kwargs"""
@@ -654,34 +673,32 @@ def dropna(
return super().dropna(
axis=axis, how=how, thresh=thresh, subset=subset, inplace=inplace, ignore_index=ignore_index
)
if ignore_index:
raise ValueError("ignore_index is not supported for nested columns")
if subset is not None:
subset = [col.split(".")[-1] for col in subset]
target_flat = self[target].nest.to_flat()
target_flat = target_flat.set_index(self[target].array.get_list_index())
if inplace:
target_flat = self[target].nest.to_flat()
target_flat.dropna(
axis=axis,
how=how,
thresh=thresh,
subset=subset,
inplace=inplace,
ignore_index=ignore_index,
inplace=True,
)
self[target] = packer.pack_flat(target_flat)
return self
# Or if not inplace
new_df = self.copy()
new_df[target] = packer.pack_flat(
new_df[target]
.nest.to_flat()
.dropna(
else:
target_flat = target_flat.dropna(
axis=axis,
how=how,
thresh=thresh,
subset=subset,
inplace=inplace,
ignore_index=ignore_index,
inplace=False,
)
)
new_df = self._set_filtered_flat_df(nest_name=target, flat_df=target_flat)
if inplace:
self._update_inplace(new_df)
return None
return new_df
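A short usage sketch of nested dropna after this rewrite; the frame and the NaN placement are hypothetical:

import numpy as np
import pandas as pd

from nested_pandas import NestedFrame

base = NestedFrame({"a": [1, 2]}, index=["x", "x"])  # repeated labels are fine
flat = pd.DataFrame({"flux": [0.1, np.nan, 0.3]}, index=["x", "x", "x"])
ndf = base.add_nested(flat, "nested")

# Drops only the flat rows whose nested.flux is NaN; the base rows and the
# (possibly non-unique) base index are preserved.
cleaned = ndf.dropna(subset=["nested.flux"])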

def reduce(self, func, *args, **kwargs) -> NestedFrame: # type: ignore[override]
8 changes: 8 additions & 0 deletions src/nested_pandas/series/ext_array.py
@@ -648,6 +648,14 @@ def num_chunks(self) -> int:
"""Number of chunks in underlying pyarrow.ChunkedArray"""
return self._chunked_array.num_chunks

def get_list_index(self) -> np.ndarray:
"""Keys mapping values to lists"""
if len(self) == 0:
# Since we have no list offsets, return an empty array
return np.array([], dtype=int)
list_index = np.arange(len(self))
return np.repeat(list_index, np.diff(self.list_offsets))
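A tiny numpy sketch of what this helper computes; the offsets are made up:

import numpy as np

list_offsets = np.array([0, 3, 5, 9])  # three lists, of lengths 3, 2 and 4
list_index = np.repeat(np.arange(len(list_offsets) - 1), np.diff(list_offsets))
# list_index is now [0, 0, 0, 1, 1, 2, 2, 2, 2]: one entry per flat value,
# giving the ordinal number of the list (row) it belongs to.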

def iter_field_lists(self, field: str) -> Generator[np.ndarray, None, None]:
"""Iterate over single field nested lists, as numpy arrays
15 changes: 11 additions & 4 deletions src/nested_pandas/series/packer.py
@@ -27,6 +27,7 @@ def pack(
name: str | None = None,
*,
index=None,
on: None | str | list[str] = None,
dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
) -> pd.Series:
"""Pack a "flat" dataframe or a sequence of dataframes into a "nested" series.
@@ -40,6 +41,8 @@
index : convertible to pd.Index, optional
Index of the output series. If obj is a pd.DataFrame, it is always nested by the original index,
and this value is used to override the index after the nesting.
on: str or list of str, optional
Column name(s) to join on. If None, the index is used.
dtype : dtype or None
NestedDtype of the output series, or other type to derive from. If None,
the dtype is inferred from the first non-missing dataframe.
@@ -50,14 +53,14 @@
Output series.
"""
if isinstance(obj, pd.DataFrame):
nested = pack_flat(obj, name=name)
nested = pack_flat(obj, name=name, on=on)
if index is not None:
nested.index = index
return nested
return pack_seq(obj, name=name, index=index, dtype=dtype)


def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
def pack_flat(df: pd.DataFrame, name: str | None = None, *, on: None | str | list[str] = None) -> pd.Series:
"""Make a structure of lists representation of a "flat" dataframe.
For the input dataframe with repeated indexes, make a pandas.Series,
@@ -73,6 +76,8 @@ def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
Input dataframe, with repeated indexes.
name : str, optional
Name of the pd.Series.
on : str or list of str, optional
Column name(s) to join on. If None, the df's index is used.
Returns
-------
@@ -86,9 +91,11 @@ def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
nested_pandas.series.packer.pack_lists : Pack a dataframe of nested arrays.
"""

if on is not None:
df = df.set_index(on)
# pandas knows when index is pre-sorted, so it would do nothing if it is already sorted
flat = df.sort_index(kind="stable")
return pack_sorted_df_into_struct(flat, name=name)
sorted_flat = df.sort_index(kind="stable")
return pack_sorted_df_into_struct(sorted_flat, name=name)
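A hedged sketch of pack_flat with the new on argument; the id / flux column names are invented:

import pandas as pd

from nested_pandas.series.packer import pack_flat

flat = pd.DataFrame({"id": [2, 1, 2], "flux": [0.1, 0.2, 0.3]})

# Rows are grouped by the "id" column instead of the index: the result has one
# element per unique id, each holding that id's rows packed into lists.
packed = pack_flat(flat, name="nested", on="id")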


def pack_seq(
(diffs for the remaining 3 changed files are not shown)
