Merge pull request #168 from lincc-frameworks/non-uniq-idx
Handle non-unique index
hombit authored Nov 6, 2024
2 parents 1a2a1d2 + b652f7a commit 96503cd
Showing 6 changed files with 313 additions and 54 deletions.
95 changes: 56 additions & 39 deletions src/nested_pandas/nestedframe/core.py
@@ -12,11 +12,9 @@
from pandas.api.extensions import no_default
from pandas.core.computation.expr import PARSERS, PandasExprVisitor

from nested_pandas.series import packer
from nested_pandas.nestedframe.utils import extract_nest_names
from nested_pandas.series.dtype import NestedDtype

from ..series.packer import pack_sorted_df_into_struct
from .utils import extract_nest_names
from nested_pandas.series.packer import pack, pack_lists, pack_sorted_df_into_struct


class NestedPandasExprVisitor(PandasExprVisitor):
@@ -219,10 +217,8 @@ def __setitem__(self, key, value):
"." in key and key.split(".")[0] in self.nested_columns
):
nested, col = key.split(".")
new_flat = self[nested].nest.to_flat()
new_flat[col] = value
packed = packer.pack(new_flat)
return super().__setitem__(nested, packed)
new_nested_series = self[nested].nest.with_flat_field(col, value)
return super().__setitem__(nested, new_nested_series)

# Adding a new nested structure from a column
# Allows statements like ndf["new_nested.t"] = ndf["nested.t"] - 5
@@ -231,8 +227,9 @@ def __setitem__(self, key, value):
if isinstance(value, pd.Series):
value.name = col
value = value.to_frame()
packed = packer.pack(value)
return super().__setitem__(new_nested, packed)
new_df = self.add_nested(value, name=new_nested)
self._update_inplace(new_df)
return None

return super().__setitem__(key, value)
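A minimal sketch of the two assignment paths this hunk rewires, reusing the ndf / nested.t names from the comment above:

# Overwrite a field of an existing nested column: now routed through
# Series.nest.with_flat_field instead of a manual to_flat()/pack() round trip.
ndf["nested.t"] = ndf["nested.t"] - 5

# Create a new nested column from a flat series: now delegated to add_nested,
# so it follows the same packing and index-alignment rules.
ndf["new_nested.t"] = ndf["nested.t"] - 5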

@@ -242,6 +239,7 @@ def add_nested(
name: str,
*,
how: str = "left",
on: None | str | list[str] = None,
dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
) -> Self: # type: ignore[name-defined] # noqa: F821
"""Packs input object to a nested column and adds it to the NestedFrame
@@ -272,6 +270,8 @@
index, and sort it lexicographically.
- inner: form intersection of calling frame's index with other
frame's index, preserving the order of the calling index.
on : str, default: None
A column to join on, in place of the index. The input object is packed
by this column and the packed result is joined back to the calling frame
on the same column. Currently only a single column is supported.
dtype : dtype or None
NestedDtype to use for the nested column; pd.ArrowDtype or
pa.DataType can also be used to specify the nested dtype. If None,
@@ -282,13 +282,16 @@
NestedFrame
A new NestedFrame with the added nested column.
"""
if on is not None and not isinstance(on, str):
raise ValueError("Currently we only support a single column for 'on'")
# Add sources to objects
packed = packer.pack(obj, name=name, dtype=dtype)
packed = pack(obj, name=name, on=on, dtype=dtype)
new_df = self.copy()
return new_df.join(packed, how=how)
res = new_df.join(packed, how=how, on=on)
return res
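A hedged usage sketch of the new on argument; the frames and the id / a / t / flux column names are illustrative, not taken from this diff:

import pandas as pd

from nested_pandas import NestedFrame

base = NestedFrame({"id": [1, 2], "a": [0.1, 0.2]})
flat = pd.DataFrame({"id": [1, 1, 2], "t": [5.0, 6.0, 7.0], "flux": [0.3, 0.4, 0.5]})

# Pack `flat` grouped by its "id" column (rather than by its index) and join
# the packed "nested" column back onto `base` via that same column.
ndf = base.add_nested(flat, "nested", on="id")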

@classmethod
def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nested"):
def from_flat(cls, df, base_columns, nested_columns=None, on: str | None = None, name="nested"):
"""Creates a NestedFrame with base and nested columns from a flat
dataframe.
@@ -304,7 +307,7 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest
in the list will attempt to be packed into a single nested column
with the name provided in `nested_name`. If None, is defined as all
columns not in `base_columns`.
index: str, or None
on: str or None
The name of a column to use as the new index. Typically, the index
should have a unique value per row for base columns, and should
repeat for nested columns. For example, a dataframe with two
@@ -330,11 +333,11 @@ def from_flat(cls, df, base_columns, nested_columns=None, index=None, name="nest
"""

# Resolve new index
if index is not None:
if on is not None:
# if a base column is chosen remove it
if index in base_columns:
base_columns = [col for col in base_columns if col != index]
df = df.set_index(index)
if on in base_columns:
base_columns = [col for col in base_columns if col != on]
df = df.set_index(on)

# drop duplicates on index
out_df = df[base_columns][~df.index.duplicated(keep="first")]
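A small usage sketch of from_flat with the renamed on argument; the id / a / t columns are invented for illustration:

import pandas as pd

from nested_pandas import NestedFrame

flat = pd.DataFrame({"id": [1, 1, 2, 2], "a": [10, 10, 20, 20], "t": [1.0, 2.0, 3.0, 4.0]})

# "id" becomes the index, "a" stays in the base layer, and the remaining
# column "t" is packed into the nested column named "nested".
ndf = NestedFrame.from_flat(flat, base_columns=["a"], on="id", name="nested")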
@@ -401,7 +404,7 @@ def from_lists(cls, df, base_columns=None, list_columns=None, name="nested"):
raise ValueError("No columns were assigned as list columns.")

# Pack list columns into a nested column
packed_df = packer.pack_lists(df[list_columns])
packed_df = pack_lists(df[list_columns])
packed_df.name = name

# join the nested column to the base_column df
@@ -519,17 +522,33 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> NestedFrame |
# since it operated on the base attributes.
if isinstance(result, _SeriesFromNest):
nest_name, flat_nest = result.nest_name, result.flat_nest
new_flat_nest = flat_nest.loc[result]
result = self.copy()
result[nest_name] = pack_sorted_df_into_struct(new_flat_nest)
# Reset index to "ordinal" like [0, 0, 0, 1, 1, 2, 2, 2]
list_index = self[nest_name].array.get_list_index()
flat_nest = flat_nest.set_index(list_index)
query_result = result.set_axis(list_index)
# Selecting flat values matching the query result
new_flat_nest = flat_nest[query_result]
new_df = self._set_filtered_flat_df(nest_name, new_flat_nest)
else:
result = self.loc[result]
new_df = self.loc[result]

if inplace:
self._update_inplace(result)
self._update_inplace(new_df)
return None
else:
return result
return new_df

def _set_filtered_flat_df(self, nest_name, flat_df):
"""Set a filtered flat dataframe for a nested column
Here we assume that flat_df has a filtered "ordinal" index,
e.g. flat_df.index == [0, 2, 2, 2], while self.index
is arbitrary (e.g. ["a", "b", "a"]),
and self[nest_name].array.get_list_index() is [0, 0, 1, 1, 1, 2, 2, 2, 2].
"""
new_df = self.reset_index(drop=True)
new_df[nest_name] = pack_sorted_df_into_struct(flat_df, name=nest_name)
return new_df.set_index(self.index)
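A hedged sketch of what the ordinal list index buys: nested-level queries now work when the base index has repeated labels. All names below are invented:

import pandas as pd

from nested_pandas import NestedFrame

base = NestedFrame({"a": [1, 2, 3]}, index=["x", "y", "x"])  # non-unique index
flat = pd.DataFrame(
    {"t": [1.0, 6.0, 2.0, 7.0, 3.0, 8.0]},
    index=["x", "x", "y", "y", "x", "x"],
)
ndf = base.add_nested(flat, "nested")

# The boolean result of the nested expression is mapped back to the flat rows
# through the ordinal list index, so repeated index labels no longer break
# the selection.
filtered = ndf.query("nested.t > 5")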

def _resolve_dropna_target(self, on_nested, subset):
"""resolves the target layer for a given set of dropna kwargs"""
@@ -654,34 +673,32 @@ def dropna(
return super().dropna(
axis=axis, how=how, thresh=thresh, subset=subset, inplace=inplace, ignore_index=ignore_index
)
if ignore_index:
raise ValueError("ignore_index is not supported for nested columns")
if subset is not None:
subset = [col.split(".")[-1] for col in subset]
target_flat = self[target].nest.to_flat()
target_flat = target_flat.set_index(self[target].array.get_list_index())
if inplace:
target_flat = self[target].nest.to_flat()
target_flat.dropna(
axis=axis,
how=how,
thresh=thresh,
subset=subset,
inplace=inplace,
ignore_index=ignore_index,
inplace=True,
)
self[target] = packer.pack_flat(target_flat)
return self
# Or if not inplace
new_df = self.copy()
new_df[target] = packer.pack_flat(
new_df[target]
.nest.to_flat()
.dropna(
else:
target_flat = target_flat.dropna(
axis=axis,
how=how,
thresh=thresh,
subset=subset,
inplace=inplace,
ignore_index=ignore_index,
inplace=False,
)
)
new_df = self._set_filtered_flat_df(nest_name=target, flat_df=target_flat)
if inplace:
self._update_inplace(new_df)
return None
return new_df
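A short usage sketch of nested dropna after this rewrite; the frame and the NaN placement are hypothetical:

import numpy as np
import pandas as pd

from nested_pandas import NestedFrame

base = NestedFrame({"a": [1, 2]}, index=["x", "x"])  # repeated labels are fine
flat = pd.DataFrame({"flux": [0.1, np.nan, 0.3]}, index=["x", "x", "x"])
ndf = base.add_nested(flat, "nested")

# Drops only the flat rows whose nested.flux is NaN; the base rows and the
# (possibly non-unique) base index are preserved.
cleaned = ndf.dropna(subset=["nested.flux"])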

def reduce(self, func, *args, **kwargs) -> NestedFrame: # type: ignore[override]
8 changes: 8 additions & 0 deletions src/nested_pandas/series/ext_array.py
@@ -648,6 +648,14 @@ def num_chunks(self) -> int:
"""Number of chunks in underlying pyarrow.ChunkedArray"""
return self._chunked_array.num_chunks

def get_list_index(self) -> np.ndarray:
"""Keys mapping values to lists"""
if len(self) == 0:
# Since we have no list offsets, return an empty array
return np.array([], dtype=int)
list_index = np.arange(len(self))
return np.repeat(list_index, np.diff(self.list_offsets))
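A tiny numpy sketch of what this helper computes; the offsets are made up:

import numpy as np

list_offsets = np.array([0, 3, 5, 9])  # three lists, of lengths 3, 2 and 4
list_index = np.repeat(np.arange(len(list_offsets) - 1), np.diff(list_offsets))
# list_index is now [0, 0, 0, 1, 1, 2, 2, 2, 2]: one entry per flat value,
# giving the ordinal number of the list (row) it belongs to.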

def iter_field_lists(self, field: str) -> Generator[np.ndarray, None, None]:
"""Iterate over single field nested lists, as numpy arrays
15 changes: 11 additions & 4 deletions src/nested_pandas/series/packer.py
@@ -27,6 +27,7 @@ def pack(
name: str | None = None,
*,
index=None,
on: None | str | list[str] = None,
dtype: NestedDtype | pd.ArrowDtype | pa.DataType | None = None,
) -> pd.Series:
"""Pack a "flat" dataframe or a sequence of dataframes into a "nested" series.
@@ -40,6 +41,8 @@
index : convertible to pd.Index, optional
Index of the output series. If obj is a pd.DataFrame, it is always nested by the original index,
and this value is used to override the index after the nesting.
on: str or list of str, optional
Column name(s) to join on. If None, the index is used.
dtype : dtype or None
NestedDtype of the output series, or other type to derive from. If None,
the dtype is inferred from the first non-missing dataframe.
@@ -50,14 +53,14 @@
Output series.
"""
if isinstance(obj, pd.DataFrame):
nested = pack_flat(obj, name=name)
nested = pack_flat(obj, name=name, on=on)
if index is not None:
nested.index = index
return nested
return pack_seq(obj, name=name, index=index, dtype=dtype)


def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
def pack_flat(df: pd.DataFrame, name: str | None = None, *, on: None | str | list[str] = None) -> pd.Series:
"""Make a structure of lists representation of a "flat" dataframe.
For the input dataframe with repeated indexes, make a pandas.Series,
@@ -73,6 +76,8 @@ def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
Input dataframe, with repeated indexes.
name : str, optional
Name of the pd.Series.
on : str or list of str, optional
Column name(s) to join on. If None, the df's index is used.
Returns
-------
@@ -86,9 +91,11 @@ def pack_flat(df: pd.DataFrame, name: str | None = None) -> pd.Series:
nested_pandas.series.packer.pack_lists : Pack a dataframe of nested arrays.
"""

if on is not None:
df = df.set_index(on)
# pandas knows when index is pre-sorted, so it would do nothing if it is already sorted
flat = df.sort_index(kind="stable")
return pack_sorted_df_into_struct(flat, name=name)
sorted_flat = df.sort_index(kind="stable")
return pack_sorted_df_into_struct(sorted_flat, name=name)
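A hedged sketch of pack_flat with the new on argument; the id / flux column names are invented:

import pandas as pd

from nested_pandas.series.packer import pack_flat

flat = pd.DataFrame({"id": [2, 1, 2], "flux": [0.1, 0.2, 0.3]})

# Rows are grouped by the "id" column instead of the index: the result has one
# element per unique id, each holding that id's rows packed into lists.
packed = pack_flat(flat, name="nested", on="id")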


def pack_seq(
(diffs for the remaining 3 changed files are not shown)
