From f6a21e3183d9bbb7f8510b9688bcf93aaa71396c Mon Sep 17 00:00:00 2001 From: Doug Branton Date: Tue, 21 May 2024 09:29:20 -0700 Subject: [PATCH] address review comments --- src/dask_nested/accessor.py | 2 +- src/dask_nested/backends.py | 4 +++- src/dask_nested/core.py | 24 +++++++++++++++++++++--- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/src/dask_nested/accessor.py b/src/dask_nested/accessor.py index 11082fe..ceb8fb9 100644 --- a/src/dask_nested/accessor.py +++ b/src/dask_nested/accessor.py @@ -35,7 +35,7 @@ def _check_series(series): def fields(self) -> list[str]: """Names of the nested columns""" - return self._series.head(0).nest.fields # hacky + return list(self._series.dtype.fields) def to_lists(self, fields: list[str] | None = None) -> dd.DataFrame: """Convert nested series into dataframe of list-array columns diff --git a/src/dask_nested/backends.py b/src/dask_nested/backends.py index 991d182..bd589f2 100644 --- a/src/dask_nested/backends.py +++ b/src/dask_nested/backends.py @@ -21,7 +21,9 @@ @make_meta_dispatch.register(npd.NestedFrame) def make_meta_frame(x, index=None) -> npd.NestedFrame: """Create an empty NestedFrame to use as Dask's underlying object meta.""" - result = x.head(0) + + dtypes = x.dtypes.to_dict() + result = npd.NestedFrame({key: pd.Series(dtype=d) for key, d in dtypes.items()}) return result diff --git a/src/dask_nested/core.py b/src/dask_nested/core.py index 2f3cc80..a93b3da 100644 --- a/src/dask_nested/core.py +++ b/src/dask_nested/core.py @@ -83,7 +83,9 @@ def from_nested_pandas( the size and index of the dataframe, the output may have fewer partitions than requested. chunksize: `int`, optional - Size of the individual chunks of data in non-parallel objects that make up Dask frames. + The desired number of rows per index partition to use. Note that + depending on the size and index of the dataframe, actual partition + sizes may vary. sort: `bool`, optional Whether to sort the frame by a default index. 
@@ -133,7 +135,7 @@ def nested_columns(self) -> list: nest_cols.append(column) return nest_cols - def add_nested(self, nested, name) -> NestedFrame: # type: ignore[name-defined] # noqa: F821 + def add_nested(self, nested, name, how="outer") -> NestedFrame: # type: ignore[name-defined] # noqa: F821 """Packs a dataframe into a nested column Parameters @@ -142,13 +144,29 @@ def add_nested(self, nested, name) -> NestedFrame: # type: ignore[name-defined] A flat dataframe to pack into a nested column name: The name given to the nested column + how: {'left', 'right', 'outer', 'inner', 'cross'}, default 'outer' + How to join the calling frame with the packed nested column. + + * left: use the calling frame's index. + + * right: use the index of `nested`. + + * outer: form the union of the calling frame's index with the + index of `nested`, and sort it lexicographically. + + * inner: form the intersection of the calling frame's index with + the index of `nested`, preserving the order of the calling + frame's index. + + * cross: creates the cartesian product from both frames, preserves + the order of the left keys. Returns ------- `dask_nested.NestedFrame` """ nested = nested.map_partitions(lambda x: pack_flat(x)).rename(name) - return self.join(nested, how="outer") + return self.join(nested, how=how) def query(self, expr) -> Self: # type: ignore # noqa: F821: """