diff --git a/docs/tutorials/low_level.ipynb b/docs/tutorials/low_level.ipynb index 7672662..307366c 100644 --- a/docs/tutorials/low_level.ipynb +++ b/docs/tutorials/low_level.ipynb @@ -17,12 +17,7 @@ "cell_type": "code", "execution_count": null, "id": "619f088e7ac0f327", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.641800Z", - "start_time": "2024-05-09T12:43:47.634903Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", @@ -49,12 +44,7 @@ "cell_type": "code", "execution_count": null, "id": "f9dd16a4bb9aaa63", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.708715Z", - "start_time": "2024-05-09T12:43:47.700005Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "nested_df = generate_data(4, 3, seed=42)\n", @@ -83,20 +73,17 @@ "source": [ "### `.nest` object is a mapping\n", "\n", - "`.nest` accessor provides an object implementing `Mapping` interface, so you can use it like a dictionary.\n", - "Keys of this mapping are the names of the nested columns (fields), and values are \"flat\" Series representing the nested data." + "`.nest` accessor provides an object implementing `Mapping` interface, so you can use it like an immutable dictionary.\n", + "Keys of this mapping are the names of the nested columns (fields), and values are \"flat\" Series representing the nested data.\n", + "\n", + "The only way to modify the nested data in-place with this interface is to re-assign the whole field with a new data of the same length and dtype, see the discussion about the mutability limitations in [this GitHub issue](https://github.com/lincc-frameworks/nested-pandas/issues/87)." 
] }, { "cell_type": "code", "execution_count": null, "id": "fb7beb750d3e2893", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.711893Z", - "start_time": "2024-05-09T12:43:47.709614Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "list(nested_series.nest.keys())" @@ -114,12 +101,7 @@ "cell_type": "code", "execution_count": null, "id": "56b0d9ffc5820d22", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.714235Z", - "start_time": "2024-05-09T12:43:47.712499Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "nested_series.nest.fields" @@ -137,12 +119,7 @@ "cell_type": "code", "execution_count": null, "id": "30ee9a430b6ff641", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.717863Z", - "start_time": "2024-05-09T12:43:47.715368Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "nested_series.nest[\"t\"]" @@ -160,12 +137,7 @@ "cell_type": "code", "execution_count": null, "id": "f0db15d31b289140", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.720405Z", - "start_time": "2024-05-09T12:43:47.718626Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "nested_series.nest[[\"t\", \"flux\"]].dtype" @@ -177,7 +149,9 @@ "metadata": {}, "source": [ "You can add new columns, drop existing ones, or modify the existing ones.\n", - "The modification is currently limited to the case when you replace the whole \"flat\" Series with a new one of the same length.\n", + "These operations would create new nested Series, however they would create shallow copies of the rest of the fields, so they are quite efficient.\n", + "\n", + "The in-place modification is currently limited to the case when you replace the whole \"flat\" Series with a new one of the same length and compatible dtype.\n", "When modifying the nested data, only the column you are working with is changed, the rest of the data are not affected and not copied." 
] }, @@ -185,12 +159,7 @@ "cell_type": "code", "execution_count": null, "id": "66ae5cc26fa17458", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.726619Z", - "start_time": "2024-05-09T12:43:47.721070Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "new_series = nested_series.copy()\n", @@ -198,14 +167,20 @@ "# Change the data in-place\n", "new_series.nest[\"flux\"] = new_series.nest[\"flux\"] - new_series.nest[\"flux\"].mean()\n", "\n", - "# Add new column\n", - "new_series.nest[\"lsst_band\"] = \"lsst_\" + new_series.nest[\"band\"]\n", + "# Create a new series with a new column\n", + "new_series = new_series.nest.with_field(\"lsst_band\", \"lsst_\" + new_series.nest[\"band\"])\n", "\n", - "# Drop the column, .pop() method is also available\n", - "del new_series.nest[\"band\"]\n", + "# Create a new series with a column removed, you can also pass a list of columns to remove\n", + "new_series = new_series.nest.without_field(\"band\")\n", "\n", "# Add a new column with a python list instead of a Series\n", - "new_series.nest[\"new_column\"] = [1, 2] * (new_series.nest.flat_length // 2)\n", + "new_series = new_series.nest.with_field(\n", + " \"new_column\",\n", + " [1, 2] * (new_series.nest.flat_length // 2),\n", + ")\n", + "\n", + "# Create a new series, with a column dtype changed\n", + "new_series = new_series.nest.with_field(\"t\", new_series.nest[\"t\"].astype(np.int8))\n", "\n", "new_series.nest.to_flat()" ] @@ -228,12 +203,7 @@ "cell_type": "code", "execution_count": null, "id": "ce6d519d8d37ead3", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.768616Z", - "start_time": "2024-05-09T12:43:47.764343Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "nested_series.nest.to_flat([\"flux\", \"t\"])" @@ -243,12 +213,7 @@ "cell_type": "code", "execution_count": null, "id": "2421b91387487995", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.798697Z", - "start_time": 
"2024-05-09T12:43:47.795583Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "lists_df = nested_series.nest.to_lists() # may also accept a list of fields (nested columns) to get\n", @@ -267,19 +232,12 @@ "cell_type": "code", "execution_count": null, "id": "f2c205e95affb9ba", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.833034Z", - "start_time": "2024-05-09T12:43:47.827805Z" - } - }, + "metadata": {}, "outputs": [], "source": [ - "new_series = nested_series.copy()\n", - "\n", "# Adjust each time to be relative to the first observation\n", "dt = new_series.nest.to_lists()[\"t\"].apply(lambda t: t - t.min())\n", - "new_series.nest.set_list_field(\"dt\", dt)\n", + "new_series = new_series.nest.with_list_field(\"dt\", dt)\n", "new_series.nest.to_flat()" ] }, @@ -313,12 +271,7 @@ "cell_type": "code", "execution_count": null, "id": "8ef96243c6d74aff", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.875752Z", - "start_time": "2024-05-09T12:43:47.872293Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "struct_series = pd.Series(nested_series, dtype=nested_series.dtype.to_pandas_arrow_dtype())\n", @@ -329,12 +282,7 @@ "cell_type": "code", "execution_count": null, "id": "422e719861ae40f6", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.925465Z", - "start_time": "2024-05-09T12:43:47.922965Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "nested_series.equals(pd.Series(struct_series, dtype=NestedDtype.from_pandas_arrow_dtype(struct_series.dtype)))" @@ -364,12 +312,7 @@ "cell_type": "code", "execution_count": null, "id": "926f2c9fcffc5f03", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.937490Z", - "start_time": "2024-05-09T12:43:47.933878Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "new_series = pack(nested_series.nest.to_flat())\n", @@ -380,12 +323,7 @@ "cell_type": "code", "execution_count": null, "id": "3a1d2025c232ac82", - "metadata": { - 
"ExecuteTime": { - "end_time": "2024-05-09T12:43:47.969831Z", - "start_time": "2024-05-09T12:43:47.964948Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "series_from_flat = pack(\n", @@ -422,12 +360,7 @@ "cell_type": "code", "execution_count": null, "id": "2de4619726ab3d5c", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.991261Z", - "start_time": "2024-05-09T12:43:47.986129Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "series_from_pack = pack(\n", @@ -454,12 +387,7 @@ "cell_type": "code", "execution_count": null, "id": "9c63ae45dd0b6a29", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:47.995869Z", - "start_time": "2024-05-09T12:43:47.992016Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "series_from_pack = pack(\n", @@ -500,12 +428,7 @@ "cell_type": "code", "execution_count": null, "id": "1284d9b536b9e784", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:48.000441Z", - "start_time": "2024-05-09T12:43:47.996620Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "series_from_dtype = pd.Series(\n", @@ -531,12 +454,7 @@ "cell_type": "code", "execution_count": null, "id": "b7c7fd878bc97f68", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:48.004677Z", - "start_time": "2024-05-09T12:43:48.001129Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "series_pa_type = pa.struct({\"t\": pa.list_(pa.float64()), \"band\": pa.list_(pa.string())})\n", @@ -568,12 +486,7 @@ "cell_type": "code", "execution_count": null, "id": "e837d25dcb0a2b4d", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:48.015257Z", - "start_time": "2024-05-09T12:43:48.013217Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "pa_struct_array = pa.StructArray.from_arrays(\n", @@ -611,12 +524,7 @@ "cell_type": "code", "execution_count": null, "id": "116c902ea8681c9e", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:48.040801Z", - "start_time": 
"2024-05-09T12:43:48.038106Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "# Convert to pd.ArrowDtype Series of struct-arrays\n", @@ -641,12 +549,7 @@ "cell_type": "code", "execution_count": null, "id": "30ea40dee30795d1", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:48.055678Z", - "start_time": "2024-05-09T12:43:48.050677Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "for element in nested_series:\n", @@ -665,12 +568,7 @@ "cell_type": "code", "execution_count": null, "id": "81f6c1f98dfc26a9", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:48.060166Z", - "start_time": "2024-05-09T12:43:48.056425Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "nested_elements = list(nested_series)\n", @@ -689,12 +587,7 @@ "cell_type": "code", "execution_count": null, "id": "69ed758c48c55015", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:48.063115Z", - "start_time": "2024-05-09T12:43:48.060863Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "nested_series_with_na = pack([None, pd.NA, {\"t\": [1, 2], \"flux\": [0.1, None]}])\n", @@ -707,12 +600,7 @@ "cell_type": "code", "execution_count": null, "id": "99ce9d18bc69ae49", - "metadata": { - "ExecuteTime": { - "end_time": "2024-05-09T12:43:48.088986Z", - "start_time": "2024-05-09T12:43:48.086255Z" - } - }, + "metadata": {}, "outputs": [], "source": [ "# Would have empty pd.DataFrame for top-level missed data\n", diff --git a/src/nested_pandas/series/accessor.py b/src/nested_pandas/series/accessor.py index 85ca6dc..4cd67fd 100644 --- a/src/nested_pandas/series/accessor.py +++ b/src/nested_pandas/series/accessor.py @@ -2,7 +2,7 @@ from __future__ import annotations from collections import defaultdict -from collections.abc import Generator, MutableMapping +from collections.abc import Generator, Mapping from typing import cast import numpy as np @@ -18,7 +18,7 @@ @register_series_accessor("nest") -class NestSeriesAccessor(MutableMapping): 
+class NestSeriesAccessor(Mapping): """Accessor for operations on Series of NestedDtype This accessor implements `MutableMapping` interface over the fields of the @@ -124,8 +124,10 @@ def fields(self) -> list[str]: """Names of the nested columns""" return self._series.array.field_names - def set_flat_field(self, field: str, value: ArrayLike) -> None: - """Set the field from flat-array of values, in-place + def with_field(self, field: str, value: ArrayLike) -> pd.Series: + """Set the field from flat-array of values and return a new series + + It is an alias for `.nest.with_flat_field`. Parameters ---------- @@ -134,11 +136,36 @@ def set_flat_field(self, field: str, value: ArrayLike) -> None: value : ArrayLike Array of values to set. It must be a scalar or have the same length as the flat arrays, e.g. `self.flat_length`. + + Returns + ------- + pd.Series + The new series with the field set. """ - self._series.array.set_flat_field(field, value) + return self.with_flat_field(field, value) - def set_list_field(self, field: str, value: ArrayLike) -> None: - """Set the field from list-array, in-place + def with_flat_field(self, field: str, value: ArrayLike) -> pd.Series: + """Set the field from flat-array of values and return a new series + + Parameters + ---------- + field : str + Name of the field to set. If not present, it will be added. + value : ArrayLike + Array of values to set. It must be a scalar or have the same length + as the flat arrays, e.g. `self.flat_length`. + + Returns + ------- + pd.Series + The new series with the field set. 
+ """ + new_array = self._series.array.copy() + new_array.set_flat_field(field, value) + return pd.Series(new_array, copy=False, index=self._series.index, name=self._series.name) + + def with_list_field(self, field: str, value: ArrayLike) -> pd.Series: + """Set the field from list-array of values and return a new series Parameters ---------- @@ -147,27 +174,37 @@ def set_list_field(self, field: str, value: ArrayLike) -> None: value : ArrayLike Array of values to set. It must be a list-array of the same length as the series, e.g. length of the series. + + Returns + ------- + pd.Series + The new series with the field set. """ - self._series.array.set_list_field(field, value) + new_array = self._series.array.copy() + new_array.set_list_field(field, value) + return pd.Series(new_array, copy=False, index=self._series.index, name=self._series.name) - # I intentionally don't call it `drop` or `drop_field` because `pd.DataFrame.drop` is not inplace - # by default, and I wouldn't like to surprise the user. - def pop_field(self, field: str) -> pd.Series: - """Delete the field from the struct and return it. + def without_field(self, field: str | list[str]) -> pd.Series: + """Remove the field(s) from the series and return a new series + + Note, that at least one field must be left in the series. Parameters ---------- - field : str - Name of the field to delete. + field : str or list[str] + Name of the field(s) to remove. Returns ------- pd.Series - The deleted field. + The new series without the field(s). 
""" - series = self[field] - self._series.array.pop_field(field) - return series + if isinstance(field, str): + field = [field] + + new_array = self._series.array.copy() + new_array.pop_fields(field) + return pd.Series(new_array, copy=False, index=self._series.index, name=self._series.name) def query_flat(self, query: str) -> pd.Series: """Query the flat arrays with a boolean expression @@ -255,6 +292,12 @@ def __getitem__(self, key: str | list[str]) -> pd.Series: return self.get_flat_series(key) def __setitem__(self, key: str, value: ArrayLike) -> None: + """Replace the field values from flat-array of values + + Currently, only replacement of the whole field is supported, the length + and dtype of the input value must match the field. + https://github.com/lincc-frameworks/nested-pandas/issues/87 + """ # TODO: we can be much-much smarter about the performance here # TODO: think better about underlying pa.ChunkArray in both self._series.array and value @@ -268,7 +311,7 @@ def __setitem__(self, key: str, value: ArrayLike) -> None: # Set single value for all rows if ndim == 0: - self.set_flat_field(key, value) + self._series.array.set_flat_field(key, value, keep_dtype=True) return if isinstance(value, pd.Series) and not self.get_flat_index().equals(value.index): @@ -284,13 +327,22 @@ def __setitem__(self, key: str, value: ArrayLike) -> None: f"{len(self._series)}." 
) - self.set_flat_field(key, pa_array) - - def __delitem__(self, key: str) -> None: - self.pop_field(key) + self._series.array.set_flat_field(key, pa_array, keep_dtype=True) def __iter__(self) -> Generator[str, None, None]: - yield from iter(self._series.array.field_names) + return iter(self._series.array.field_names) def __len__(self) -> int: return len(self._series.array.field_names) + + def __eq__(self, other) -> bool: + if not isinstance(other, type(self)): + return False + return self._series.equals(other._series) + + def clear(self) -> None: + """MutableMapping-style method kept on this read-only Mapping, always fails with NotImplementedError + + The reason is that we cannot delete all nested fields from the nested series. + """ + raise NotImplementedError("Cannot delete fields from nested series") diff --git a/src/nested_pandas/series/ext_array.py b/src/nested_pandas/series/ext_array.py index e915cdd..cc968ae 100644 --- a/src/nested_pandas/series/ext_array.py +++ b/src/nested_pandas/series/ext_array.py @@ -35,7 +35,7 @@ # typing.Self and "|" union syntax don't exist in Python 3.9 from __future__ import annotations -from collections.abc import Iterator, Sequence +from collections.abc import Iterable, Iterator, Sequence from typing import Any, Callable, cast import numpy as np @@ -676,43 +676,94 @@ def view_fields(self, fields: str | list[str]) -> Self: # type: ignore[name-def return self.__class__(pa_array, validate=False) - def set_flat_field(self, field: str, value: ArrayLike) -> None: + def set_flat_field(self, field: str, value: ArrayLike, *, keep_dtype: bool = False) -> None: """Set the field from flat-array of values + Note that if this updates the dtype, it would not affect the dtype of + the pd.Series back-ended by this extension array. + Parameters ---------- field : str The name of the field. value : ArrayLike The 'flat' array of values to be set. + keep_dtype : bool, default False + Whether to keep the original dtype of the field.
If True, + no new field will be created, and the dtype of the existing + field will be kept. If False, the dtype of the field will be + inferred from the input value. """ # TODO: optimize for the case when the input is a pa.ChunkedArray + if keep_dtype: + if field not in self.field_names: + raise ValueError( + "If keep_dtype is True, the field must exist in the series. " + f"Got: {field}, available: {self.field_names}" + ) + # Get the current element type of list-array + pa_type = self._pa_array.chunk(0).field(field).type.value_type + else: + pa_type = None + if np.ndim(value) == 0: value = np.repeat(value, self.flat_length) - pa_array = pa.array(value) + try: + pa_array = pa.array(value, from_pandas=True, type=pa_type) + except (ValueError, TypeError) as e: + raise TypeError( + f"New values must be convertible to the existing element pyarrow type, {pa_type}. " + "If you want to replace field with values of a new type, use series.nest.with_flat_field() " + "or NestedExtensionArray.set_flat_field(..., keep_dtype=False) instead." + ) from e if len(pa_array) != self.flat_length: raise ValueError("The input must be a struct_scalar or have the same length as the flat arrays") list_array = pa.ListArray.from_arrays(values=pa_array, offsets=self.list_offsets) - return self.set_list_field(field, list_array) + return self.set_list_field(field, list_array, keep_dtype=keep_dtype) - def set_list_field(self, field: str, value: ArrayLike) -> None: + def set_list_field(self, field: str, value: ArrayLike, *, keep_dtype: bool = False) -> None: """Set the field from list-array + Note that if this updates the dtype, it would not affect the dtype of + the pd.Series back-ended by this extension array. + Parameters ---------- field : str The name of the field. value : ArrayLike The list-array of values to be set. + keep_dtype : bool, default False + Whether to keep the original dtype of the field.
If True, + no new field will be created, and the dtype of the existing + field will be kept. If False, the dtype of the field will be + inferred from the input value. """ # TODO: optimize for the case when the input is a pa.ChunkedArray - pa_array = pa.array(value) + if keep_dtype: + if field not in self.field_names: + raise ValueError( + "If keep_dtype is True, the field must exist in the series. " + f"Got: {field}, available: {self.field_names}" + ) + pa_type = self._pa_array.chunk(0).field(field).type + else: + pa_type = None + + try: + pa_array = pa.array(value, from_pandas=True, type=pa_type) + except (ValueError, TypeError) as e: + raise TypeError( + f"New values must be convertible to the existing list pyarrow type, {pa_type}. " + "If you want to replace field with values of a new type, use series.nest.with_list_field() " + "or NestedExtensionArray.set_list_field(..., keep_dtype=False) instead." + ) from e if not is_pa_type_a_list(pa_array.type): raise ValueError(f"Expected a list array, got {pa_array.type}") @@ -724,38 +775,42 @@ def set_list_field(self, field: str, value: ArrayLike) -> None: for sl, chunk in enumerate_chunks(self._pa_array): chunk = cast(pa.StructArray, chunk) - # Build a new struct array. We collect all existing fields and add the new one. + # Build a new struct array. We collect all existing fields and add/replace the new one.
struct_dict = {} for pa_field in chunk.type: struct_dict[pa_field.name] = chunk.field(pa_field.name) - struct_dict[field] = pa.array(pa_array[sl]) + struct_dict[field] = pa_array[sl] struct_array = pa.StructArray.from_arrays(struct_dict.values(), struct_dict.keys()) chunks.append(struct_array) - pa_array = pa.chunked_array(chunks) + chunked_array = pa.chunked_array(chunks) + + self._replace_pa_array(chunked_array, validate=True) - self._replace_pa_array(pa_array, validate=True) + def pop_fields(self, fields: Iterable[str]): + """Delete fields from the struct array - def pop_field(self, field: str): - """Delete a field from the struct array + Note that at least one field must be left in the struct array. Parameters ---------- - field : str - The name of the field to be deleted. + fields : iterable of str + The names of the fields to delete. """ - if field not in self.field_names: - raise ValueError(f"Field '{field}' not found") + fields = frozenset(fields) + + if not fields.issubset(self.field_names): + raise ValueError(f"Some fields are not found, given: {fields}, available: {self.field_names}") - if len(self.field_names) == 1: - raise ValueError("Cannot delete the last field") + if len(self.field_names) - len(fields) == 0: + raise ValueError("Cannot delete all fields") chunks = [] for chunk in self._pa_array.iterchunks(): chunk = cast(pa.StructArray, chunk) struct_dict = {} for pa_field in chunk.type: - if pa_field.name != field: + if pa_field.name not in fields: struct_dict[pa_field.name] = chunk.field(pa_field.name) struct_array = pa.StructArray.from_arrays(struct_dict.values(), struct_dict.keys()) chunks.append(struct_array) diff --git a/tests/nested_pandas/series/test_accessor.py b/tests/nested_pandas/series/test_accessor.py index fcb6ee5..a9c5549 100644 --- a/tests/nested_pandas/series/test_accessor.py +++ b/tests/nested_pandas/series/test_accessor.py @@ -4,7 +4,7 @@ import pytest from nested_pandas import NestedDtype from nested_pandas.series.ext_array 
import NestedExtensionArray -from nested_pandas.series.packer import pack_flat +from nested_pandas.series.packer import pack_flat, pack_seq from numpy.testing import assert_array_equal from pandas.testing import assert_frame_equal, assert_index_equal, assert_series_equal @@ -299,7 +299,7 @@ def test_flat_length(): assert series.nest.flat_length == 6 -def test_set_flat_field(): +def test_with_flat_field(): """Test that the .nest.set_flat_field() method works.""" struct_array = pa.StructArray.from_arrays( arrays=[ @@ -310,10 +310,10 @@ def test_set_flat_field(): ) series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[0, 1]) - series.nest.set_flat_field("a", np.array(["a", "b", "c", "d", "e", "f"])) + new_series = series.nest.with_flat_field("a", np.array(["a", "b", "c", "d", "e", "f"])) assert_series_equal( - series.nest["a"], + new_series.nest["a"], pd.Series( data=["a", "b", "c", "d", "e", "f"], index=[0, 0, 0, 1, 1, 1], @@ -323,7 +323,23 @@ def test_set_flat_field(): ) -def test_set_list_field(): +def test_with_field(): + """Test that .nest.with_field is just an alias to .nest.with_flat_field.""" + struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])]), + pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])]), + ], + names=["a", "b"], + ) + series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[0, 1]) + assert_series_equal( + series.nest.with_field("a", np.array(["a", "b", "c", "d", "e", "f"])), + series.nest.with_flat_field("a", np.array(["a", "b", "c", "d", "e", "f"])), + ) + + +def test_with_list_field(): """Test that the .nest.set_list_field() method works.""" struct_array = pa.StructArray.from_arrays( arrays=[ @@ -334,10 +350,10 @@ def test_set_list_field(): ) series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[0, 1]) - series.nest.set_list_field("c", [["a", "b", "c"], ["d", "e", "f"]]) + new_series = 
series.nest.with_list_field("c", [["a", "b", "c"], ["d", "e", "f"]]) assert_series_equal( - series.nest["c"], + new_series.nest["c"], pd.Series( data=["a", "b", "c", "d", "e", "f"], index=[0, 0, 0, 1, 1, 1], @@ -347,29 +363,85 @@ def test_set_list_field(): ) -def test_pop_field(): - """Test that the .nest.pop_field() method works.""" +def test_without_field_single_field(): + """Test .nest.without_field("field")""" struct_array = pa.StructArray.from_arrays( arrays=[ - pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])]), - pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])]), + pa.array([np.array([1, 2, 3]), np.array([4, 5, 6])]), + pa.array([np.array([6, 4, 2]), np.array([1, 2, 3])]), ], names=["a", "b"], ) - series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[0, 1]) + series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[5, 7]) - a = series.nest.pop_field("a") + new_series = series.nest.without_field("a") - assert_array_equal(series.nest.fields, ["b"]) - assert_series_equal( - a, - pd.Series( - [1.0, 2.0, 3.0, 1.0, 2.0, 1.0], - dtype=pd.ArrowDtype(pa.float64()), - index=[0, 0, 0, 1, 1, 1], - name="a", - ), + desired_struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([6, 4, 2]), np.array([1, 2, 3])]), + ], + names=["b"], + ) + desired = pd.Series(desired_struct_array, dtype=NestedDtype(desired_struct_array.type), index=[5, 7]) + + assert_series_equal(new_series, desired) + + +def test_without_field_multiple_fields(): + """Test .nest.without_field(["field1", "field2"])""" + struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1, 2, 3]), np.array([4, 5, 6])]), + pa.array([np.array([6, 4, 2]), np.array([1, 2, 3])]), + pa.array([["a", "b", "c"], ["d", "e", "f"]]), + ], + names=["a", "b", "c"], + ) + series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[5, 7]) + + new_series = series.nest.without_field(["a", "b"]) + + 
desired_struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([["a", "b", "c"], ["d", "e", "f"]]), + ], + names=["c"], + ) + desired = pd.Series(desired_struct_array, dtype=NestedDtype(desired_struct_array.type), index=[5, 7]) + + assert_series_equal(new_series, desired) + + +def test_without_field_raises_for_missing_field(): + """Test .nest.without_field("field") raises for missing field.""" + struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1, 2, 3]), np.array([4, 5, 6])]), + pa.array([np.array([6, 4, 2]), np.array([1, 2, 3])]), + pa.array([["a", "b", "c"], ["d", "e", "f"]]), + ], + names=["a", "b", "c"], + ) + series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[5, 7]) + + with pytest.raises(ValueError): + _ = series.nest.without_field("d") + + +def test_without_field_raises_for_missing_fields(): + """Test .nest.without_field(["field1", "field2"]) raises for missing fields.""" + struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1, 2, 3]), np.array([4, 5, 6])]), + pa.array([np.array([6, 4, 2]), np.array([1, 2, 3])]), + pa.array([["a", "b", "c"], ["d", "e", "f"]]), + ], + names=["a", "b", "c"], ) + series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[5, 7]) + + with pytest.raises(ValueError): + _ = series.nest.without_field(["a", "d"]) def test_query_flat_1(): @@ -459,6 +531,20 @@ def test_get_list_series(): ) +def test_get(): + """Test .nest.get() which is implemented by the base class""" + series = pack_seq( + [ + pd.DataFrame({"a": [1, 2, 3], "b": [1.0, 5.0, 6.0]}), + pd.DataFrame({"a": [1, 2], "b": [None, 0.0]}), + None, + ] + ) + assert_series_equal(series.nest.get("a"), series.nest.to_flat()["a"]) + assert_series_equal(series.nest.get("b"), series.nest.to_flat()["b"]) + assert series.nest.get("c", "default_value") == "default_value" + + def test___getitem___single_field(): """Test that the .nest["field"] works for a single field.""" struct_array 
= pa.StructArray.from_arrays( @@ -531,15 +617,15 @@ def test___setitem__(): ) series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[0, 1]) - series.nest["a"] = np.array(["a", "b", "c", "d", "e", "f"]) + series.nest["a"] = np.arange(6, 0, -1) assert_series_equal( series.nest["a"], pd.Series( - data=["a", "b", "c", "d", "e", "f"], + data=[6, 5, 4, 3, 2, 1], index=[0, 0, 0, 1, 1, 1], name="a", - dtype=pd.ArrowDtype(pa.string()), + dtype=pd.ArrowDtype(pa.float64()), ), ) @@ -556,23 +642,23 @@ def test___setitem___with_series_with_index(): series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[0, 1]) flat_series = pd.Series( - data=["a", "b", "c", "d", "e", "f"], + data=np.arange(6, 0, -1), index=[0, 0, 0, 1, 1, 1], name="a", - dtype=pd.ArrowDtype(pa.string()), + dtype=pd.ArrowDtype(pa.float32()), ) series.nest["a"] = flat_series assert_series_equal( series.nest["a"], - flat_series, + flat_series.astype(pd.ArrowDtype(pa.float64())), ) assert_series_equal( series.nest.get_list_series("a"), pd.Series( - data=[np.array(["a", "b", "c"]), np.array(["d", "e", "f"])], - dtype=pd.ArrowDtype(pa.list_(pa.string())), + data=[np.array([6, 5, 4]), np.array([3, 2, 1])], + dtype=pd.ArrowDtype(pa.list_(pa.float64())), index=[0, 1], name="a", ), @@ -580,7 +666,7 @@ def test___setitem___with_series_with_index(): def test___setitem___empty_series(): - """Test that the series.nest["field"] = [] for empty series.""" + """Test that series.nest["field"] = [] does nothing for empty series.""" empty_series = pd.Series([], dtype=NestedDtype.from_fields({"a": pa.float64()})) empty_series.nest["a"] = [] assert len(empty_series) == 0 @@ -597,11 +683,12 @@ def test___setitem___with_single_value(): ) series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[0]) - series.nest["a"] = 1.0 + series.nest["a"] = -1.0 + assert_series_equal( series.nest["a"], pd.Series( - data=[1.0, 1.0, 1.0], + data=[-1.0, -1.0, -1.0], index=[0, 0, 0], 
name="a", dtype=pd.ArrowDtype(pa.float64()), @@ -609,6 +696,21 @@ def test___setitem___with_single_value(): ) +def test___setitem___raises_for_wrong_dtype(): + """Test that the .nest["field"] = ... raises for a wrong dtype.""" + struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])]), + pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])]), + ], + names=["a", "b"], + ) + series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[0, 1]) + + with pytest.raises(TypeError): + series.nest["a"] = np.array(["a", "b", "c", "d", "e", "f"]) + + def test___setitem___raises_for_wrong_length(): """Test that the .nest["field"] = ... raises for a wrong length.""" struct_array = pa.StructArray.from_arrays( @@ -646,8 +748,15 @@ def test___setitem___raises_for_wrong_index(): series.nest["a"] = flat_series -def test___delitem__(): - """Test that the `del .nest["field"]` works.""" +def test___setitem___raises_for_new_field(): + """Test that series.nest["field"] = ... raises for a new field.""" + series = pack_seq([{"a": [1, 2, 3]}, {"a": [4, None]}]) + with pytest.raises(ValueError): + series.nest["b"] = series.nest["a"] - 1 + + +def test___delitem___raises(): + """Test that the `del .nest["field"]` is not implemented.""" struct_array = pa.StructArray.from_arrays( arrays=[ pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0])]), @@ -657,9 +766,8 @@ def test___delitem__(): ) series = pd.Series(struct_array, dtype=NestedDtype(struct_array.type), index=[0, 1]) - del series.nest["a"] - - assert_array_equal(series.nest.fields, ["b"]) + with pytest.raises(AttributeError): + del series.nest["a"] def test___iter__(): @@ -714,3 +822,97 @@ def test_to_flat_dropna(): ), check_dtype=False, # filtered's Series are pd.ArrowDtype ) + + +def test___contains__(): + """Test that the `"field" in .nest` works. 
+ + We haven't implemented it, but base class does + """ + series = pack_seq([pd.DataFrame({"a": [1, 2, 3]})]) + assert "a" in series.nest + assert "x" not in series.nest + + +def test___eq__(): + """Test that one.nest == other.nest works.""" + + series1 = pack_seq([pd.DataFrame({"a": [1, 2, 3]})]) + series2 = pack_seq([pd.DataFrame({"b": [1, 2, 3]})]) + series3 = pack_seq([pd.DataFrame({"a": [1, 2, 3, 4]})]) + series4 = pack_seq([pd.DataFrame({"a": [1, 2, 3], "b": [3, 2, 1]})]) + + assert series1.nest == series1.nest + + assert series2.nest == series2.nest + assert series1.nest != series2.nest + + assert series3.nest == series3.nest + assert series1.nest != series3.nest + + assert series4.nest == series4.nest + assert series1.nest != series4.nest + + +def test___eq___false_for_different_types(): + """Test that one.nest == other.nest is False for different types.""" + seq = [{"a": [1, 2, 3]}, {"a": [4, None]}] + series = pack_seq(seq) + assert series.nest != pd.Series(seq, dtype=pd.ArrowDtype(pa.struct([("a", pa.list_(pa.int64()))]))) + + +def test_clear_raises(): + """Test that .nest.clear() raises - we cannot handle nested series with no fields""" + series = pack_seq([pd.DataFrame({"a": [1, 2, 3], "b": [3, 2, 1]}), None]) + with pytest.raises(NotImplementedError): + series.nest.clear() + + +def test_popitem_raises(): + """Test .nest.popitem() raises""" + series = pack_seq( + [pd.DataFrame({"a": [1, 2, 3], "b": [3, 2, 1]}), pd.DataFrame({"a": [1, 2], "b": [2.0, None]}), None] + ) + + with pytest.raises(AttributeError): + _ = series.nest.popitem() + + +def test_setdefault_raises(): + """Test .nest.setdefault() is not implemented""" + series = pack_seq([{"a": [1, 2, 3]}, {"a": [4, None]}]) + with pytest.raises(AttributeError): + series.nest.setdefault("b", series.nest["a"] * 2.0) + + +def test_update_raises(): + """test series.nest.update(other.nest) is not implemented""" + series1 = pack_seq([{"a": [1, 2, 3], "b": [4, 5, 6]}, {"a": [4, None], "b": [7, 8]}]) + 
series2 = pack_seq( + [ + {"b": ["x", "y", "z"], "c": [-2.0, -3.0, -4.0]}, + {"b": ["!", "?"], "c": [-5.0, -6.0]}, + ] + ) + with pytest.raises(AttributeError): + series1.nest.update(series2.nest) + + +def test_items(): + """Test series.nest.items() implemented by the base class""" + series = pack_seq([{"a": [1, 2, 3], "b": [3, 2, 1]}, {"a": [4, None], "b": [7, 8]}]) + for key, value in series.nest.items(): + assert_series_equal(value, series.nest[key]) + + +def test_keys(): + """Test series.nest.keys() implemented by the base class""" + series = pack_seq([{"a": [1, 2, 3], "b": [3, 2, 1]}, {"a": [4, None], "b": [7, 8]}]) + assert_array_equal(list(series.nest.keys()), ["a", "b"]) + + +def test_values(): + """Test series.nest.values() implemented by the base class""" + series = pack_seq([{"a": [1, 2, 3], "b": [3, 2, 1]}, {"a": [4, None], "b": [7, 8]}]) + for value in series.nest.values(): + assert_series_equal(value, series.nest[value.name]) diff --git a/tests/nested_pandas/series/test_ext_array.py b/tests/nested_pandas/series/test_ext_array.py index 1ea0193..555362d 100644 --- a/tests/nested_pandas/series/test_ext_array.py +++ b/tests/nested_pandas/series/test_ext_array.py @@ -1370,6 +1370,42 @@ def test_set_flat_field_replace_field_array(): assert_series_equal(pd.Series(ext_array), pd.Series(desired)) +def test_set_flat_field_keep_dtype_raises_for_wrong_dtype(): + """Tests that set_flat_field(keep_dtype=True) raises for a wrong input dtype.""" + struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 3.0, 4.0])]), + pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0, 6.0])]), + ], + names=["a", "b"], + ) + ext_array = NestedExtensionArray(struct_array) + + with pytest.raises(TypeError): + ext_array.set_flat_field("b", ["x", "y", "z", "w", "v", "u", "t"], keep_dtype=True) + + # Do not raise when keep_dtype=False + ext_array.set_flat_field("b", ["x", "y", "z", "w", "v", "u", "t"], keep_dtype=False) 
+ + +def test_set_flat_field_keep_dtype_raises_for_new_field(): + """Tests that set_flat_field(keep_dtype=True) raises for a new field.""" + struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 3.0, 4.0])]), + pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0, 6.0])]), + ], + names=["a", "b"], + ) + ext_array = NestedExtensionArray(struct_array) + + with pytest.raises(ValueError): + ext_array.set_flat_field("c", [True, False, True, False, True, False, True], keep_dtype=True) + + # Do not raise when keep_dtype=False + ext_array.set_flat_field("c", [True, False, True, False, True, False, True], keep_dtype=False) + + def test_set_list_field_new_field(): """Tests setting a new field with a new "list" array""" struct_array = pa.StructArray.from_arrays( @@ -1453,7 +1489,43 @@ def test_set_list_field_raises_for_wrong_length(): ext_array.set_list_field("b", longer_array) -def test_pop_field(): +def test_set_list_field_keep_dtype_raises_for_wrong_dtype(): + """Tests that set_list_field(keep_dtype=True) raises for a wrong input dtype.""" + struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 3.0])]), + pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0])]), + ], + names=["a", "b"], + ) + ext_array = NestedExtensionArray(struct_array) + + with pytest.raises(TypeError): + ext_array.set_list_field("b", [["x", "y", "z"]] * 2, keep_dtype=True) + + # Do not raise when keep_dtype=False + ext_array.set_list_field("b", [["x", "y", "z"]] * 2, keep_dtype=False) + + +def test_set_list_field_keep_dtype_raises_for_new_field(): + """Tests that set_list_field(keep_dtype=True) raises for a new field.""" + struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1.0, 2.0, 3.0])]), + pa.array([-np.array([4.0, 5.0, 6.0])]), + ], + names=["a", "b"], + ) + ext_array = NestedExtensionArray(struct_array) + + with 
pytest.raises(ValueError): + ext_array.set_list_field("c", [["x", "y", "z"]], keep_dtype=True) + + # Do not raise when keep_dtype=False + ext_array.set_list_field("c", [["x", "y", "z"]], keep_dtype=False) + + +def test_pop_fields(): """Tests that we can pop a field from the extension array.""" struct_array = pa.StructArray.from_arrays( arrays=[ @@ -1465,7 +1537,7 @@ def test_pop_field(): ) ext_array = NestedExtensionArray(struct_array) - ext_array.pop_field("c") + ext_array.pop_fields(["c"]) desired_struct_array = pa.StructArray.from_arrays( arrays=[ @@ -1479,7 +1551,30 @@ def test_pop_field(): assert_series_equal(pd.Series(ext_array), pd.Series(desired)) -def test_pop_field_raises_for_invalid_field(): +def test_pop_fields_multiple_fields(): + """Tests that we can pop multiple fields from the extension array.""" + struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 1.0, 2.0])]), + pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0, 6.0])]), + pa.array([np.array(["x", "y", "z"]), np.array(["x1", "x2", "x3", "x4"])]), + ], + names=["a", "b", "c"], + ) + ext_array = NestedExtensionArray(struct_array) + + ext_array.pop_fields(["a", "c"]) + + desired_struct_array = pa.StructArray.from_arrays( + arrays=[pa.array([-np.array([4.0, 5.0, 6.0]), -np.array([3.0, 4.0, 5.0, 6.0])])], + names=["b"], + ) + desired = NestedExtensionArray(desired_struct_array) + + assert_series_equal(pd.Series(ext_array), pd.Series(desired)) + + +def test_pop_fields_raises_for_invalid_field(): """Tests that we raise an error when trying to pop a field that does not exist.""" struct_array = pa.StructArray.from_arrays( arrays=[ @@ -1491,7 +1586,22 @@ def test_pop_field_raises_for_invalid_field(): ext_array = NestedExtensionArray(struct_array) with pytest.raises(ValueError): - ext_array.pop_field("c") + ext_array.pop_fields(["c"]) + + +def test_pop_fields_raises_for_some_invalid_fields(): + """Tests that we raise an error when 
trying to pop some fields that do not exist.""" + struct_array = pa.StructArray.from_arrays( + arrays=[ + pa.array([np.array([1.0, 2.0, 3.0])]), + pa.array([-np.array([4.0, 5.0, 6.0])]), + ], + names=["a", "b"], + ) + ext_array = NestedExtensionArray(struct_array) + + with pytest.raises(ValueError): + ext_array.pop_fields(["a", "c"]) def test_delete_last_field_raises(): @@ -1506,14 +1616,11 @@ def test_delete_last_field_raises(): ) ext_array = NestedExtensionArray(struct_array) - ext_array.pop_field("a") - assert ext_array.field_names == ["b", "c"] - - ext_array.pop_field("c") + ext_array.pop_fields(["c", "a"]) assert ext_array.field_names == ["b"] with pytest.raises(ValueError): - ext_array.pop_field("b") + ext_array.pop_fields(["b"]) def test_from_arrow_ext_array():