Skip to content

Commit

Permalink
unstack: require unique MultiIndex (#8737)
Browse files Browse the repository at this point in the history
* unstack: require unique multiindex

* whats new

* fix ds creation

* fix the correct array

* update error message

* update err msg in tests

* Apply suggestions from code review
  • Loading branch information
mathause authored Feb 13, 2024
1 parent d644607 commit 013a426
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 0 deletions.
2 changes: 2 additions & 0 deletions doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ Bug fixes
lead to integer overflow or unsafe conversion from floating point to integer
values (:issue:`8542`, :pull:`8575`). By `Spencer Clark
<https://github.com/spencerkclark>`_.
- Raise an error when unstacking a MultiIndex that contains duplicate entries, as this
would lead to silent data loss (:issue:`7104`, :pull:`8737`). By `Mathias Hauser <https://github.com/mathause>`_.

Documentation
~~~~~~~~~~~~~
Expand Down
7 changes: 7 additions & 0 deletions xarray/core/indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1017,6 +1017,13 @@ def stack(
def unstack(self) -> tuple[dict[Hashable, Index], pd.MultiIndex]:
clean_index = remove_unused_levels_categories(self.index)

if not clean_index.is_unique:
raise ValueError(
"Cannot unstack MultiIndex containing duplicates. Make sure entries "
f"are unique, e.g., by calling ``.drop_duplicates('{self.dim}')``, "
"before unstacking."
)

new_indexes: dict[Hashable, Index] = {}
for name, lev in zip(clean_index.names, clean_index.levels):
idx = PandasIndex(
Expand Down
9 changes: 9 additions & 0 deletions xarray/tests/test_dataarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -2532,6 +2532,15 @@ def test_unstack_pandas_consistency(self) -> None:
actual = DataArray(s, dims="z").unstack("z")
assert_identical(expected, actual)

def test_unstack_requires_unique(self) -> None:
    """Check that unstacking a DataArray with a duplicated MultiIndex raises."""
    # Both rows share the index entry ("a", 0), so the resulting MultiIndex
    # is not unique.
    frame = pd.DataFrame({"foo": range(2), "x": ["a", "a"], "y": [0, 0]})
    series = frame.set_index(["x", "y"])["foo"]

    expected_msg = "Cannot unstack MultiIndex containing duplicates"
    with pytest.raises(ValueError, match=expected_msg):
        DataArray(series, dims="z").unstack("z")

@pytest.mark.filterwarnings("error")
def test_unstack_roundtrip_integer_array(self) -> None:
arr = xr.DataArray(
Expand Down
8 changes: 8 additions & 0 deletions xarray/tests/test_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3764,6 +3764,14 @@ def test_unstack_errors(self) -> None:
with pytest.raises(ValueError, match=r".*do not have exactly one multi-index"):
ds.unstack("x")

ds = Dataset({"da": [1, 2]}, coords={"y": ("x", [1, 1]), "z": ("x", [0, 0])})
ds = ds.set_index(x=("y", "z"))

with pytest.raises(
ValueError, match="Cannot unstack MultiIndex containing duplicates"
):
ds.unstack("x")

def test_unstack_fill_value(self) -> None:
ds = xr.Dataset(
{"var": (("x",), np.arange(6)), "other_var": (("x",), np.arange(3, 9))},
Expand Down
9 changes: 9 additions & 0 deletions xarray/tests/test_indexes.py
Original file line number Diff line number Diff line change
Expand Up @@ -452,6 +452,15 @@ def test_unstack(self) -> None:
assert new_indexes["two"].equals(PandasIndex([1, 2, 3], "two"))
assert new_pd_idx.equals(pd_midx)

def test_unstack_requires_unique(self) -> None:
    """Check that ``PandasMultiIndex.unstack`` rejects duplicated entries."""
    # Repeating "a" in the first level makes ``from_product`` emit every
    # (one, two) pair twice.
    duplicated = pd.MultiIndex.from_product(
        [["a", "a"], [1, 2]], names=["one", "two"]
    )
    multi_index = PandasMultiIndex(duplicated, "x")

    expected_msg = "Cannot unstack MultiIndex containing duplicates"
    with pytest.raises(ValueError, match=expected_msg):
        multi_index.unstack()

def test_create_variables(self) -> None:
foo_data = np.array([0, 0, 1], dtype="int64")
bar_data = np.array([1.1, 1.2, 1.3], dtype="float64")
Expand Down

0 comments on commit 013a426

Please sign in to comment.