-
Notifications
You must be signed in to change notification settings - Fork 119
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
feat: add DataFrame
and LazyFrame
explode
method
#1542
Merged
Merged
Changes from all commits
Commits
Show all changes
17 commits
Select commit
Hold shift + click to select a range
3061fe9
feat: DataFrame and LazyFrame explode
FBruzzesi 2326b08
arrow refactor
FBruzzesi 32af22e
raise for invalid type and docstrings
FBruzzesi 3b52ab5
Update narwhals/dataframe.py
FBruzzesi c3bf009
old versions
FBruzzesi b427e79
merge main
FBruzzesi c77dc62
Merge branch 'main' into feat/explode-method
FBruzzesi 72314a2
almost all native
FBruzzesi 7f04579
doctest
FBruzzesi 7be326e
Merge branch 'main' into feat/explode-method
FBruzzesi 5da1ad6
Merge branch 'main' into feat/explode-method
FBruzzesi 4a098b8
Merge branch 'main' into feat/explode-method
FBruzzesi 380a6cb
Merge branch 'feat/explode-method' of https://github.com/narwhals-dev…
FBruzzesi c7a47c9
Merge branch 'main' into feat/explode-method
FBruzzesi 864e932
better error message, fail for arrow with nulls
FBruzzesi cc72f6b
doctest-modules
FBruzzesi 1156beb
completely remove pyarrow implementation
FBruzzesi File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,6 +12,7 @@ | |
- drop | ||
- drop_nulls | ||
- estimated_size | ||
- explode | ||
- filter | ||
- gather_every | ||
- get_column | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,6 +10,7 @@ | |
- columns | ||
- drop | ||
- drop_nulls | ||
- explode | ||
- filter | ||
- gather_every | ||
- group_by | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
from __future__ import annotations | ||
|
||
from typing import Sequence | ||
|
||
import pytest | ||
from polars.exceptions import InvalidOperationError as PlInvalidOperationError | ||
from polars.exceptions import ShapeError as PlShapeError | ||
|
||
import narwhals.stable.v1 as nw | ||
from narwhals.exceptions import InvalidOperationError | ||
from narwhals.exceptions import ShapeError | ||
from tests.utils import PANDAS_VERSION | ||
from tests.utils import POLARS_VERSION | ||
from tests.utils import Constructor | ||
from tests.utils import assert_equal_data | ||
|
||
# For context, polars allows exploding multiple columns only if the columns | ||
# have matching element counts; therefore l1 and l2 can be exploded together, but not l1 and l3. | ||
data = { | ||
"a": ["x", "y", "z", "w"], | ||
"l1": [[1, 2], None, [None], []], | ||
"l2": [[3, None], None, [42], []], | ||
"l3": [[1, 2], [3], [None], [1]], | ||
"l4": [[1, 2], [3], [123], [456]], | ||
} | ||
|
||
|
||
@pytest.mark.parametrize( | ||
("column", "expected_values"), | ||
[ | ||
("l2", [3, None, None, 42, None]), | ||
("l3", [1, 2, 3, None, 1]), # fast path for arrow | ||
], | ||
) | ||
def test_explode_single_col( | ||
request: pytest.FixtureRequest, | ||
constructor: Constructor, | ||
column: str, | ||
expected_values: list[int | None], | ||
) -> None: | ||
if any( | ||
backend in str(constructor) | ||
for backend in ("dask", "modin", "cudf", "pyarrow_table") | ||
): | ||
request.applymarker(pytest.mark.xfail) | ||
|
||
if "pandas" in str(constructor) and PANDAS_VERSION < (2, 2): | ||
request.applymarker(pytest.mark.xfail) | ||
|
||
result = ( | ||
nw.from_native(constructor(data)) | ||
.with_columns(nw.col(column).cast(nw.List(nw.Int32()))) | ||
.explode(column) | ||
.select("a", column) | ||
) | ||
expected = {"a": ["x", "x", "y", "z", "w"], column: expected_values} | ||
assert_equal_data(result, expected) | ||
|
||
|
||
@pytest.mark.parametrize( | ||
("columns", "more_columns", "expected"), | ||
[ | ||
( | ||
"l1", | ||
["l2"], | ||
{ | ||
"a": ["x", "x", "y", "z", "w"], | ||
"l1": [1, 2, None, None, None], | ||
"l2": [3, None, None, 42, None], | ||
}, | ||
), | ||
( | ||
"l3", | ||
["l4"], | ||
{ | ||
"a": ["x", "x", "y", "z", "w"], | ||
"l3": [1, 2, 3, None, 1], | ||
"l4": [1, 2, 3, 123, 456], | ||
}, | ||
), | ||
], | ||
) | ||
def test_explode_multiple_cols( | ||
request: pytest.FixtureRequest, | ||
constructor: Constructor, | ||
columns: str | Sequence[str], | ||
more_columns: Sequence[str], | ||
expected: dict[str, list[str | int | None]], | ||
) -> None: | ||
if any( | ||
backend in str(constructor) | ||
for backend in ("dask", "modin", "cudf", "pyarrow_table") | ||
): | ||
request.applymarker(pytest.mark.xfail) | ||
|
||
if "pandas" in str(constructor) and PANDAS_VERSION < (2, 2): | ||
request.applymarker(pytest.mark.xfail) | ||
|
||
result = ( | ||
nw.from_native(constructor(data)) | ||
.with_columns(nw.col(columns, *more_columns).cast(nw.List(nw.Int32()))) | ||
.explode(columns, *more_columns) | ||
.select("a", columns, *more_columns) | ||
) | ||
assert_equal_data(result, expected) | ||
|
||
|
||
def test_explode_shape_error( | ||
request: pytest.FixtureRequest, constructor: Constructor | ||
) -> None: | ||
if any( | ||
backend in str(constructor) | ||
for backend in ("dask", "modin", "cudf", "pyarrow_table") | ||
): | ||
request.applymarker(pytest.mark.xfail) | ||
|
||
if "pandas" in str(constructor) and PANDAS_VERSION < (2, 2): | ||
request.applymarker(pytest.mark.xfail) | ||
|
||
with pytest.raises( | ||
(ShapeError, PlShapeError), | ||
match="exploded columns must have matching element counts", | ||
): | ||
_ = ( | ||
nw.from_native(constructor(data)) | ||
.lazy() | ||
.with_columns(nw.col("l1", "l2", "l3").cast(nw.List(nw.Int32()))) | ||
.explode("l1", "l3") | ||
.collect() | ||
) | ||
|
||
|
||
def test_explode_invalid_operation_error( | ||
request: pytest.FixtureRequest, constructor: Constructor | ||
) -> None: | ||
if "dask" in str(constructor) or "pyarrow_table" in str(constructor): | ||
request.applymarker(pytest.mark.xfail) | ||
|
||
if "polars" in str(constructor) and POLARS_VERSION < (0, 20, 6): | ||
request.applymarker(pytest.mark.xfail) | ||
|
||
with pytest.raises( | ||
(InvalidOperationError, PlInvalidOperationError), | ||
match="`explode` operation not supported for dtype", | ||
): | ||
_ = nw.from_native(constructor(data)).lazy().explode("a").collect() |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If a single column is to be exploded, we use the pandas native method. If multiple columns are to be exploded, the strategy is to explode one column together with the rest of the dataframe, explode the other series individually, and finally concatenate them back together, then reorder the columns to match the original column order.