diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index 2750f8c09..fa5a69950 100644 --- a/narwhals/_arrow/dataframe.py +++ b/narwhals/_arrow/dataframe.py @@ -277,6 +277,7 @@ def join( how: Literal["left", "inner", "outer", "cross", "anti", "semi"] = "inner", left_on: str | list[str] | None, right_on: str | list[str] | None, + suffix: str, ) -> Self: how_to_join_map = { "anti": "left anti", @@ -298,7 +299,7 @@ def join( keys=key_token, right_keys=key_token, join_type="inner", - right_suffix="_right", + right_suffix=suffix, ) .drop([key_token]), ) @@ -309,7 +310,7 @@ def join( keys=left_on, right_keys=right_on, join_type=how_to_join_map[how], - right_suffix="_right", + right_suffix=suffix, ), ) diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py index 593e73eb3..31052fa52 100644 --- a/narwhals/_arrow/expr.py +++ b/narwhals/_arrow/expr.py @@ -309,7 +309,9 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]: ) raise ValueError(msg) tmp = df.group_by(*keys).agg(self) - tmp = df.select(*keys).join(tmp, how="left", left_on=keys, right_on=keys) + tmp = df.select(*keys).join( + tmp, how="left", left_on=keys, right_on=keys, suffix="_right" + ) return [tmp[name] for name in self._output_names] return self.__class__( diff --git a/narwhals/_dask/dataframe.py b/narwhals/_dask/dataframe.py index 1a40d7a6c..e2a034ae2 100644 --- a/narwhals/_dask/dataframe.py +++ b/narwhals/_dask/dataframe.py @@ -208,6 +208,7 @@ def join( how: Literal["left", "inner", "outer", "cross", "anti", "semi"] = "inner", left_on: str | list[str] | None, right_on: str | list[str] | None, + suffix: str, ) -> Self: if how == "cross": key_token = generate_unique_token( @@ -221,7 +222,7 @@ def join( how="inner", left_on=key_token, right_on=key_token, - suffixes=("", "_right"), + suffixes=("", suffix), ) .drop(columns=key_token), ) @@ -273,7 +274,7 @@ def join( how="left", left_on=left_on, right_on=right_on, - suffixes=("", "_right"), + suffixes=("", suffix), ) extra = [] for 
left_key, right_key in zip(left_on, right_on): # type: ignore[arg-type] @@ -289,7 +290,7 @@ def join( left_on=left_on, right_on=right_on, how=how, - suffixes=("", "_right"), + suffixes=("", suffix), ), ) diff --git a/narwhals/_dask/expr.py b/narwhals/_dask/expr.py index e3030a787..730824508 100644 --- a/narwhals/_dask/expr.py +++ b/narwhals/_dask/expr.py @@ -629,7 +629,7 @@ def func(df: DaskLazyFrame) -> list[Any]: tmp = df.group_by(*keys).agg(self) tmp_native = ( df.select(*keys) - .join(tmp, how="left", left_on=keys, right_on=keys) + .join(tmp, how="left", left_on=keys, right_on=keys, suffix="_right") ._native_frame ) return [tmp_native[name] for name in self._output_names] diff --git a/narwhals/_pandas_like/dataframe.py b/narwhals/_pandas_like/dataframe.py index 4ec42ef59..59cff49fc 100644 --- a/narwhals/_pandas_like/dataframe.py +++ b/narwhals/_pandas_like/dataframe.py @@ -403,6 +403,7 @@ def join( how: Literal["left", "inner", "outer", "cross", "anti", "semi"] = "inner", left_on: str | list[str] | None, right_on: str | list[str] | None, + suffix: str, ) -> Self: if isinstance(left_on, str): left_on = [left_on] @@ -427,7 +428,7 @@ def join( how="inner", left_on=key_token, right_on=key_token, - suffixes=("", "_right"), + suffixes=("", suffix), ) .drop(columns=key_token), ) @@ -436,7 +437,7 @@ def join( self._native_frame.merge( other._native_frame, how="cross", - suffixes=("", "_right"), + suffixes=("", suffix), ), ) @@ -488,14 +489,14 @@ def join( how="left", left_on=left_on, right_on=right_on, - suffixes=("", "_right"), + suffixes=("", suffix), ) extra = [] for left_key, right_key in zip(left_on, right_on): # type: ignore[arg-type] if right_key != left_key and right_key not in self.columns: extra.append(right_key) elif right_key != left_key: - extra.append(f"{right_key}_right") + extra.append(f"{right_key}{suffix}") return self._from_native_frame(result_native.drop(columns=extra)) return self._from_native_frame( @@ -504,7 +505,7 @@ def join( left_on=left_on, 
right_on=right_on, how=how, - suffixes=("", "_right"), + suffixes=("", suffix), ), ) diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py index 44154453d..8c3536c77 100644 --- a/narwhals/_pandas_like/expr.py +++ b/narwhals/_pandas_like/expr.py @@ -287,7 +287,9 @@ def func(df: PandasLikeDataFrame) -> list[PandasLikeSeries]: ) raise ValueError(msg) tmp = df.group_by(*keys).agg(self) - tmp = df.select(*keys).join(tmp, how="left", left_on=keys, right_on=keys) + tmp = df.select(*keys).join( + tmp, how="left", left_on=keys, right_on=keys, suffix="_right" + ) return [tmp[name] for name in self._output_names] return self.__class__( diff --git a/narwhals/dataframe.py b/narwhals/dataframe.py index ffd7ce36d..a266b73c7 100644 --- a/narwhals/dataframe.py +++ b/narwhals/dataframe.py @@ -182,11 +182,12 @@ def sort( def join( self, other: Self, - *, + on: str | list[str] | None = None, how: Literal["inner", "left", "cross", "semi", "anti"] = "inner", + *, left_on: str | list[str] | None = None, right_on: str | list[str] | None = None, - on: str | list[str] | None = None, + suffix: str = "_right", ) -> Self: _supported_joins = ("inner", "left", "cross", "anti", "semi") @@ -219,6 +220,7 @@ def join( how=how, left_on=left_on, right_on=right_on, + suffix=suffix, ) ) @@ -1850,30 +1852,29 @@ def sort( def join( self, other: Self, - *, + on: str | list[str] | None = None, how: Literal["inner", "left", "cross", "semi", "anti"] = "inner", + *, left_on: str | list[str] | None = None, right_on: str | list[str] | None = None, - on: str | list[str] | None = None, + suffix: str = "_right", ) -> Self: r""" Join in SQL-like fashion. Arguments: - other: DataFrame to join with. - + other: DataFrame to join with. + on: Name(s) of the join columns in both DataFrames. If set, `left_on` and + `right_on` should be None. how: Join strategy. * *inner*: Returns rows that have matching values in both tables. * *cross*: Returns the Cartesian product of rows from both tables. 
* *semi*: Filter rows that have a match in the right table. * *anti*: Filter rows that do not have a match in the right table. - - left_on: Name(s) of the left join column(s). - - right_on: Name(s) of the right join column(s). - - on: Join column of both DataFrames. If set, left_on and right_on should be None. + left_on: Join column of the left DataFrame. + right_on: Join column of the right DataFrame. + suffix: Suffix to append to columns with a duplicate name. Returns: A new joined DataFrame @@ -1922,7 +1923,9 @@ def join( │ 2 ┆ 7.0 ┆ b ┆ y │ └─────┴─────┴─────┴───────┘ """ - return super().join(other, how=how, left_on=left_on, right_on=right_on, on=on) + return super().join( + other, how=how, left_on=left_on, right_on=right_on, on=on, suffix=suffix + ) def join_asof( self, @@ -3578,30 +3581,29 @@ def sort( def join( self, other: Self, - *, + on: str | list[str] | None = None, how: Literal["inner", "left", "cross", "semi", "anti"] = "inner", + *, left_on: str | list[str] | None = None, right_on: str | list[str] | None = None, - on: str | list[str] | None = None, + suffix: str = "_right", ) -> Self: r""" Add a join operation to the Logical Plan. Arguments: other: Lazy DataFrame to join with. - + on: Name(s) of the join columns in both DataFrames. If set, `left_on` and + `right_on` should be None. how: Join strategy. * *inner*: Returns rows that have matching values in both tables. * *cross*: Returns the Cartesian product of rows from both tables. * *semi*: Filter rows that have a match in the right table. * *anti*: Filter rows that do not have a match in the right table. - left_on: Join column of the left DataFrame. - right_on: Join column of the right DataFrame. - - on: Join column of both DataFrames. If set, left_on and right_on should be None. + suffix: Suffix to append to columns with a duplicate name. 
Returns: A new joined LazyFrame @@ -3650,7 +3652,9 @@ def join( │ 2 ┆ 7.0 ┆ b ┆ y │ └─────┴─────┴─────┴───────┘ """ - return super().join(other, how=how, left_on=left_on, right_on=right_on, on=on) + return super().join( + other, how=how, left_on=left_on, right_on=right_on, on=on, suffix=suffix + ) def join_asof( self, diff --git a/tests/frame/join_test.py b/tests/frame/join_test.py index 6a1985f41..18e9aae64 100644 --- a/tests/frame/join_test.py +++ b/tests/frame/join_test.py @@ -89,6 +89,41 @@ def test_cross_join(constructor: Any) -> None: df.join(df, how="cross", left_on="antananarivo") # type: ignore[arg-type] +@pytest.mark.parametrize("how", ["inner", "left"]) +@pytest.mark.parametrize("suffix", ["_right", "_custom_suffix"]) +def test_suffix(constructor: Any, how: str, suffix: str) -> None: + data = { + "antananarivo": [1, 3, 2], + "bob": [4, 4, 6], + "zorro": [7.0, 8, 9], + } + df = nw.from_native(constructor(data)) + df_right = df + result = df.join( + df_right, # type: ignore[arg-type] + left_on=["antananarivo", "bob"], + right_on=["antananarivo", "bob"], + how=how, # type: ignore[arg-type] + suffix=suffix, + ) + result_cols = result.collect_schema().names() + assert result_cols == ["antananarivo", "bob", "zorro", f"zorro{suffix}"] + + +@pytest.mark.parametrize("suffix", ["_right", "_custom_suffix"]) +def test_cross_join_suffix(constructor: Any, suffix: str) -> None: + data = {"antananarivo": [1, 3, 2]} + df = nw.from_native(constructor(data)) + result = df.join(df, how="cross", suffix=suffix).sort( # type: ignore[arg-type] + "antananarivo", f"antananarivo{suffix}" + ) + expected = { + "antananarivo": [1, 1, 1, 2, 2, 2, 3, 3, 3], + f"antananarivo{suffix}": [1, 2, 3, 1, 2, 3, 1, 2, 3], + } + compare_dicts(result, expected) + + def test_cross_join_non_pandas() -> None: data = {"antananarivo": [1, 3, 2]} df = nw.from_native(pd.DataFrame(data))