diff --git a/docs/api-reference/expr.md b/docs/api-reference/expr.md
index 7188b2c36..1ee8cae4f 100644
--- a/docs/api-reference/expr.md
+++ b/docs/api-reference/expr.md
@@ -36,6 +36,7 @@
         - over
         - pipe
         - quantile
+        - rank
         - round
         - sample
         - shift
diff --git a/docs/api-reference/series.md b/docs/api-reference/series.md
index e8572dda8..0dda8107c 100644
--- a/docs/api-reference/series.md
+++ b/docs/api-reference/series.md
@@ -43,6 +43,7 @@
         - null_count
         - pipe
         - quantile
+        - rank
         - rename
         - round
         - sample
diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py
index 35e936d72..52b1c6875 100644
--- a/narwhals/_arrow/expr.py
+++ b/narwhals/_arrow/expr.py
@@ -372,6 +372,16 @@ def func(df: ArrowDataFrame) -> list[ArrowSeries]:
     def mode(self: Self) -> Self:
         return reuse_series_implementation(self, "mode")
 
+    def rank(
+        self: Self,
+        method: Literal["average", "min", "max", "dense", "ordinal"],
+        *,
+        descending: bool,
+    ) -> Self:
+        return reuse_series_implementation(
+            self, "rank", method=method, descending=descending
+        )
+
     @property
     def dt(self: Self) -> ArrowExprDateTimeNamespace:
         return ArrowExprDateTimeNamespace(self)
diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py
index 70009df43..3610f1f41 100644
--- a/narwhals/_arrow/series.py
+++ b/narwhals/_arrow/series.py
@@ -721,6 +721,36 @@ def mode(self: Self) -> ArrowSeries:
             plx.col(col_token) == plx.col(col_token).max()
         )[self.name]
 
+    def rank(
+        self: Self,
+        method: Literal["average", "min", "max", "dense", "ordinal"],
+        *,
+        descending: bool,
+    ) -> Self:
+        import pyarrow as pa  # ignore-banned-import
+        import pyarrow.compute as pc  # ignore-banned-import
+
+        if method == "average":
+            # `pc.rank` only accepts the tiebreakers min/max/first/dense, so fail
+            # early with an actionable message instead of an opaque pyarrow error.
+            msg = (
+                "`rank` with `method='average'` is not supported for pyarrow backend. "
+                "The available methods are {'min', 'max', 'dense', 'ordinal'}."
+            )
+            raise ValueError(msg)
+
+        sort_keys = "descending" if descending else "ascending"
+        # pyarrow spells the "ordinal" tiebreaker "first".
+        tiebreaker = "first" if method == "ordinal" else method
+        native_series = self._native_series
+        null_mask = pc.is_null(native_series)
+
+        rank = pc.rank(native_series, sort_keys=sort_keys, tiebreaker=tiebreaker)
+
+        # Positions that were null in the input must stay null in the output.
+        result = pc.if_else(null_mask, pa.scalar(None), rank)
+        return self._from_native_series(result)
+
     def __iter__(self: Self) -> Iterator[Any]:
         yield from self._native_series.__iter__()
 
diff --git a/narwhals/_pandas_like/expr.py b/narwhals/_pandas_like/expr.py
index a58597eea..bc87a3833 100644
--- a/narwhals/_pandas_like/expr.py
+++ b/narwhals/_pandas_like/expr.py
@@ -387,6 +387,16 @@ def gather_every(self: Self, n: int, offset: int = 0) -> Self:
     def mode(self: Self) -> Self:
         return reuse_series_implementation(self, "mode")
 
+    def rank(
+        self: Self,
+        method: Literal["average", "min", "max", "dense", "ordinal"],
+        *,
+        descending: bool,
+    ) -> Self:
+        return reuse_series_implementation(
+            self, "rank", method=method, descending=descending
+        )
+
     @property
     def str(self: Self) -> PandasLikeExprStringNamespace:
         return PandasLikeExprStringNamespace(self)
diff --git a/narwhals/_pandas_like/series.py b/narwhals/_pandas_like/series.py
index 078e857b9..057237b73 100644
--- a/narwhals/_pandas_like/series.py
+++ b/narwhals/_pandas_like/series.py
@@ -682,6 +682,21 @@ def mode(self: Self) -> Self:
     def __iter__(self: Self) -> Iterator[Any]:
         yield from self._native_series.__iter__()
 
+    def rank(
+        self: Self,
+        method: Literal["average", "min", "max", "dense", "ordinal"],
+        *,
+        descending: bool,
+    ) -> Self:
+        # pandas spells the "ordinal" tiebreaker "first".
+        result = self._native_series.rank(
+            method="first" if method == "ordinal" else method,
+            na_option="keep",
+            ascending=not descending,
+            pct=False,
+        )
+        return self._from_native_series(result)
+
     @property
     def str(self) -> PandasLikeSeriesStringNamespace:
         return PandasLikeSeriesStringNamespace(self)
diff --git a/narwhals/expr.py b/narwhals/expr.py
index 2f986760c..07e2fc92a 100644
--- a/narwhals/expr.py
+++ b/narwhals/expr.py
@@ -2310,6 +2310,97 @@ def mode(self: Self) -> Self:
         """
         return self.__class__(lambda plx: self._call(plx).mode())
 
+    def rank(
+        self: Self,
+        method: Literal["average", "min", "max", "dense", "ordinal"] = "average",
+        *,
+        descending: bool = False,
+    ) -> Self:
+        """
+        Assign ranks to data, dealing with ties appropriately.
+
+        Arguments:
+            method: The method used to assign ranks to tied elements.
+                The following methods are available (default is 'average'):
+
+                - 'average' : The average of the ranks that would have been assigned to
+                  all the tied values is assigned to each value.
+                - 'min' : The minimum of the ranks that would have been assigned to all
+                  the tied values is assigned to each value. (This is also referred to
+                  as "competition" ranking.)
+                - 'max' : The maximum of the ranks that would have been assigned to all
+                  the tied values is assigned to each value.
+                - 'dense' : Like 'min', but the rank of the next highest element is
+                  assigned the rank immediately after those assigned to the tied
+                  elements.
+                - 'ordinal' : All values are given a distinct rank, corresponding to the
+                  order that the values occur in the Series.
+
+            descending: Rank in descending order.
+
+        Examples
+        --------
+        The 'average' method:
+
+        >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]})
+        >>> df.select(pl.col("a").rank())
+        shape: (5, 1)
+        ┌─────┐
+        │ a   │
+        │ --- │
+        │ f64 │
+        ╞═════╡
+        │ 3.0 │
+        │ 4.5 │
+        │ 1.5 │
+        │ 1.5 │
+        │ 4.5 │
+        └─────┘
+
+        The 'ordinal' method:
+
+        >>> df = pl.DataFrame({"a": [3, 6, 1, 1, 6]})
+        >>> df.select(pl.col("a").rank("ordinal"))
+        shape: (5, 1)
+        ┌─────┐
+        │ a   │
+        │ --- │
+        │ u32 │
+        ╞═════╡
+        │ 3   │
+        │ 4   │
+        │ 1   │
+        │ 2   │
+        │ 5   │
+        └─────┘
+
+        Use 'rank' with 'over' to rank within groups:
+
+        >>> df = pl.DataFrame({"a": [1, 1, 2, 2, 2], "b": [6, 7, 5, 14, 11]})
+        >>> df.with_columns(pl.col("b").rank().over("a").alias("rank"))
+        shape: (5, 3)
+        ┌─────┬─────┬──────┐
+        │ a   ┆ b   ┆ rank │
+        │ --- ┆ --- ┆ ---  │
+        │ i64 ┆ i64 ┆ f64  │
+        ╞═════╪═════╪══════╡
+        │ 1   ┆ 6   ┆ 1.0  │
+        │ 1   ┆ 7   ┆ 2.0  │
+        │ 2   ┆ 5   ┆ 1.0  │
+        │ 2   ┆ 14  ┆ 3.0  │
+        │ 2   ┆ 11  ┆ 2.0  │
+        └─────┴─────┴──────┘
+        """
+
+        supported_rank_methods = {"average", "min", "max", "dense", "ordinal"}
+        if method not in supported_rank_methods:
+            msg = f"Ranking method must be one of {supported_rank_methods}. Found '{method}'"
+            raise ValueError(msg)
+
+        return self.__class__(
+            lambda plx: self._call(plx).rank(method=method, descending=descending)
+        )
+
     @property
     def str(self: Self) -> ExprStringNamespace[Self]:
         return ExprStringNamespace(self)
diff --git a/narwhals/series.py b/narwhals/series.py
index dac5c6d79..868a9eb5c 100644
--- a/narwhals/series.py
+++ b/narwhals/series.py
@@ -2525,6 +2525,57 @@ def mode(self: Self) -> Self:
     def __iter__(self: Self) -> Iterator[Any]:
         yield from self._compliant_series.__iter__()
 
+    def rank(
+        self: Self,
+        method: Literal["average", "min", "max", "dense", "ordinal"] = "average",
+        *,
+        descending: bool = False,
+    ) -> Self:
+        """
+        Assign ranks to data, dealing with ties appropriately.
+
+        Arguments:
+            method: The method used to assign ranks to tied elements.
+                The following methods are available (default is 'average'):
+
+                - 'average' : The average of the ranks that would have been assigned to
+                  all the tied values is assigned to each value.
+                - 'min' : The minimum of the ranks that would have been assigned to all
+                  the tied values is assigned to each value. (This is also referred to
+                  as "competition" ranking.)
+                - 'max' : The maximum of the ranks that would have been assigned to all
+                  the tied values is assigned to each value.
+                - 'dense' : Like 'min', but the rank of the next highest element is
+                  assigned the rank immediately after those assigned to the tied
+                  elements.
+                - 'ordinal' : All values are given a distinct rank, corresponding to the
+                  order that the values occur in the Series.
+
+            descending: Rank in descending order.
+
+        Examples:
+
+        >>> s = pl.Series("a", [3, 6, 1, 1, 6])
+        >>> s.rank()
+        shape: (5,)
+        Series: 'a' [f64]
+        [
+           3.0
+           4.5
+           1.5
+           1.5
+           4.5
+        ]
+        """
+        supported_rank_methods = {"average", "min", "max", "dense", "ordinal"}
+        if method not in supported_rank_methods:
+            msg = f"Ranking method must be one of {supported_rank_methods}. Found '{method}'"
+            raise ValueError(msg)
+
+        return self._from_compliant_series(
+            self._compliant_series.rank(method=method, descending=descending)
+        )
+
     @property
     def str(self: Self) -> SeriesStringNamespace[Self]:
         return SeriesStringNamespace(self)