From f26b7653c44a39e7d9a3baedb44528b2e223072b Mon Sep 17 00:00:00 2001 From: Alessandro Miola <37796412+AlessandroMiola@users.noreply.github.com> Date: Sat, 28 Dec 2024 20:25:58 +0100 Subject: [PATCH] test: add test for `nw.Expr|Series.str.contains` with `literal=True` (#1670) --- narwhals/expr.py | 23 ++++++++-- narwhals/series.py | 22 +++++++-- tests/expr_and_series/str/contains_test.py | 52 +++++++++++++++------- 3 files changed, 73 insertions(+), 24 deletions(-) diff --git a/narwhals/expr.py b/narwhals/expr.py index 2ba2fd61b..777cae615 100644 --- a/narwhals/expr.py +++ b/narwhals/expr.py @@ -4178,15 +4178,17 @@ def contains(self: Self, pattern: str, *, literal: bool = False) -> ExprT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoFrameT >>> data = {"pets": ["cat", "dog", "rabbit and parrot", "dove", None]} >>> df_pd = pd.DataFrame(data) >>> df_pl = pl.DataFrame(data) + >>> df_pa = pa.table(data) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(df_native: IntoFrameT) -> IntoFrameT: + >>> def agnostic_contains(df_native: IntoFrameT) -> IntoFrameT: ... df = nw.from_native(df_native) ... return df.with_columns( ... default_match=nw.col("pets").str.contains("parrot|Dove"), @@ -4196,16 +4198,17 @@ def contains(self: Self, pattern: str, *, literal: bool = False) -> ExprT: ... ), ... 
).to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or PyArrow to `agnostic_contains`: - >>> my_library_agnostic_function(df_pd) + >>> agnostic_contains(df_pd) pets default_match case_insensitive_match literal_match 0 cat False False False 1 dog False False False 2 rabbit and parrot True True False 3 dove False True False 4 None None None None - >>> my_library_agnostic_function(df_pl) + + >>> agnostic_contains(df_pl) shape: (5, 4) ┌───────────────────┬───────────────┬────────────────────────┬───────────────┐ │ pets ┆ default_match ┆ case_insensitive_match ┆ literal_match │ @@ -4218,6 +4221,18 @@ def contains(self: Self, pattern: str, *, literal: bool = False) -> ExprT: │ dove ┆ false ┆ true ┆ false │ │ null ┆ null ┆ null ┆ null │ └───────────────────┴───────────────┴────────────────────────┴───────────────┘ + + >>> agnostic_contains(df_pa) + pyarrow.Table + pets: string + default_match: bool + case_insensitive_match: bool + literal_match: bool + ---- + pets: [["cat","dog","rabbit and parrot","dove",null]] + default_match: [[false,false,true,false,null]] + case_insensitive_match: [[false,false,true,true,null]] + literal_match: [[false,false,false,false,null]] """ return self._expr.__class__( lambda plx: self._expr._to_compliant_expr(plx).str.contains( diff --git a/narwhals/series.py b/narwhals/series.py index b5b6e1746..bbf94f38a 100644 --- a/narwhals/series.py +++ b/narwhals/series.py @@ -4137,21 +4137,23 @@ def contains(self: Self, pattern: str, *, literal: bool = False) -> SeriesT: Examples: >>> import pandas as pd >>> import polars as pl + >>> import pyarrow as pa >>> import narwhals as nw >>> from narwhals.typing import IntoSeriesT >>> pets = ["cat", "dog", "rabbit and parrot", "dove", None] >>> s_pd = pd.Series(pets) >>> s_pl = pl.Series(pets) + >>> s_pa = pa.chunked_array([pets]) We define a dataframe-agnostic function: - >>> def my_library_agnostic_function(s_native: 
IntoSeriesT) -> IntoSeriesT: + >>> def agnostic_contains(s_native: IntoSeriesT) -> IntoSeriesT: ... s = nw.from_native(s_native, series_only=True) ... return s.str.contains("parrot|dove").to_native() - We can then pass either pandas or Polars to `func`: + We can then pass any supported library such as pandas, Polars, or PyArrow to `agnostic_contains`: - >>> my_library_agnostic_function(s_pd) + >>> agnostic_contains(s_pd) 0 False 1 False 2 True @@ -4159,7 +4161,7 @@ def contains(self: Self, pattern: str, *, literal: bool = False) -> SeriesT: 4 None dtype: object - >>> my_library_agnostic_function(s_pl) # doctest: +NORMALIZE_WHITESPACE + >>> agnostic_contains(s_pl) # doctest: +NORMALIZE_WHITESPACE shape: (5,) Series: '' [bool] [ @@ -4169,6 +4171,18 @@ def contains(self: Self, pattern: str, *, literal: bool = False) -> SeriesT: true null ] + + >>> agnostic_contains(s_pa) # doctest: +ELLIPSIS + <pyarrow.lib.ChunkedArray object at ...> + [ + [ + false, + false, + true, + true, + null + ] + ] """ return self._narwhals_series._from_compliant_series( self._narwhals_series._compliant_series.str.contains(pattern, literal=literal) diff --git a/tests/expr_and_series/str/contains_test.py b/tests/expr_and_series/str/contains_test.py index 866f50ce1..06c6913aa 100644 --- a/tests/expr_and_series/str/contains_test.py +++ b/tests/expr_and_series/str/contains_test.py @@ -7,7 +7,7 @@ from tests.utils import ConstructorEager from tests.utils import assert_equal_data -data = {"pets": ["cat", "dog", "rabbit and parrot", "dove"]} +data = {"pets": ["cat", "dog", "rabbit and parrot", "dove", "Parrot|dove", None]} def test_contains_case_insensitive( @@ -17,12 +17,11 @@ def test_contains_case_insensitive( request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor(data)) - result = df.with_columns( - nw.col("pets").str.contains("(?i)parrot|Dove").alias("result") + result = df.select( + nw.col("pets").str.contains("(?i)parrot|Dove").alias("case_insensitive_match") ) expected = { - "pets": ["cat", "dog", "rabbit and 
parrot", "dove"], - "result": [False, False, True, True], + "case_insensitive_match": [False, False, True, True, True, None], } assert_equal_data(result, expected) @@ -34,31 +33,52 @@ def test_contains_series_case_insensitive( request.applymarker(pytest.mark.xfail) df = nw.from_native(constructor_eager(data), eager_only=True) - result = df.with_columns( - case_insensitive_match=df["pets"].str.contains("(?i)parrot|Dove") - ) + result = df.select(case_insensitive_match=df["pets"].str.contains("(?i)parrot|Dove")) expected = { - "pets": ["cat", "dog", "rabbit and parrot", "dove"], - "case_insensitive_match": [False, False, True, True], + "case_insensitive_match": [False, False, True, True, True, None], } assert_equal_data(result, expected) def test_contains_case_sensitive(constructor: Constructor) -> None: df = nw.from_native(constructor(data)) - result = df.with_columns(nw.col("pets").str.contains("parrot|Dove").alias("result")) + result = df.select(nw.col("pets").str.contains("parrot|Dove").alias("default_match")) expected = { - "pets": ["cat", "dog", "rabbit and parrot", "dove"], - "result": [False, False, True, False], + "default_match": [False, False, True, False, False, None], } assert_equal_data(result, expected) def test_contains_series_case_sensitive(constructor_eager: ConstructorEager) -> None: df = nw.from_native(constructor_eager(data), eager_only=True) - result = df.with_columns(case_sensitive_match=df["pets"].str.contains("parrot|Dove")) + result = df.select(default_match=df["pets"].str.contains("parrot|Dove")) + expected = { + "default_match": [False, False, True, False, False, None], + } + assert_equal_data(result, expected) + + +def test_contains_literal(constructor: Constructor) -> None: + df = nw.from_native(constructor(data)) + result = df.select( + nw.col("pets").str.contains("Parrot|dove").alias("default_match"), + nw.col("pets").str.contains("Parrot|dove", literal=True).alias("literal_match"), + ) + expected = { + "default_match": [False, False, 
False, True, True, None], + "literal_match": [False, False, False, False, True, None], + } + assert_equal_data(result, expected) + + +def test_contains_series_literal(constructor_eager: ConstructorEager) -> None: + df = nw.from_native(constructor_eager(data), eager_only=True) + result = df.select( + default_match=df["pets"].str.contains("Parrot|dove"), + literal_match=df["pets"].str.contains("Parrot|dove", literal=True), + ) expected = { - "pets": ["cat", "dog", "rabbit and parrot", "dove"], - "case_sensitive_match": [False, False, True, False], + "default_match": [False, False, False, True, True, None], + "literal_match": [False, False, False, False, True, None], } assert_equal_data(result, expected)