diff --git a/narwhals/_arrow/expr.py b/narwhals/_arrow/expr.py
index 7ec20357d..7e7a0d200 100644
--- a/narwhals/_arrow/expr.py
+++ b/narwhals/_arrow/expr.py
@@ -218,6 +218,11 @@ def null_count(self) -> Self:
     def is_null(self) -> Self:
         return reuse_series_implementation(self, "is_null")
 
+    def is_between(self, lower_bound: Any, upper_bound: Any, closed: str) -> Self:
+        return reuse_series_implementation(
+            self, "is_between", lower_bound, upper_bound, closed
+        )
+
     def head(self, n: int) -> Self:
         return reuse_series_implementation(self, "head", n)
 
diff --git a/tests/expr_and_series/is_between_test.py b/tests/expr_and_series/is_between_test.py
index 6ab2027c2..7203bdf00 100644
--- a/tests/expr_and_series/is_between_test.py
+++ b/tests/expr_and_series/is_between_test.py
@@ -21,15 +21,24 @@
         ("none", [False, True, True, False]),
     ],
 )
-def test_is_between(
-    request: Any, constructor: Any, closed: str, expected: list[bool]
-) -> None:
-    if "pyarrow_table" in str(constructor):
-        request.applymarker(pytest.mark.xfail)
-
-    df = nw.from_native(constructor(data), eager_only=True)
+def test_is_between(constructor_lazy: Any, closed: str, expected: list[bool]) -> None:
+    df = nw.from_native(constructor_lazy(data))
     result = df.select(nw.col("a").is_between(1, 5, closed=closed))
     expected_dict = {"a": expected}
     compare_dicts(result, expected_dict)
+
+
+@pytest.mark.parametrize(
+    ("closed", "expected"),
+    [
+        ("left", [True, True, True, False]),
+        ("right", [False, True, True, True]),
+        ("both", [True, True, True, True]),
+        ("none", [False, True, True, False]),
+    ],
+)
+def test_is_between_series(constructor: Any, closed: str, expected: list[bool]) -> None:
+    df = nw.from_native(constructor(data), eager_only=True)
     result = df.with_columns(a=df["a"].is_between(1, 5, closed=closed))
+    expected_dict = {"a": expected}
     compare_dicts(result, expected_dict)
diff --git a/tests/expr_and_series/is_duplicated_test.py b/tests/expr_and_series/is_duplicated_test.py
index ff69e4bf7..9f59ab44e 100644
--- a/tests/expr_and_series/is_duplicated_test.py
+++ b/tests/expr_and_series/is_duplicated_test.py
@@ -12,11 +12,11 @@
 }
 
 
-def test_is_duplicated_expr(constructor: Any, request: Any) -> None:
-    if "modin" in str(constructor):
+def test_is_duplicated_expr(constructor_lazy: Any, request: Any) -> None:
+    if "modin" in str(constructor_lazy):
         # TODO(unassigned): why is Modin failing here?
         request.applymarker(pytest.mark.xfail)
-    df = nw.from_native(constructor(data), eager_only=True)
+    df = nw.from_native(constructor_lazy(data))
     result = df.select(nw.all().is_duplicated())
     expected = {
         "a": [True, True, False],
diff --git a/tests/expr_and_series/is_first_distinct_test.py b/tests/expr_and_series/is_first_distinct_test.py
index d792f0ac7..a4e63939d 100644
--- a/tests/expr_and_series/is_first_distinct_test.py
+++ b/tests/expr_and_series/is_first_distinct_test.py
@@ -12,11 +12,11 @@
 }
 
 
-def test_is_first_distinct_expr(constructor: Any, request: Any) -> None:
-    if "modin" in str(constructor):
+def test_is_first_distinct_expr(constructor_lazy: Any, request: Any) -> None:
+    if "modin" in str(constructor_lazy):
         # TODO(unassigned): why is Modin failing here?
         request.applymarker(pytest.mark.xfail)
-    df = nw.from_native(constructor(data), eager_only=True)
+    df = nw.from_native(constructor_lazy(data))
     result = df.select(nw.all().is_first_distinct())
     expected = {
         "a": [True, False, True, True, False],
diff --git a/tests/expr_and_series/is_in_test.py b/tests/expr_and_series/is_in_test.py
index 8b683b604..4aeac63c4 100644
--- a/tests/expr_and_series/is_in_test.py
+++ b/tests/expr_and_series/is_in_test.py
@@ -9,8 +9,8 @@
 }
 
 
-def test_expr_is_in(constructor: Any) -> None:
-    df = nw.from_native(constructor(data))
+def test_expr_is_in(constructor_lazy: Any) -> None:
+    df = nw.from_native(constructor_lazy(data))
     result = df.select(nw.col("a").is_in([4, 5]))
     expected = {"a": [False, True, False, True]}
 
diff --git a/tests/expr_and_series/is_last_distinct_test.py b/tests/expr_and_series/is_last_distinct_test.py
index 38f981d0c..6a55a7311 100644
--- a/tests/expr_and_series/is_last_distinct_test.py
+++ b/tests/expr_and_series/is_last_distinct_test.py
@@ -12,11 +12,11 @@
 }
 
 
-def test_is_last_distinct_expr(constructor: Any, request: Any) -> None:
-    if "modin" in str(constructor):
+def test_is_last_distinct_expr(constructor_lazy: Any, request: Any) -> None:
+    if "modin" in str(constructor_lazy):
         # TODO(unassigned): why is Modin failing here?
         request.applymarker(pytest.mark.xfail)
-    df = nw.from_native(constructor(data), eager_only=True)
+    df = nw.from_native(constructor_lazy(data))
     result = df.select(nw.all().is_last_distinct())
     expected = {
         "a": [False, True, False, True, True],
diff --git a/tests/expr_and_series/is_unique_test.py b/tests/expr_and_series/is_unique_test.py
index 9af0f3d9c..975e1ecb7 100644
--- a/tests/expr_and_series/is_unique_test.py
+++ b/tests/expr_and_series/is_unique_test.py
@@ -12,11 +12,11 @@
 }
 
 
-def test_is_unique_expr(constructor: Any, request: Any) -> None:
-    if "modin" in str(constructor):
+def test_is_unique_expr(constructor_lazy: Any, request: Any) -> None:
+    if "modin" in str(constructor_lazy):
         # TODO(unassigned): why is Modin failing here?
         request.applymarker(pytest.mark.xfail)
-    df = nw.from_native(constructor(data), eager_only=True)
+    df = nw.from_native(constructor_lazy(data))
     result = df.select(nw.all().is_unique())
     expected = {
         "a": [False, False, True],
diff --git a/tests/expr_and_series/len_test.py b/tests/expr_and_series/len_test.py
index 7ec1505cc..592281936 100644
--- a/tests/expr_and_series/len_test.py
+++ b/tests/expr_and_series/len_test.py
@@ -7,8 +7,8 @@
 expected = {"a1": [2], "a2": [1]}
 
 
-def test_len(constructor: Any) -> None:
-    df_raw = constructor(data)
+def test_len(constructor_lazy: Any) -> None:
+    df_raw = constructor_lazy(data)
     df = nw.from_native(df_raw).select(
         nw.col("a").filter(nw.col("b") == 1).len().alias("a1"),
         nw.col("a").filter(nw.col("b") == 2).len().alias("a2"),
diff --git a/tests/expr_and_series/max_test.py b/tests/expr_and_series/max_test.py
index 18e0bab17..83ac2fd2f 100644
--- a/tests/expr_and_series/max_test.py
+++ b/tests/expr_and_series/max_test.py
@@ -11,8 +11,8 @@
 
 
 @pytest.mark.parametrize("expr", [nw.col("a", "b", "z").max(), nw.max("a", "b", "z")])
-def test_expr_max_expr(constructor: Any, expr: nw.Expr) -> None:
-    df = nw.from_native(constructor(data), eager_only=True)
+def test_expr_max_expr(constructor_lazy: Any, expr: nw.Expr) -> None:
+    df = nw.from_native(constructor_lazy(data))
     result = df.select(expr)
     expected = {"a": [3], "b": [6], "z": [9.0]}
     compare_dicts(result, expected)
diff --git a/tests/expr_and_series/mean_test.py b/tests/expr_and_series/mean_test.py
index c104e314b..ad04a8e12 100644
--- a/tests/expr_and_series/mean_test.py
+++ b/tests/expr_and_series/mean_test.py
@@ -11,8 +11,8 @@
 
 
 @pytest.mark.parametrize("expr", [nw.col("a", "b", "z").mean(), nw.mean("a", "b", "z")])
-def test_expr_mean_expr(constructor: Any, expr: nw.Expr) -> None:
-    df = nw.from_native(constructor(data), eager_only=True)
+def test_expr_mean_expr(constructor_lazy: Any, expr: nw.Expr) -> None:
+    df = nw.from_native(constructor_lazy(data))
     result = df.select(expr)
     expected = {"a": [2.0], "b": [5.0], "z": [8.0]}
     compare_dicts(result, expected)
diff --git a/tests/expr_and_series/min_test.py b/tests/expr_and_series/min_test.py
index dad0ffea4..786e2d942 100644
--- a/tests/expr_and_series/min_test.py
+++ b/tests/expr_and_series/min_test.py
@@ -11,8 +11,8 @@
 
 
 @pytest.mark.parametrize("expr", [nw.col("a", "b", "z").min(), nw.min("a", "b", "z")])
-def test_expr_min_expr(constructor: Any, expr: nw.Expr) -> None:
-    df = nw.from_native(constructor(data), eager_only=True)
+def test_expr_min_expr(constructor_lazy: Any, expr: nw.Expr) -> None:
+    df = nw.from_native(constructor_lazy(data))
     result = df.select(expr)
     expected = {"a": [1], "b": [4], "z": [7.0]}
     compare_dicts(result, expected)
diff --git a/tests/expr_and_series/n_unique_test.py b/tests/expr_and_series/n_unique_test.py
index 24c2c8896..3a22a4374 100644
--- a/tests/expr_and_series/n_unique_test.py
+++ b/tests/expr_and_series/n_unique_test.py
@@ -9,13 +9,15 @@
 }
 
 
-def test_n_unique(constructor: Any) -> None:
-    df = nw.from_native(constructor(data), eager_only=True)
+def test_n_unique(constructor_lazy: Any) -> None:
+    df = nw.from_native(constructor_lazy(data))
     result = df.select(nw.all().n_unique())
-    expected = {
-        "a": [3],
-        "b": [4],
-    }
+    expected = {"a": [3], "b": [4]}
     compare_dicts(result, expected)
+
+
+def test_n_unique_series(constructor: Any) -> None:
+    df = nw.from_native(constructor(data), eager_only=True)
+    expected = {"a": [3], "b": [4]}
     result_series = {"a": [df["a"].n_unique()], "b": [df["b"].n_unique()]}
     compare_dicts(result_series, expected)
diff --git a/tests/expr_and_series/null_count_test.py b/tests/expr_and_series/null_count_test.py
index 5461db866..d6092162c 100644
--- a/tests/expr_and_series/null_count_test.py
+++ b/tests/expr_and_series/null_count_test.py
@@ -9,8 +9,8 @@
 }
 
 
-def test_null_count(constructor: Any) -> None:
-    df = nw.from_native(constructor(data), eager_only=True)
+def test_null_count(constructor_lazy: Any) -> None:
+    df = nw.from_native(constructor_lazy(data))
     result = df.select(nw.all().null_count())
     expected = {
         "a": [2],
diff --git a/tests/expr_and_series/operators_test.py b/tests/expr_and_series/operators_test.py
index f29c8e9eb..67351d74c 100644
--- a/tests/expr_and_series/operators_test.py
+++ b/tests/expr_and_series/operators_test.py
@@ -20,10 +20,10 @@
     ],
 )
 def test_comparand_operators(
-    constructor: Any, operator: str, expected: list[bool]
+    constructor_lazy: Any, operator: str, expected: list[bool]
 ) -> None:
     data = {"a": [0, 1, 2]}
-    df = nw.from_native(constructor(data))
+    df = nw.from_native(constructor_lazy(data))
     result = df.select(getattr(nw.col("a"), operator)(1))
     compare_dicts(result, {"a": expected})
 
@@ -35,9 +35,11 @@ def test_comparand_operators(
         ("__or__", [True, True, True, False]),
     ],
 )
-def test_logic_operators(constructor: Any, operator: str, expected: list[bool]) -> None:
+def test_logic_operators(
+    constructor_lazy: Any, operator: str, expected: list[bool]
+) -> None:
     data = {"a": [True, True, False, False], "b": [True, False, True, False]}
-    df = nw.from_native(constructor(data))
+    df = nw.from_native(constructor_lazy(data))
 
     result = df.select(getattr(nw.col("a"), operator)(nw.col("b")))
     compare_dicts(result, {"a": expected})
diff --git a/tests/expr_and_series/over_test.py b/tests/expr_and_series/over_test.py
index b037d0386..d4d7a02fb 100644
--- a/tests/expr_and_series/over_test.py
+++ b/tests/expr_and_series/over_test.py
@@ -13,11 +13,11 @@
 }
 
 
-def test_over_single(request: Any, constructor: Any) -> None:
-    if "pyarrow_table" in str(constructor):
+def test_over_single(request: Any, constructor_lazy: Any) -> None:
+    if "pyarrow_table" in str(constructor_lazy):
         request.applymarker(pytest.mark.xfail)
 
-    df = nw.from_native(constructor(data))
+    df = nw.from_native(constructor_lazy(data))
     result = df.with_columns(c_max=nw.col("c").max().over("a"))
     expected = {
         "a": ["a", "a", "b", "b", "b"],
@@ -28,11 +28,11 @@ def test_over_single(request: Any, constructor: Any) -> None:
     compare_dicts(result, expected)
 
 
-def test_over_multiple(request: Any, constructor: Any) -> None:
-    if "pyarrow_table" in str(constructor):
+def test_over_multiple(request: Any, constructor_lazy: Any) -> None:
+    if "pyarrow_table" in str(constructor_lazy):
         request.applymarker(pytest.mark.xfail)
 
-    df = nw.from_native(constructor(data))
+    df = nw.from_native(constructor_lazy(data))
     result = df.with_columns(c_min=nw.col("c").min().over("a", "b"))
     expected = {
         "a": ["a", "a", "b", "b", "b"],
diff --git a/tests/expr_and_series/quantile_test.py b/tests/expr_and_series/quantile_test.py
index 23a433d1d..5a75973d9 100644
--- a/tests/expr_and_series/quantile_test.py
+++ b/tests/expr_and_series/quantile_test.py
@@ -21,13 +21,13 @@
 )
 @pytest.mark.filterwarnings("ignore:the `interpolation=` argument to percentile")
 def test_quantile_expr(
-    constructor: Any,
+    constructor_lazy: Any,
     interpolation: Literal["nearest", "higher", "lower", "midpoint", "linear"],
     expected: dict[str, list[float]],
 ) -> None:
     q = 0.3
     data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
-    df_raw = constructor(data)
+    df_raw = constructor_lazy(data)
     df = nw.from_native(df_raw)
     result = df.select(nw.all().quantile(quantile=q, interpolation=interpolation))
     compare_dicts(result, expected)
diff --git a/tests/expr_and_series/round_test.py b/tests/expr_and_series/round_test.py
index 4fd7b385d..37a9cc8d3 100644
--- a/tests/expr_and_series/round_test.py
+++ b/tests/expr_and_series/round_test.py
@@ -9,17 +9,27 @@
 
 
 @pytest.mark.parametrize("decimals", [0, 1, 2])
-def test_round(request: Any, constructor: Any, decimals: int) -> None:
-    if "pyarrow_table" in str(constructor):
+def test_round(request: Any, constructor_lazy: Any, decimals: int) -> None:
+    if "pyarrow_table" in str(constructor_lazy):
         request.applymarker(pytest.mark.xfail)
     data = {"a": [1.12345, 2.56789, 3.901234]}
-    df_raw = constructor(data)
-    df = nw.from_native(df_raw, eager_only=True)
+    df_raw = constructor_lazy(data)
+    df = nw.from_native(df_raw)
 
     expected_data = {k: [round(e, decimals) for e in v] for k, v in data.items()}
     result_frame = df.select(nw.col("a").round(decimals))
     compare_dicts(result_frame, expected_data)
 
+
+@pytest.mark.parametrize("decimals", [0, 1, 2])
+def test_round_series(request: Any, constructor: Any, decimals: int) -> None:
+    if "pyarrow_table" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
+    data = {"a": [1.12345, 2.56789, 3.901234]}
+    df_raw = constructor(data)
+    df = nw.from_native(df_raw, eager_only=True)
+
+    expected_data = {k: [round(e, decimals) for e in v] for k, v in data.items()}
     result_series = df["a"].round(decimals)
 
-    assert result_series.to_numpy().tolist() == expected_data["a"]
+    assert result_series.to_list() == expected_data["a"]
diff --git a/tests/expr_and_series/sample_test.py b/tests/expr_and_series/sample_test.py
index a19c686e6..d846cdab3 100644
--- a/tests/expr_and_series/sample_test.py
+++ b/tests/expr_and_series/sample_test.py
@@ -3,8 +3,8 @@
 import narwhals.stable.v1 as nw
 
 
-def test_expr_sample(constructor: Any) -> None:
-    df = nw.from_native(constructor({"a": [1, 2, 3], "b": [4, 5, 6]})).lazy()
+def test_expr_sample(constructor_lazy: Any) -> None:
+    df = nw.from_native(constructor_lazy({"a": [1, 2, 3], "b": [4, 5, 6]})).lazy()
 
     result_expr = df.select(nw.col("a").sample(n=2)).collect().shape
     expected_expr = (2, 1)
diff --git a/tests/expr_and_series/shift_test.py b/tests/expr_and_series/shift_test.py
index a330fa238..b82adac4b 100644
--- a/tests/expr_and_series/shift_test.py
+++ b/tests/expr_and_series/shift_test.py
@@ -13,11 +13,11 @@
 }
 
 
-def test_shift(request: Any, constructor: Any) -> None:
-    if "pyarrow_table" in str(constructor):
+def test_shift(request: Any, constructor_lazy: Any) -> None:
+    if "pyarrow_table" in str(constructor_lazy):
         request.applymarker(pytest.mark.xfail)
 
-    df = nw.from_native(constructor(data), eager_only=True)
+    df = nw.from_native(constructor_lazy(data))
     result = df.with_columns(nw.col("a", "b", "c").shift(2)).filter(nw.col("i") > 1)
     expected = {
         "i": [2, 3, 4],
@@ -26,6 +26,19 @@ def test_shift(request: Any, constructor: Any) -> None:
         "c": [5, 4, 3],
     }
     compare_dicts(result, expected)
+
+
+def test_shift_series(request: Any, constructor: Any) -> None:
+    if "pyarrow_table" in str(constructor):
+        request.applymarker(pytest.mark.xfail)
+
+    df = nw.from_native(constructor(data), eager_only=True)
+    expected = {
+        "i": [2, 3, 4],
+        "a": [0, 1, 2],
+        "b": [1, 2, 3],
+        "c": [5, 4, 3],
+    }
     result = df.select(
         df["i"],
         df["a"].shift(2),
diff --git a/tests/expr_and_series/sum_all_test.py b/tests/expr_and_series/sum_all_test.py
index 343cfec98..fdbffb3b1 100644
--- a/tests/expr_and_series/sum_all_test.py
+++ b/tests/expr_and_series/sum_all_test.py
@@ -6,8 +6,8 @@
 data = {"a": [1, 3, 2], "b": [4, 4, 6], "z": [7.0, 8, 9]}
 
 
-def test_sum_all_expr(constructor: Any) -> None:
-    df = nw.from_native(constructor(data), eager_only=True)
+def test_sum_all_expr(constructor_lazy: Any) -> None:
+    df = nw.from_native(constructor_lazy(data))
     result = df.select(nw.all().sum())
     expected = {"a": [6], "b": [14], "z": [24.0]}
     compare_dicts(result, expected)
diff --git a/tests/expr_and_series/sum_test.py b/tests/expr_and_series/sum_test.py
index 7ca488804..c4ab387f2 100644
--- a/tests/expr_and_series/sum_test.py
+++ b/tests/expr_and_series/sum_test.py
@@ -11,8 +11,8 @@
 
 
 @pytest.mark.parametrize("expr", [nw.col("a", "b", "z").sum(), nw.sum("a", "b", "z")])
-def test_expr_sum_expr(constructor: Any, expr: nw.Expr) -> None:
-    df = nw.from_native(constructor(data), eager_only=True)
+def test_expr_sum_expr(constructor_lazy: Any, expr: nw.Expr) -> None:
+    df = nw.from_native(constructor_lazy(data))
     result = df.select(expr)
     expected = {"a": [6], "b": [14], "z": [24.0]}
     compare_dicts(result, expected)
diff --git a/tests/frame/test_common.py b/tests/frame/test_common.py
index af0a27757..83006b033 100644
--- a/tests/frame/test_common.py
+++ b/tests/frame/test_common.py
@@ -112,10 +112,7 @@ def test_expr_binary(constructor: Any) -> None:
     compare_dicts(result, expected)
 
 
-def test_expr_transform(request: Any, constructor: Any) -> None:
-    if "pyarrow_table" in str(constructor):
-        request.applymarker(pytest.mark.xfail)
-
+def test_expr_transform(constructor: Any) -> None:
     df = nw.from_native(constructor(data))
     result = df.with_columns(a=nw.col("a").is_between(-1, 1), b=nw.col("b").is_in([4, 5]))
     expected = {"a": [True, False, False], "b": [True, True, False], "z": [7, 8, 9]}
diff --git a/tests/series_only/filter_test.py b/tests/series_only/filter_test.py
deleted file mode 100644
index 0d98c118e..000000000
--- a/tests/series_only/filter_test.py
+++ /dev/null
@@ -1,15 +0,0 @@
-from __future__ import annotations
-
-from typing import Any
-
-import numpy as np
-
-import narwhals as nw
-
-
-def test_filter(constructor: Any) -> None:
-    data = [1, 3, 2]
-    series = nw.from_native(constructor({"a": data}), eager_only=True)["a"]
-    result = series.filter(series > 1)
-    expected = np.array([3, 2])
-    assert (result.to_numpy() == expected).all()
diff --git a/tests/series_only/is_between_test.py b/tests/series_only/is_between_test.py
deleted file mode 100644
index 61c2993a0..000000000
--- a/tests/series_only/is_between_test.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from __future__ import annotations
-
-from typing import Any
-
-import pytest
-
-import narwhals.stable.v1 as nw
-from tests.utils import compare_dicts
-
-data = [1, 4, 2, 5]
-
-
-@pytest.mark.parametrize(
-    ("closed", "expected"),
-    [
-        ("left", [True, True, True, False]),
-        ("right", [False, True, True, True]),
-        ("both", [True, True, True, True]),
-        ("none", [False, True, True, False]),
-    ],
-)
-def test_is_between(constructor: Any, closed: str, expected: list[bool]) -> None:
-    ser = nw.from_native(constructor({"a": data}), eager_only=True)["a"]
-    result = ser.is_between(1, 5, closed=closed)
-    compare_dicts({"a": result}, {"a": expected})