From f8f368302409138ffadce17bf0ecf6336f9e592f Mon Sep 17 00:00:00 2001 From: Marco Edward Gorelli Date: Sun, 17 Nov 2024 12:14:01 +0000 Subject: [PATCH] docs: Add page about pandas booleans (#1392) --------- Co-authored-by: Francesco Bruzzesi <42817048+FBruzzesi@users.noreply.github.com> --- .github/workflows/extremes.yml | 2 +- docs/pandas_like_concepts/boolean.md | 61 +++++++++++++++++++ .../column_names.md | 0 .../pandas_index.md | 0 .../user_warning.md | 0 mkdocs.yml | 7 ++- narwhals/_arrow/dataframe.py | 2 +- narwhals/_arrow/group_by.py | 4 +- narwhals/_arrow/series.py | 2 +- narwhals/_arrow/utils.py | 6 +- 10 files changed, 75 insertions(+), 9 deletions(-) create mode 100644 docs/pandas_like_concepts/boolean.md rename docs/{other => pandas_like_concepts}/column_names.md (100%) rename docs/{other => pandas_like_concepts}/pandas_index.md (100%) rename docs/{other => pandas_like_concepts}/user_warning.md (100%) diff --git a/.github/workflows/extremes.yml b/.github/workflows/extremes.yml index fd6a7cfb2..4b89bb630 100644 --- a/.github/workflows/extremes.yml +++ b/.github/workflows/extremes.yml @@ -78,7 +78,7 @@ jobs: enable-cache: "true" cache-suffix: ${{ matrix.python-version }} cache-dependency-glob: "**requirements*.txt" - - name: install-minimum-versions + - name: install-not-so-old-versions run: uv pip install tox virtualenv setuptools pandas==2.0.3 polars==0.20.8 numpy==1.24.4 pyarrow==14.0.0 scipy==1.8.0 scikit-learn==1.3.0 dask[dataframe]==2024.7 tzdata --system - name: install-reqs run: uv pip install -r requirements-dev.txt --system diff --git a/docs/pandas_like_concepts/boolean.md b/docs/pandas_like_concepts/boolean.md new file mode 100644 index 000000000..fe60904cb --- /dev/null +++ b/docs/pandas_like_concepts/boolean.md @@ -0,0 +1,61 @@ +# Boolean columns + +Generally speaking, Narwhals operations preserve null values. +For example, if you do `nw.col('a')*2`, then: + +- Values which were non-null get multiplied by 2. +- Null values stay null. 
+ +```python exec="1" source="above" session="boolean" result="python" +import narwhals as nw + +import pandas as pd +import polars as pl +import pyarrow as pa + +data = {"a": [1.4, None, 4.2]} +print("pandas output") +print(nw.from_native(pd.DataFrame(data)).with_columns(b=nw.col("a") * 2).to_native()) +print("\nPolars output") +print(nw.from_native(pl.DataFrame(data)).with_columns(b=nw.col("a") * 2).to_native()) +print("\nPyArrow output") +print(nw.from_native(pa.table(data)).with_columns(b=nw.col("a") * 2).to_native()) +``` + +What do we do, however, when the result column is boolean? For +example, `nw.col('a') > 0`? +Unfortunately, this is backend-dependent: + +- for all backends except pandas, null values are preserved +- for pandas, this depends on the dtype backend: + - for PyArrow dtypes and pandas nullable dtypes, null + values are preserved + - for the classic NumPy dtypes, null values are typically + filled in with `False`. + +pandas is generally moving towards nullable dtypes, and they +[may become the default in the future](https://github.com/pandas-dev/pandas/pull/58988), +so we hope that the classic NumPy dtypes' lack of support for null values will just +be a temporary legacy pandas issue which will eventually go +away. 
+ +```python exec="1" source="above" session="boolean" result="python" +print("pandas output") +print(nw.from_native(pd.DataFrame(data)).with_columns(b=nw.col("a") > 2).to_native()) +print("\npandas (nullable dtypes) output") +print( + nw.from_native(pd.DataFrame(data, dtype="Float64")) + .with_columns(b=nw.col("a") > 2) + .to_native() +) +print("\npandas (pyarrow dtypes) output") +print( + nw.from_native(pd.DataFrame(data, dtype="Float64[pyarrow]")) + .with_columns(b=nw.col("a") > 2) + .to_native() +) +print("\nPolars output") +print(nw.from_native(pl.DataFrame(data)).with_columns(b=nw.col("a") > 2).to_native()) +print("\nPyArrow output") +print(nw.from_native(pa.table(data)).with_columns(b=nw.col("a") > 2).to_native()) +``` \ No newline at end of file diff --git a/docs/other/column_names.md b/docs/pandas_like_concepts/column_names.md similarity index 100% rename from docs/other/column_names.md rename to docs/pandas_like_concepts/column_names.md diff --git a/docs/other/pandas_index.md b/docs/pandas_like_concepts/pandas_index.md similarity index 100% rename from docs/other/pandas_index.md rename to docs/pandas_like_concepts/pandas_index.md diff --git a/docs/other/user_warning.md b/docs/pandas_like_concepts/user_warning.md similarity index 100% rename from docs/other/user_warning.md rename to docs/pandas_like_concepts/user_warning.md diff --git a/mkdocs.yml b/mkdocs.yml index 4c3369655..e7268e2b4 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -12,9 +12,10 @@ nav: - basics/complete_example.md - basics/dataframe_conversion.md - Pandas-like concepts: - - other/pandas_index.md - - other/user_warning.md - - other/column_names.md + - pandas_like_concepts/pandas_index.md + - pandas_like_concepts/user_warning.md + - pandas_like_concepts/column_names.md + - pandas_like_concepts/boolean.md - overhead.md - backcompat.md - extending.md diff --git a/narwhals/_arrow/dataframe.py b/narwhals/_arrow/dataframe.py index e6c5bfd8e..2ae8d7617 100644 --- a/narwhals/_arrow/dataframe.py +++ 
b/narwhals/_arrow/dataframe.py @@ -68,7 +68,7 @@ def __narwhals_dataframe__(self) -> Self: def __narwhals_lazyframe__(self) -> Self: return self - def _from_native_frame(self, df: Any) -> Self: + def _from_native_frame(self, df: pa.Table) -> Self: return self.__class__( df, backend_version=self._backend_version, dtypes=self._dtypes ) diff --git a/narwhals/_arrow/group_by.py b/narwhals/_arrow/group_by.py index 78777e2c4..e5f0bb8c1 100644 --- a/narwhals/_arrow/group_by.py +++ b/narwhals/_arrow/group_by.py @@ -12,6 +12,8 @@ from narwhals.utils import remove_prefix if TYPE_CHECKING: + import pyarrow as pa + from narwhals._arrow.dataframe import ArrowDataFrame from narwhals._arrow.expr import ArrowExpr from narwhals._arrow.typing import IntoArrowExpr @@ -115,7 +117,7 @@ def __iter__(self) -> Iterator[tuple[Any, ArrowDataFrame]]: def agg_arrow( - grouped: Any, + grouped: pa.TableGroupBy, exprs: list[ArrowExpr], keys: list[str], output_names: list[str], diff --git a/narwhals/_arrow/series.py b/narwhals/_arrow/series.py index 75a3cabde..133bf2165 100644 --- a/narwhals/_arrow/series.py +++ b/narwhals/_arrow/series.py @@ -44,7 +44,7 @@ def __init__( self._backend_version = backend_version self._dtypes = dtypes - def _from_native_series(self, series: Any) -> Self: + def _from_native_series(self, series: pa.ChunkedArray | pa.Array) -> Self: import pyarrow as pa # ignore-banned-import() if isinstance(series, pa.Array): diff --git a/narwhals/_arrow/utils.py b/narwhals/_arrow/utils.py index 272a7ecdc..0a9fd2b80 100644 --- a/narwhals/_arrow/utils.py +++ b/narwhals/_arrow/utils.py @@ -15,7 +15,7 @@ from narwhals.typing import DTypes -def native_to_narwhals_dtype(dtype: Any, dtypes: DTypes) -> DType: +def native_to_narwhals_dtype(dtype: pa.DataType, dtypes: DTypes) -> DType: import pyarrow as pa # ignore-banned-import if pa.types.is_int64(dtype): @@ -284,7 +284,9 @@ def floordiv_compat(left: Any, right: Any) -> Any: return result -def cast_for_truediv(arrow_array: Any, pa_object: 
Any) -> tuple[Any, Any]: +def cast_for_truediv( + arrow_array: pa.ChunkedArray | pa.Scalar, pa_object: pa.ChunkedArray | pa.Scalar +) -> tuple[pa.ChunkedArray | pa.Scalar, pa.ChunkedArray | pa.Scalar]: # Lifted from: # https://github.com/pandas-dev/pandas/blob/262fcfbffcee5c3116e86a951d8b693f90411e68/pandas/core/arrays/arrow/array.py#L108-L122 import pyarrow as pa # ignore-banned-import