diff --git a/docs/pandas_like_concepts/boolean.md b/docs/pandas_like_concepts/boolean.md index c1b434f57..227ab2eee 100644 --- a/docs/pandas_like_concepts/boolean.md +++ b/docs/pandas_like_concepts/boolean.md @@ -8,12 +8,12 @@ For example, if you do `nw.col('a')*2`, then: ```python exec="1" source="above" session="boolean" import narwhals as nw -from narwhals.typing import FrameT +from narwhals.typing import IntoFrameT data = {"a": [1.4, None, 4.2]} -def multiplication(df: FrameT) -> FrameT: +def multiplication(df: IntoFrameT) -> IntoFrameT: return nw.from_native(df).with_columns((nw.col("a") * 2).alias("a*2")).to_native() ``` diff --git a/docs/pandas_like_concepts/column_names.md b/docs/pandas_like_concepts/column_names.md index 14346a303..68e7886d4 100644 --- a/docs/pandas_like_concepts/column_names.md +++ b/docs/pandas_like_concepts/column_names.md @@ -2,12 +2,11 @@ Polars and PyArrow only allow for string column names. What about pandas? -```python ->>> import pandas as pd ->>> pd.concat([pd.Series([1, 2], name=0), pd.Series([1, 3], name=0)], axis=1) - 0 0 -0 1 1 -1 2 3 +```python exec="true" source="above" result="python" session="col_names" +import pandas as pd + +df = pd.concat([pd.Series([1, 2], name=0), pd.Series([1, 3], name=0)], axis=1) +print(df) ``` Oh...not only does it let us create a dataframe with a column named `0` - it lets us diff --git a/docs/pandas_like_concepts/pandas_index.md b/docs/pandas_like_concepts/pandas_index.md index ccc38d490..aa5f11bcf 100644 --- a/docs/pandas_like_concepts/pandas_index.md +++ b/docs/pandas_like_concepts/pandas_index.md @@ -20,9 +20,10 @@ Let's learn about what Narwhals promises. ```python exec="1" source="above" session="ex1" import narwhals as nw +from narwhals.typing import IntoFrameT -def my_func(df): +def my_func(df: IntoFrameT) -> IntoFrameT: df = nw.from_native(df) df = df.with_columns(a_plus_one=nw.col("a") + 1) return nw.to_native(df) @@ -51,13 +52,16 @@ df_pd = pd.DataFrame({"a": [2, 1, 3], "b": [4, 5, 6]}) s_pd = df_pd["a"].sort_values() df_pd["a_sorted"] = s_pd ``` + Reading the code, you might expect that `'a_sorted'` will contain the values `[1, 2, 3]`. **However**, here's what actually happens: + ```python exec="1" source="material-block" session="ex2" result="python" print(df_pd) ``` + In other words, pandas' index alignment undid the `sort_values` operation! Narwhals, on the other hand, preserves the index of the left-hand-side argument. diff --git a/docs/pandas_like_concepts/user_warning.md b/docs/pandas_like_concepts/user_warning.md index 60e20d4d6..dd95291a8 100644 --- a/docs/pandas_like_concepts/user_warning.md +++ b/docs/pandas_like_concepts/user_warning.md @@ -14,6 +14,7 @@ The pandas API most likely cannot efficiently handle the complexity of the aggre ```python exec="true" source="above" result="python" session="df_ex1" import narwhals as nw import pandas as pd + from narwhals.typing import IntoFrameT data = {"a": [1, 2, 3, 4, 5], "b": [5, 4, 3, 2, 1], "c": [10, 20, 30, 40, 50]} @@ -21,7 +22,7 @@ The pandas API most likely cannot efficiently handle the complexity of the aggre @nw.narwhalify - def approach_1(df): + def approach_1(df: IntoFrameT) -> IntoFrameT: # Pay attention to this next line df = df.group_by("a").agg(d=(nw.col("b") + nw.col("c")).sum()) @@ -43,7 +44,7 @@ The pandas API most likely cannot efficiently handle the complexity of the aggre @nw.narwhalify - def approach_2(df): + def approach_2(df: IntoFrameT) -> IntoFrameT: # Pay attention to this next line df = df.with_columns(d=nw.col("b") + nw.col("c")).group_by("a").agg(nw.sum("d")) @@ -54,7 +55,6 @@ The pandas API most likely cannot efficiently handle the complexity of the aggre print(approach_2(df_pd)) ``` - Both Approaches shown above return the exact same result, but Approach 1 is inefficient and returns the warning message we showed at the top. @@ -62,6 +62,7 @@ What makes the first approach inefficient and the second approach efficient? It pandas API lets us express. ## Approach 1 + ```python # From line 11 @@ -69,15 +70,18 @@ return df.group_by("a").agg((nw.col("b") + nw.col("c")).sum().alias("d")) ``` To translate this to pandas, we would do: + ```python df.groupby("a").apply( lambda df: pd.Series([(df["b"] + df["c"]).sum()], index=["d"]), include_groups=False ) ``` + Any time you use `apply` in pandas, that's a performance footgun - best to avoid it and use vectorised operations instead. Let's take a look at how "approach 2" gets translated to pandas to see the difference. ## Approach 2 + ```python # Line 11 in Approach 2 @@ -85,9 +89,11 @@ return df.with_columns(d=nw.col("b") + nw.col("c")).group_by("a").agg({"d": "sum ``` This gets roughly translated to: + ```python df.assign(d=lambda df: df["b"] + df["c"]).groupby("a").agg({"d": "sum"}) ``` + Because we're using pandas' own API, as opposed to `apply` and a custom `lambda` function, then this is going to be much more efficient. ## Tips for Avoiding the `UserWarning`