diff --git a/README.md b/README.md index f37c5bb22..2349c5d5a 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,7 @@ def my_agnostic_function( weight_mean=nw.col("weight").mean(), weight_max=nw.col("weight").max(), ) + .sort("s") ) return nw.to_native(result) @@ -122,10 +123,10 @@ shape: (4, 3) │ --- ┆ --- ┆ --- │ │ str ┆ f64 ┆ f64 │ ╞═════╪═════════════╪════════════╡ +│ S1 ┆ 15.0 ┆ 19.0 │ │ S2 ┆ 14.5 ┆ 17.0 │ │ S3 ┆ 14.5 ┆ 17.0 │ │ S4 ┆ 15.0 ┆ 19.0 │ -│ S1 ┆ 15.0 ┆ 19.0 │ └─────┴─────────────┴────────────┘ ``` Magic! 🪄 diff --git a/tpch/notebooks/q1/execute.ipynb b/tpch/notebooks/q1/execute.ipynb index 0a4b95ad4..49cc4bdbb 100755 --- a/tpch/notebooks/q1/execute.ipynb +++ b/tpch/notebooks/q1/execute.ipynb @@ -288,7 +288,7 @@ "tool = 'pandas[pyarrow]'\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o q1_pandas_native(fn(lineitem))\n", - "results[tool+'native'] = timings.best" + "results[tool+'[native]'] = timings.best" ] }, { diff --git a/tpch/notebooks/q2/execute.ipynb b/tpch/notebooks/q2/execute.ipynb index 32bf97a99..f9a92e2ff 100755 --- a/tpch/notebooks/q2/execute.ipynb +++ b/tpch/notebooks/q2/execute.ipynb @@ -98,96 +98,23 @@ " var1 = 15\n", " var2 = \"BRASS\"\n", " var3 = \"EUROPE\"\n", - " nation_filtered = nation_ds.loc[:, [\"n_nationkey\", \"n_name\", \"n_regionkey\"]]\n", - " region_filtered = region_ds[(region_ds[\"r_name\"] == var3)]\n", - " region_filtered = region_filtered.loc[:, [\"r_regionkey\"]]\n", - " r_n_merged = nation_ds.merge(\n", - " region_ds, left_on=\"n_regionkey\", right_on=\"r_regionkey\", how=\"inner\"\n", - " )\n", - " # r_n_merged = r_n_merged.loc[:, [\"n_nationkey\", \"n_name\"]]\n", - " # supplier_filtered = supplier_ds.loc[\n", - " # :,\n", - " # [\n", - " # \"s_suppkey\",\n", - " # \"s_name\",\n", - " # \"s_address\",\n", - " # \"s_nationkey\",\n", - " # \"s_phone\",\n", - " # \"s_acctbal\",\n", - " # \"s_comment\",\n", - " # ],\n", - " # ]\n", - " s_r_n_merged = r_n_merged.merge(\n", - " supplier_ds,\n", - " left_on=\"n_nationkey\",\n", - " right_on=\"s_nationkey\",\n", - " how=\"inner\",\n", - " )\n", - " s_r_n_merged = s_r_n_merged.loc[\n", - " :,\n", - " [\n", - " \"n_name\",\n", - " \"s_suppkey\",\n", - " \"s_name\",\n", - " \"s_address\",\n", - " \"s_phone\",\n", - " \"s_acctbal\",\n", - " \"s_comment\",\n", - " ],\n", - " ]\n", - " partsupp_filtered = part_supp_ds.loc[\n", - " :, [\"ps_partkey\", \"ps_suppkey\", \"ps_supplycost\"]\n", - " ]\n", - " ps_s_r_n_merged = s_r_n_merged.merge(\n", - " partsupp_filtered, left_on=\"s_suppkey\", right_on=\"ps_suppkey\", how=\"inner\"\n", - " )\n", - " ps_s_r_n_merged = ps_s_r_n_merged.loc[\n", - " :,\n", - " [\n", - " \"n_name\",\n", - " \"s_name\",\n", - " \"s_address\",\n", - " \"s_phone\",\n", - " \"s_acctbal\",\n", - " \"s_comment\",\n", - " \"ps_partkey\",\n", - " \"ps_supplycost\",\n", - " ],\n", - " ]\n", - " part_filtered = part_ds.loc[:, [\"p_partkey\", \"p_mfgr\", \"p_size\", \"p_type\"]]\n", - " part_filtered = part_filtered[\n", - " (part_filtered[\"p_size\"] == var1)\n", - " & (part_filtered[\"p_type\"].str.endswith(var2))\n", - " ]\n", - " part_filtered = part_filtered.loc[:, [\"p_partkey\", \"p_mfgr\"]]\n", - " merged_df = part_filtered.merge(\n", - " ps_s_r_n_merged, left_on=\"p_partkey\", right_on=\"ps_partkey\", how=\"inner\"\n", - " )\n", - " merged_df = merged_df.loc[\n", - " :,\n", - " [\n", - " \"n_name\",\n", - " \"s_name\",\n", - " \"s_address\",\n", - " \"s_phone\",\n", - " \"s_acctbal\",\n", - " \"s_comment\",\n", - " \"ps_supplycost\",\n", - " \"p_partkey\",\n", - " \"p_mfgr\",\n", - " ],\n", - " ]\n", - " min_values = merged_df.groupby(\"p_partkey\", as_index=False)[\n", - " \"ps_supplycost\"\n", - " ].min()\n", - " min_values.columns = [\"P_PARTKEY_CPY\", \"MIN_SUPPLYCOST\"]\n", - " merged_df = merged_df.merge(\n", - " min_values,\n", - " left_on=[\"p_partkey\", \"ps_supplycost\"],\n", - " right_on=[\"P_PARTKEY_CPY\", \"MIN_SUPPLYCOST\"],\n", - " how=\"inner\",\n", + "\n", + " jn = (\n", + " part_ds.merge(part_supp_ds, left_on=\"p_partkey\", right_on=\"ps_partkey\")\n", + " .merge(supplier_ds, left_on=\"ps_suppkey\", right_on=\"s_suppkey\")\n", + " .merge(nation_ds, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n", + " .merge(region_ds, left_on=\"n_regionkey\", right_on=\"r_regionkey\")\n", " )\n", - " result_df = merged_df.loc[\n", + "\n", + " jn = jn[jn[\"p_size\"] == var1]\n", + " jn = jn[jn[\"p_type\"].str.endswith(var2)]\n", + " jn = jn[jn[\"r_name\"] == var3]\n", + "\n", + " gb = jn.groupby(\"p_partkey\", as_index=False)\n", + " agg = gb[\"ps_supplycost\"].min()\n", + " jn2 = agg.merge(jn, on=[\"p_partkey\", \"ps_supplycost\"])\n", + "\n", + " sel = jn2.loc[\n", " :,\n", " [\n", " \"s_acctbal\",\n", @@ -200,20 +127,12 @@ " \"s_comment\",\n", " ],\n", " ]\n", - " result_df = result_df.sort_values(\n", - " by=[\n", - " \"s_acctbal\",\n", - " \"n_name\",\n", - " \"s_name\",\n", - " \"p_partkey\",\n", - " ],\n", - " ascending=[\n", - " False,\n", - " True,\n", - " True,\n", - " True,\n", - " ],\n", - " ).head(100)\n", + "\n", + " sort = sel.sort_values(\n", + " by=[\"s_acctbal\", \"n_name\", \"s_name\", \"p_partkey\"],\n", + " ascending=[False, True, True, True],\n", + " )\n", + " result_df = sort.head(100)\n", "\n", " return result_df # type: ignore[no-any-return]" ] @@ -411,7 +330,7 @@ "tool = 'pandas[pyarrow]'\n", "fn = IO_FUNCS[tool]\n", "timings = %timeit -o q2_pandas_native(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n", - "results[tool+'native'] = timings.best" + "results[tool+'[native]'] = timings.best" ] }, { diff --git a/tpch/notebooks/q3/execute.ipynb b/tpch/notebooks/q3/execute.ipynb index 2dbace1e7..3c6ead597 100755 --- a/tpch/notebooks/q3/execute.ipynb +++ b/tpch/notebooks/q3/execute.ipynb @@ -79,6 +79,47 @@ "pd.options.future.infer_string = True" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "3497be7d", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Any\n", + "from datetime import date\n", + "\n", + "def q3_pandas_native(\n", + " customer_ds: Any,\n", + " line_item_ds: Any,\n", + " orders_ds: Any,\n", + "):\n", + " var1 = \"BUILDING\"\n", + " var2 = date(1995, 3, 15)\n", + "\n", + " fcustomer = customer_ds[customer_ds[\"c_mktsegment\"] == var1]\n", + "\n", + " jn1 = fcustomer.merge(orders_ds, left_on=\"c_custkey\", right_on=\"o_custkey\")\n", + " jn2 = jn1.merge(line_item_ds, left_on=\"o_orderkey\", right_on=\"l_orderkey\")\n", + "\n", + " jn2 = jn2[jn2[\"o_orderdate\"] < var2]\n", + " jn2 = jn2[jn2[\"l_shipdate\"] > var2]\n", + " jn2[\"revenue\"] = jn2.l_extendedprice * (1 - jn2.l_discount)\n", + "\n", + " gb = jn2.groupby(\n", + " [\"o_orderkey\", \"o_orderdate\", \"o_shippriority\"], as_index=False\n", + " )\n", + " agg = gb[\"revenue\"].sum()\n", + "\n", + " sel = agg.loc[:, [\"o_orderkey\", \"revenue\", \"o_orderdate\", \"o_shippriority\"]]\n", + " sel = sel.rename({\"o_orderkey\": \"l_orderkey\"}, axis=\"columns\")\n", + "\n", + " sorted = sel.sort_values(by=[\"revenue\", \"o_orderdate\"], ascending=[False, True])\n", + " result_df = sorted.head(10)\n", + "\n", + " return result_df # type: ignore[no-any-return]" + ] + }, { "cell_type": "code", "execution_count": 4, @@ -217,6 +258,44 @@ "results = {}" ] }, + { + "cell_type": "markdown", + "id": "6bfedb9e", + "metadata": {}, + "source": [ + "## pandas, pyarrow dtypes, native" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51d52b99", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24 s ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + }, + { + "data": { + "text/plain": [ + "23.841894793999984" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tool = 'pandas[pyarrow]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q3_pandas_native(fn(customer), fn(lineitem), fn(orders))\n", + "results[tool+'[native]'] = timings.best" + ] + }, { "cell_type": "markdown", "id": "09249944", diff --git a/tpch/notebooks/q4/execute.ipynb b/tpch/notebooks/q4/execute.ipynb index bfaa48ce1..e06538c02 100755 --- a/tpch/notebooks/q4/execute.ipynb +++ b/tpch/notebooks/q4/execute.ipynb @@ -79,6 +79,41 @@ "pd.options.future.infer_string = True" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "580ac2a5", + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import date\n", + "from typing import Any\n", + "\n", + "def q4_pandas_native(\n", + " line_item_ds: Any,\n", + " orders_ds: Any,\n", + "):\n", + " var1 = date(1993, 7, 1)\n", + " var2 = date(1993, 10, 1)\n", + "\n", + " jn = line_item_ds.merge(orders_ds, left_on=\"l_orderkey\", right_on=\"o_orderkey\")\n", + "\n", + " jn = jn[\n", + " (jn[\"o_orderdate\"] < var2)\n", + " & (jn[\"o_orderdate\"] >= var1)\n", + " & (jn[\"l_commitdate\"] < jn[\"l_receiptdate\"])\n", + " ]\n", + "\n", + " jn = jn.drop_duplicates(subset=[\"o_orderpriority\", \"l_orderkey\"])\n", + "\n", + " gb = jn.groupby(\"o_orderpriority\", as_index=False)\n", + " agg = gb.agg(order_count=pd.NamedAgg(column=\"o_orderkey\", aggfunc=\"count\"))\n", + "\n", + " result_df = agg.sort_values([\"o_orderpriority\"])\n", + "\n", + " return result_df # type: ignore[no-any-return]" + ] + }, { "cell_type": "code", "execution_count": 4, @@ -203,6 +238,44 @@ "results = {}" ] }, + { + "cell_type": "markdown", + "id": "956e9675", + "metadata": {}, + "source": [ + "## pandas, pyarrow dtype, via Narwhals" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca578422", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24 s ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + }, + { + "data": { + "text/plain": [ + "23.841894793999984" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tool = 'pandas[pyarrow]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q4_pandas_native(fn(lineitem), fn(orders))\n", + "results[tool+'[native]'] = timings.best" + ] + }, { "cell_type": "markdown", "id": "09249944", diff --git a/tpch/notebooks/q5/execute.ipynb b/tpch/notebooks/q5/execute.ipynb index b72fb7fab..c75296180 100755 --- a/tpch/notebooks/q5/execute.ipynb +++ b/tpch/notebooks/q5/execute.ipynb @@ -79,6 +79,48 @@ "pd.options.future.infer_string = True" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "690c74b3", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Any\n", + "from datetime import date\n", + "\n", + "def q5_pandas_native(\n", + " region_ds: Any,\n", + " nation_ds: Any,\n", + " customer_ds: Any,\n", + " line_item_ds: Any,\n", + " orders_ds: Any,\n", + " supplier_ds: Any,\n", + "):\n", + " var1 = \"ASIA\"\n", + " var2 = date(1994, 1, 1)\n", + " var3 = date(1995, 1, 1)\n", + "\n", + " jn1 = region_ds.merge(nation_ds, left_on=\"r_regionkey\", right_on=\"n_regionkey\")\n", + " jn2 = jn1.merge(customer_ds, left_on=\"n_nationkey\", right_on=\"c_nationkey\")\n", + " jn3 = jn2.merge(orders_ds, left_on=\"c_custkey\", right_on=\"o_custkey\")\n", + " jn4 = jn3.merge(line_item_ds, left_on=\"o_orderkey\", right_on=\"l_orderkey\")\n", + " jn5 = jn4.merge(\n", + " supplier_ds,\n", + " left_on=[\"l_suppkey\", \"n_nationkey\"],\n", + " right_on=[\"s_suppkey\", \"s_nationkey\"],\n", + " )\n", + "\n", + " jn5 = jn5[jn5[\"r_name\"] == var1]\n", + " jn5 = jn5[(jn5[\"o_orderdate\"] >= var2) & (jn5[\"o_orderdate\"] < var3)]\n", + " jn5[\"revenue\"] = jn5.l_extendedprice * (1.0 - jn5.l_discount)\n", + "\n", + " gb = jn5.groupby(\"n_name\", as_index=False)[\"revenue\"].sum()\n", + " result_df = gb.sort_values(\"revenue\", ascending=False)\n", + "\n", + " return result_df # type: ignore[no-any-return]" + ] + }, { "cell_type": "code", "execution_count": 4, @@ -222,6 +264,44 @@ "results = {}" ] }, + { + "cell_type": "markdown", + "id": "3d76284a", + "metadata": {}, + "source": [ + "## pandas, pyarrow dtypes, native" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0fd7e705", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24 s ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + }, + { + "data": { + "text/plain": [ + "23.841894793999984" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tool = 'pandas[pyarrow]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q5_pandas_native(fn(region), fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", + "results[tool+'[native]'] = timings.best" + ] + }, { "cell_type": "markdown", "id": "09249944", diff --git a/tpch/notebooks/q6/execute.ipynb b/tpch/notebooks/q6/execute.ipynb index a1d3f4766..3e65496a2 100755 --- a/tpch/notebooks/q6/execute.ipynb +++ b/tpch/notebooks/q6/execute.ipynb @@ -75,8 +75,37 @@ "import pandas as pd\n", "import polars as pl\n", "\n", - "pd.options.mode.copy_on_write = True\n", - "pd.options.future.infer_string = True" + "pd.options.mode.copy_on_write = True" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b4dcf55", + "metadata": {}, + "outputs": [], + "source": [ + "from datetime import date\n", + "\n", + "def q6_pandas_native(line_item_ds):\n", + " var1 = date(1994, 1, 1)\n", + " var2 = date(1995, 1, 1)\n", + " var3 = 0.05\n", + " var4 = 0.07\n", + " var5 = 24\n", + "\n", + " flineitem = line_item_ds[\n", + " (line_item_ds[\"l_shipdate\"] >= var1)\n", + " & (line_item_ds[\"l_shipdate\"] < var2)\n", + " & (line_item_ds[\"l_discount\"] >= var3)\n", + " & (line_item_ds[\"l_discount\"] <= var4)\n", + " & (line_item_ds[\"l_quantity\"] < var5)\n", + " ]\n", + "\n", + " result_value = (flineitem[\"l_extendedprice\"] * flineitem[\"l_discount\"]).sum()\n", + " result_df = pd.DataFrame({\"revenue\": [result_value]})\n", + "\n", + " return result_df" ] }, { @@ -198,6 +227,44 @@ "results = {}" ] }, + { + "cell_type": "markdown", + "id": "56b73231", + "metadata": {}, + "source": [ + "## pandas, pyarrow dtypes, native" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bbea3aa9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "24 s ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + }, + { + "data": { + "text/plain": [ + "23.841894793999984" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tool = 'pandas[pyarrow]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q6_pandas_native(fn(lineitem))\n", + "results[tool+'[native]'] = timings.best" + ] + }, { "cell_type": "markdown", "id": "09249944", diff --git a/tpch/notebooks/q7/execute.ipynb b/tpch/notebooks/q7/execute.ipynb index 0fc794dd4..8b884df61 100755 --- a/tpch/notebooks/q7/execute.ipynb +++ b/tpch/notebooks/q7/execute.ipynb @@ -89,6 +89,65 @@ "pd.options.future.infer_string = True" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "3c83ec1b", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Any\n", + "from datetime import datetime, date\n", + "import narwhals as nw\n", + "\n", + "def q7_pandas_native(\n", + " nation_ds,\n", + " customer_ds,\n", + " line_item_ds,\n", + " orders_ds,\n", + " supplier_ds,\n", + ") -> None:\n", + " var1 = \"FRANCE\"\n", + " var2 = \"GERMANY\"\n", + " var3 = date(1995, 1, 1)\n", + " var4 = date(1996, 12, 31)\n", + "\n", + " n1 = nation_ds[(nation_ds[\"n_name\"] == var1)]\n", + " n2 = nation_ds[(nation_ds[\"n_name\"] == var2)]\n", + "\n", + " # Part 1\n", + " jn1 = customer_ds.merge(n1, left_on=\"c_nationkey\", right_on=\"n_nationkey\")\n", + " jn2 = jn1.merge(orders_ds, left_on=\"c_custkey\", right_on=\"o_custkey\")\n", + " jn2 = jn2.rename({\"n_name\": \"cust_nation\"}, axis=\"columns\")\n", + " jn3 = jn2.merge(line_item_ds, left_on=\"o_orderkey\", right_on=\"l_orderkey\")\n", + " jn4 = jn3.merge(supplier_ds, left_on=\"l_suppkey\", right_on=\"s_suppkey\")\n", + " jn5 = jn4.merge(n2, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n", + " df1 = jn5.rename({\"n_name\": \"supp_nation\"}, axis=\"columns\")\n", + "\n", + " # Part 2\n", + " jn1 = customer_ds.merge(n2, left_on=\"c_nationkey\", right_on=\"n_nationkey\")\n", + " jn2 = jn1.merge(orders_ds, left_on=\"c_custkey\", right_on=\"o_custkey\")\n", + " jn2 = jn2.rename({\"n_name\": \"cust_nation\"}, axis=\"columns\")\n", + " jn3 = jn2.merge(line_item_ds, left_on=\"o_orderkey\", right_on=\"l_orderkey\")\n", + " jn4 = jn3.merge(supplier_ds, left_on=\"l_suppkey\", right_on=\"s_suppkey\")\n", + " jn5 = jn4.merge(n1, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n", + " df2 = jn5.rename({\"n_name\": \"supp_nation\"}, axis=\"columns\")\n", + "\n", + " # Combine\n", + " total = pd.concat([df1, df2])\n", + "\n", + " total = total[(total[\"l_shipdate\"] >= var3) & (total[\"l_shipdate\"] <= var4)]\n", + " total[\"volume\"] = total[\"l_extendedprice\"] * (1.0 - total[\"l_discount\"])\n", + " total[\"l_year\"] = total[\"l_shipdate\"].dt.year\n", + "\n", + " gb = total.groupby([\"supp_nation\", \"cust_nation\", \"l_year\"], as_index=False)\n", + " agg = gb.agg(revenue=pd.NamedAgg(column=\"volume\", aggfunc=\"sum\"))\n", + "\n", + " result_df = agg.sort_values(by=[\"supp_nation\", \"cust_nation\", \"l_year\"])\n", + "\n", + " return result_df # type: ignore[no-any-return]" + ] + }, { "cell_type": "code", "execution_count": 2, @@ -241,6 +300,44 @@ "results = {}" ] }, + { + "cell_type": "markdown", + "id": "12824d5d", + "metadata": {}, + "source": [ + "## pandas, pyarrow dtypes, native" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce229598", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "20.2 s ± 5.8 s per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + }, + { + "data": { + "text/plain": [ + "16.42582530300001" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "tool = 'pandas[pyarrow]'\n", + "fn = IO_FUNCS[tool]\n", + "timings = %timeit -o q7_pandas_native(fn(nation), fn(customer), fn(lineitem), fn(orders), fn(supplier))\n", + "results[tool+'[native]'] = timings.best" + ] + }, { "cell_type": "markdown", "id": "09249944",