Skip to content

Commit

Permalink
use sort in readme example
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcoGorelli committed Apr 11, 2024
1 parent 1a335c7 commit d5e03f9
Show file tree
Hide file tree
Showing 8 changed files with 424 additions and 108 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ def my_agnostic_function(
weight_mean=nw.col("weight").mean(),
weight_max=nw.col("weight").max(),
)
.sort("s")
)

return nw.to_native(result)
Expand Down Expand Up @@ -122,10 +123,10 @@ shape: (4, 3)
│ --- ┆ --- ┆ --- │
│ str ┆ f64 ┆ f64 │
╞═════╪═════════════╪════════════╡
│ S1 ┆ 15.0 ┆ 19.0 │
│ S2 ┆ 14.5 ┆ 17.0 │
│ S3 ┆ 14.5 ┆ 17.0 │
│ S4 ┆ 15.0 ┆ 19.0 │
│ S1 ┆ 15.0 ┆ 19.0 │
└─────┴─────────────┴────────────┘
```
Magic! 🪄
Expand Down
2 changes: 1 addition & 1 deletion tpch/notebooks/q1/execute.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -288,7 +288,7 @@
"tool = 'pandas[pyarrow]'\n",
"fn = IO_FUNCS[tool]\n",
"timings = %timeit -o q1_pandas_native(fn(lineitem))\n",
"results[tool+'native'] = timings.best"
"results[tool+'[native]'] = timings.best"
]
},
{
Expand Down
127 changes: 23 additions & 104 deletions tpch/notebooks/q2/execute.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -98,96 +98,23 @@
" var1 = 15\n",
" var2 = \"BRASS\"\n",
" var3 = \"EUROPE\"\n",
" nation_filtered = nation_ds.loc[:, [\"n_nationkey\", \"n_name\", \"n_regionkey\"]]\n",
" region_filtered = region_ds[(region_ds[\"r_name\"] == var3)]\n",
" region_filtered = region_filtered.loc[:, [\"r_regionkey\"]]\n",
" r_n_merged = nation_ds.merge(\n",
" region_ds, left_on=\"n_regionkey\", right_on=\"r_regionkey\", how=\"inner\"\n",
" )\n",
" # r_n_merged = r_n_merged.loc[:, [\"n_nationkey\", \"n_name\"]]\n",
" # supplier_filtered = supplier_ds.loc[\n",
" # :,\n",
" # [\n",
" # \"s_suppkey\",\n",
" # \"s_name\",\n",
" # \"s_address\",\n",
" # \"s_nationkey\",\n",
" # \"s_phone\",\n",
" # \"s_acctbal\",\n",
" # \"s_comment\",\n",
" # ],\n",
" # ]\n",
" s_r_n_merged = r_n_merged.merge(\n",
" supplier_ds,\n",
" left_on=\"n_nationkey\",\n",
" right_on=\"s_nationkey\",\n",
" how=\"inner\",\n",
" )\n",
" s_r_n_merged = s_r_n_merged.loc[\n",
" :,\n",
" [\n",
" \"n_name\",\n",
" \"s_suppkey\",\n",
" \"s_name\",\n",
" \"s_address\",\n",
" \"s_phone\",\n",
" \"s_acctbal\",\n",
" \"s_comment\",\n",
" ],\n",
" ]\n",
" partsupp_filtered = part_supp_ds.loc[\n",
" :, [\"ps_partkey\", \"ps_suppkey\", \"ps_supplycost\"]\n",
" ]\n",
" ps_s_r_n_merged = s_r_n_merged.merge(\n",
" partsupp_filtered, left_on=\"s_suppkey\", right_on=\"ps_suppkey\", how=\"inner\"\n",
" )\n",
" ps_s_r_n_merged = ps_s_r_n_merged.loc[\n",
" :,\n",
" [\n",
" \"n_name\",\n",
" \"s_name\",\n",
" \"s_address\",\n",
" \"s_phone\",\n",
" \"s_acctbal\",\n",
" \"s_comment\",\n",
" \"ps_partkey\",\n",
" \"ps_supplycost\",\n",
" ],\n",
" ]\n",
" part_filtered = part_ds.loc[:, [\"p_partkey\", \"p_mfgr\", \"p_size\", \"p_type\"]]\n",
" part_filtered = part_filtered[\n",
" (part_filtered[\"p_size\"] == var1)\n",
" & (part_filtered[\"p_type\"].str.endswith(var2))\n",
" ]\n",
" part_filtered = part_filtered.loc[:, [\"p_partkey\", \"p_mfgr\"]]\n",
" merged_df = part_filtered.merge(\n",
" ps_s_r_n_merged, left_on=\"p_partkey\", right_on=\"ps_partkey\", how=\"inner\"\n",
" )\n",
" merged_df = merged_df.loc[\n",
" :,\n",
" [\n",
" \"n_name\",\n",
" \"s_name\",\n",
" \"s_address\",\n",
" \"s_phone\",\n",
" \"s_acctbal\",\n",
" \"s_comment\",\n",
" \"ps_supplycost\",\n",
" \"p_partkey\",\n",
" \"p_mfgr\",\n",
" ],\n",
" ]\n",
" min_values = merged_df.groupby(\"p_partkey\", as_index=False)[\n",
" \"ps_supplycost\"\n",
" ].min()\n",
" min_values.columns = [\"P_PARTKEY_CPY\", \"MIN_SUPPLYCOST\"]\n",
" merged_df = merged_df.merge(\n",
" min_values,\n",
" left_on=[\"p_partkey\", \"ps_supplycost\"],\n",
" right_on=[\"P_PARTKEY_CPY\", \"MIN_SUPPLYCOST\"],\n",
" how=\"inner\",\n",
"\n",
" jn = (\n",
" part_ds.merge(part_supp_ds, left_on=\"p_partkey\", right_on=\"ps_partkey\")\n",
" .merge(supplier_ds, left_on=\"ps_suppkey\", right_on=\"s_suppkey\")\n",
" .merge(nation_ds, left_on=\"s_nationkey\", right_on=\"n_nationkey\")\n",
" .merge(region_ds, left_on=\"n_regionkey\", right_on=\"r_regionkey\")\n",
" )\n",
" result_df = merged_df.loc[\n",
"\n",
" jn = jn[jn[\"p_size\"] == var1]\n",
" jn = jn[jn[\"p_type\"].str.endswith(var2)]\n",
" jn = jn[jn[\"r_name\"] == var3]\n",
"\n",
" gb = jn.groupby(\"p_partkey\", as_index=False)\n",
" agg = gb[\"ps_supplycost\"].min()\n",
" jn2 = agg.merge(jn, on=[\"p_partkey\", \"ps_supplycost\"])\n",
"\n",
" sel = jn2.loc[\n",
" :,\n",
" [\n",
" \"s_acctbal\",\n",
Expand All @@ -200,20 +127,12 @@
" \"s_comment\",\n",
" ],\n",
" ]\n",
" result_df = result_df.sort_values(\n",
" by=[\n",
" \"s_acctbal\",\n",
" \"n_name\",\n",
" \"s_name\",\n",
" \"p_partkey\",\n",
" ],\n",
" ascending=[\n",
" False,\n",
" True,\n",
" True,\n",
" True,\n",
" ],\n",
" ).head(100)\n",
"\n",
" sort = sel.sort_values(\n",
" by=[\"s_acctbal\", \"n_name\", \"s_name\", \"p_partkey\"],\n",
" ascending=[False, True, True, True],\n",
" )\n",
" result_df = sort.head(100)\n",
"\n",
" return result_df # type: ignore[no-any-return]"
]
Expand Down Expand Up @@ -411,7 +330,7 @@
"tool = 'pandas[pyarrow]'\n",
"fn = IO_FUNCS[tool]\n",
"timings = %timeit -o q2_pandas_native(fn(region), fn(nation), fn(supplier), fn(part), fn(partsupp))\n",
"results[tool+'native'] = timings.best"
"results[tool+'[native]'] = timings.best"
]
},
{
Expand Down
79 changes: 79 additions & 0 deletions tpch/notebooks/q3/execute.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,47 @@
"pd.options.future.infer_string = True"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3497be7d",
"metadata": {},
"outputs": [],
"source": [
"from typing import Any\n",
"from datetime import date\n",
"\n",
def q3_pandas_native(
    customer_ds: Any,
    line_item_ds: Any,
    orders_ds: Any,
):
    """TPC-H Q3 in native pandas: top-10 unshipped orders by revenue.

    Parameters are the customer, lineitem and orders tables as pandas
    DataFrames.  Returns a DataFrame with columns ``l_orderkey``,
    ``revenue``, ``o_orderdate`` and ``o_shippriority``, sorted by
    revenue (descending) then order date (ascending), truncated to 10 rows.
    """
    var1 = "BUILDING"
    var2 = date(1995, 3, 15)

    # Restrict to customers in the BUILDING market segment.
    fcustomer = customer_ds[customer_ds["c_mktsegment"] == var1]

    jn1 = fcustomer.merge(orders_ds, left_on="c_custkey", right_on="o_custkey")
    jn2 = jn1.merge(line_item_ds, left_on="o_orderkey", right_on="l_orderkey")

    # Orders placed before the cutoff date whose items shipped after it.
    jn2 = jn2[jn2["o_orderdate"] < var2]
    jn2 = jn2[jn2["l_shipdate"] > var2]
    jn2["revenue"] = jn2.l_extendedprice * (1 - jn2.l_discount)

    gb = jn2.groupby(
        ["o_orderkey", "o_orderdate", "o_shippriority"], as_index=False
    )
    agg = gb["revenue"].sum()

    sel = agg.loc[:, ["o_orderkey", "revenue", "o_orderdate", "o_shippriority"]]
    sel = sel.rename({"o_orderkey": "l_orderkey"}, axis="columns")

    # Renamed from `sorted`, which shadowed the builtin of the same name.
    ranked = sel.sort_values(by=["revenue", "o_orderdate"], ascending=[False, True])
    result_df = ranked.head(10)

    return result_df  # type: ignore[no-any-return]
]
},
{
"cell_type": "code",
"execution_count": 4,
Expand Down Expand Up @@ -217,6 +258,44 @@
"results = {}"
]
},
{
"cell_type": "markdown",
"id": "6bfedb9e",
"metadata": {},
"source": [
"## pandas, pyarrow dtypes, native"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "51d52b99",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"24 s ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
},
{
"data": {
"text/plain": [
"23.841894793999984"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"tool = 'pandas[pyarrow]'\n",
"fn = IO_FUNCS[tool]\n",
"timings = %timeit -o q3_pandas_native(fn(customer), fn(lineitem), fn(orders))\n",
"results[tool+'[native]'] = timings.best"
]
},
{
"cell_type": "markdown",
"id": "09249944",
Expand Down
73 changes: 73 additions & 0 deletions tpch/notebooks/q4/execute.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,41 @@
"pd.options.future.infer_string = True"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "580ac2a5",
"metadata": {},
"outputs": [],
"source": [
"from datetime import date\n",
"from typing import Any\n",
"\n",
def q4_pandas_native(
    line_item_ds: Any,
    orders_ds: Any,
):
    """TPC-H Q4 in native pandas: order-priority checking.

    Counts, per ``o_orderpriority``, the orders placed in Q3 1993 that
    contain at least one line item received after its commit date.
    Returns a DataFrame with columns ``o_orderpriority`` and
    ``order_count``, sorted by priority.
    """
    window_start = date(1993, 7, 1)
    window_end = date(1993, 10, 1)

    joined = line_item_ds.merge(
        orders_ds, left_on="l_orderkey", right_on="o_orderkey"
    )

    # Orders in the quarter whose line item arrived later than committed.
    in_window = (joined["o_orderdate"] >= window_start) & (
        joined["o_orderdate"] < window_end
    )
    late = joined["l_commitdate"] < joined["l_receiptdate"]
    joined = joined[in_window & late]

    # Count each order at most once per priority.
    deduped = joined.drop_duplicates(subset=["o_orderpriority", "l_orderkey"])

    counts = deduped.groupby("o_orderpriority", as_index=False).agg(
        order_count=("o_orderkey", "count")
    )

    result_df = counts.sort_values(["o_orderpriority"])

    return result_df  # type: ignore[no-any-return]
]
},
{
"cell_type": "code",
"execution_count": 4,
Expand Down Expand Up @@ -203,6 +238,44 @@
"results = {}"
]
},
{
"cell_type": "markdown",
"id": "956e9675",
"metadata": {},
"source": [
"## pandas, pyarrow dtype, native"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ca578422",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"24 s ± 142 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
]
},
{
"data": {
"text/plain": [
"23.841894793999984"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"tool = 'pandas[pyarrow]'\n",
"fn = IO_FUNCS[tool]\n",
"timings = %timeit -o q4_pandas_native(fn(lineitem), fn(orders))\n",
"results[tool+'[native]'] = timings.best"
]
},
{
"cell_type": "markdown",
"id": "09249944",
Expand Down
Loading

0 comments on commit d5e03f9

Please sign in to comment.