From 83c90c94efbbe2f43d378d0dbcb85f59d908156b Mon Sep 17 00:00:00 2001
From: Tianhao-Gu
Date: Tue, 11 Jun 2024 15:25:56 -0500
Subject: [PATCH] add spark_to_pandas method

---
Reviewer notes (text in this section is not part of the commit message):
 * The line structure and indentation of this patch were reconstructed from a
   whitespace-collapsed copy. If context lines fail to match the target file,
   retry with `git apply --ignore-whitespace` and confirm the indentation.
 * display_df() now checks `df.limit(1001).count()` instead of `df.count()`,
   so deciding whether to print the truncation notice counts at most 1001
   rows rather than every row of the Spark DataFrame. Because of this edit,
   the post-image blob id on the `index` line below is no longer accurate.

 src/notebook_utils/notebook_utils.py | 34 ++++++++++++++++++++++++----
 src/notebook_utils/startup.py        |  5 +++-
 2 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/src/notebook_utils/notebook_utils.py b/src/notebook_utils/notebook_utils.py
index c2b3f12..ba789ab 100644
--- a/src/notebook_utils/notebook_utils.py
+++ b/src/notebook_utils/notebook_utils.py
@@ -2,13 +2,31 @@
 
 import itables.options as opt
 from itables import init_notebook_mode, show
-from pandas import DataFrame
+from pandas import DataFrame as PandasDataFrame
+from pyspark.sql import DataFrame as SparkDataFrame
 
 lock = RLock()
 
 
+def spark_to_pandas(
+        spark_df: SparkDataFrame,
+        limit: int = 1000,
+        offset: int = 0
+) -> PandasDataFrame:
+    """
+    Convert a Spark DataFrame to a pandas DataFrame.
+
+    :param spark_df: a Spark DataFrame
+    :param limit: the number of rows to fetch
+    :param offset: the number of rows to skip
+    :return: a pandas DataFrame
+    """
+
+    return spark_df.offset(offset).limit(limit).toPandas()
+
+
 def display_df(
-        df: DataFrame,
+        df: PandasDataFrame | SparkDataFrame,
         layout: dict = None,
         buttons: list = None,
         length_menu: list = None
@@ -17,15 +35,21 @@ def display_df(
     Display a pandas DataFrame using itables.
     iTables project page: https://github.com/mwouts/itables
 
-    Notice this function is not compatible with Spark DataFrames. A Spark DataFrame should be converted to a
-    pandas DataFrame before calling this function which is not efficient for large datasets.
+    Notice itables.show() function is not compatible with Spark DataFrames. If a Spark DataFrame is passed to this
+    function, it will be converted to a pandas DataFrame (first 1000 rows) before displaying it.
 
-    :param df: a pandas DataFrame # TODO add spark DataFrame support
+    :param df: a pandas DataFrame or a Spark DataFrame
     :param layout: layout options, refer to https://datatables.net/reference/option/layout
     :param buttons: buttons options, options refer to https://datatables.net/reference/button/
     :param length_menu: length menu options, refer to https://datatables.net/reference/option/lengthMenu
     :return:
     """
+    # convert Spark DataFrame to pandas DataFrame
+    if isinstance(df, SparkDataFrame):
+        if df.limit(1001).count() > 1000:  # bounded check: counts at most 1001 rows, not the whole DataFrame
+            print("Converting first 1000 rows from Spark to Pandas DataFrame...")
+        df = spark_to_pandas(df)
+
     # initialize itables for the notebook
     init_notebook_mode(all_interactive=False)
 
diff --git a/src/notebook_utils/startup.py b/src/notebook_utils/startup.py
index 8ad844d..2fa9304 100644
--- a/src/notebook_utils/startup.py
+++ b/src/notebook_utils/startup.py
@@ -14,4 +14,7 @@
     remove_table,
 )
 from minio_utils.minio_utils import get_minio_client
-from notebook_utils.notebook_utils import display_df
+from notebook_utils.notebook_utils import (
+    display_df,
+    spark_to_pandas,
+)