Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add SparkMonitor and itables extension #30

Merged
merged 9 commits into from
Jun 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ COPY ./src/ /src
ENV PYTHONPATH "${PYTHONPATH}:/src"

# Copy the startup script to the default profile location to automatically load pre-built functions in Jupyter Notebook
COPY ./src/notebook/startup.py /.ipython/profile_default/startup/
COPY ./src/notebook_utils/startup.py /.ipython/profile_default/startup/

COPY ./scripts/ /opt/scripts/
RUN chmod a+x /opt/scripts/*.sh
Expand Down
2 changes: 2 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ minio = "==7.2.7"
delta-spark = "==3.2.0" # should match JAR version (DELTA_SPARK_VER) specified in the Dockerfile
pandas = "==2.2.2"
pyarrow = "==16.1.0"
sparkmonitor = "==3.0.2"
itables = "==2.1.1"

[dev-packages]
pytest = "==8.2.1"
Expand Down
96 changes: 56 additions & 40 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions scripts/notebook_entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,23 @@ fi

mkdir -p "$NOTEBOOK_DIR" && cd "$NOTEBOOK_DIR"

# Enable SparkMonitor extension
# https://github.com/swan-cern/sparkmonitor?tab=readme-ov-file#setting-up-the-extension
enable_spark_monitor() {
    echo "Enabling SparkMonitor extension..."
    local ipython_config_path="/.ipython/profile_default/ipython_kernel_config.py"
    local spark_monitor_config="c.InteractiveShellApp.extensions.append('sparkmonitor.kernelextension')"

    # -F: match the config line as a fixed string -- it contains regex
    #     metacharacters ( ) . ' which plain grep would interpret as a
    #     pattern, risking a false match against an unrelated line.
    # -s: suppress the "No such file" error when the config file does not
    #     exist yet; grep still returns non-zero and >> creates the file.
    if ! grep -qsF "$spark_monitor_config" "$ipython_config_path"; then
        echo "$spark_monitor_config" >> "$ipython_config_path"
        echo "SparkMonitor kernel extension enabled in IPython config."
    else
        echo "SparkMonitor kernel extension is already enabled in IPython config."
    fi
}

enable_spark_monitor

# Start Jupyter Lab
jupyter lab --ip=0.0.0.0 \
--port="$NOTEBOOK_PORT" \
Expand Down
49 changes: 49 additions & 0 deletions src/notebook_utils/notebook_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from threading import RLock
from typing import Optional

import itables.options as opt
from itables import init_notebook_mode, show
from pandas import DataFrame

lock = RLock()


def display_df(
    df: DataFrame,
    layout: Optional[dict] = None,
    buttons: Optional[list] = None,
    length_menu: Optional[list] = None
) -> None:
    """
    Display a pandas DataFrame as an interactive table using itables.

    iTables project page: https://github.com/mwouts/itables

    Notice this function is not compatible with Spark DataFrames. A Spark DataFrame should be converted to a
    pandas DataFrame before calling this function which is not efficient for large datasets.

    :param df: a pandas DataFrame  # TODO add spark DataFrame support
    :param layout: layout options, refer to https://datatables.net/reference/option/layout
    :param buttons: buttons options, refer to https://datatables.net/reference/button/
    :param length_menu: length menu options, refer to https://datatables.net/reference/option/lengthMenu
    :return: None
    """
    # initialize itables for the notebook; all_interactive=False so only
    # DataFrames explicitly rendered via show() become interactive tables
    init_notebook_mode(all_interactive=False)

    # set default values if options are not provided
    default_layout = {
        "topStart": "search",
        "topEnd": "buttons",
        "bottomStart": "pageLength",
        "bottomEnd": "paging",
        "bottom2Start": "info"
    }
    default_buttons = ["csvHtml5", "excelHtml5", "print"]
    default_length_menu = [5, 10, 20]

    layout = layout or default_layout
    buttons = buttons or default_buttons
    length_menu = length_menu or default_length_menu

    # opt.layout is a module-level (shared) itables setting; hold the lock so
    # a concurrent caller cannot change it between the write and the render
    with lock:
        opt.layout = layout
        show(df, buttons=buttons, lengthMenu=length_menu)
4 changes: 2 additions & 2 deletions src/notebook/startup.py → src/notebook_utils/startup.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@

"""
This file handles the importation of essential modules and functions pre-configured for the notebook.

Expand All @@ -14,4 +13,5 @@
table_exists,
remove_table,
)
from minio_utils.minio_utils import get_minio_client
from minio_utils.minio_utils import get_minio_client
from notebook_utils.notebook_utils import display_df
8 changes: 8 additions & 0 deletions src/spark/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import csv
import os
import site
from datetime import datetime
from threading import Timer

Expand Down Expand Up @@ -48,6 +49,9 @@ def _get_delta_lake_conf(

reference: https://blog.min.io/delta-lake-minio-multi-cloud/
"""

site_packages_path = site.getsitepackages()[0]

return {
"spark.jars": jars_str,
"spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
Expand All @@ -59,6 +63,10 @@ def _get_delta_lake_conf(
"spark.hadoop.fs.s3a.path.style.access": "true",
"spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
"spark.sql.catalogImplementation": "hive",
# SparkMonitor extension configuration
# https://github.com/swan-cern/sparkmonitor?tab=readme-ov-file#setting-up-the-extension
"spark.extraListeners": "sparkmonitor.listener.JupyterSparkMonitorListener",
"spark.driver.extraClassPath": f"{site_packages_path}/sparkmonitor/listener_{SCALA_VER}.jar",
MrCreosote marked this conversation as resolved.
Show resolved Hide resolved
}


Expand Down
1 change: 1 addition & 0 deletions test/src/notebook_utils/notebook_utils_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from notebook_utils.notebook_utils import *
Loading