diff --git a/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-ephemeral-storage/taxi-trip-execute.sh b/analytics/scripts/taxi-trip-execute.sh
similarity index 78%
rename from analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-ephemeral-storage/taxi-trip-execute.sh
rename to analytics/scripts/taxi-trip-execute.sh
index ca9156eeb..fd609b12d 100755
--- a/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-ephemeral-storage/taxi-trip-execute.sh
+++ b/analytics/scripts/taxi-trip-execute.sh
@@ -11,17 +11,15 @@
 
 # Script usage ./taxi-trip-execute my-s3-bucket us-west-2
 
-# validating that use passes two arguments, if not return a message to pass the arguments
 if [ $# -ne 2 ]; then
   echo "Usage: $0 <S3_BUCKET> <REGION>"
   exit 1
 fi
-
 S3_BUCKET="$1"
 REGION="$2"
 
-INPUT_DATA_S3_PATH="s3://${S3_BUCKET}/taxi-trip/input/"
+INPUT_DATA_S3_PATH="s3://${S3_BUCKET}/taxi-trip/input"
 
 # Create a local input folder
 mkdir input
 
@@ -31,15 +29,21 @@ aws s3 cp pyspark-taxi-trip.py s3://${S3_BUCKET}/taxi-trip/scripts/ --region ${R
 
 # Copy Test Input data to S3 bucket
 wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet -O "input/yellow_tripdata_2022-0.parquet"
+aws s3 cp "input/yellow_tripdata_2022-0.parquet" ${INPUT_DATA_S3_PATH}/yellow_tripdata_2022-0.parquet
+
+pids=()
 
 # Making duplicate copies to increase the size of the data.
-max=100
+max=25
 for (( i=1; i <= $max; ++i ))
 do
-  cp -rf "input/yellow_tripdata_2022-0.parquet" "input/yellow_tripdata_2022-${i}.parquet"
+  aws s3 cp ${INPUT_DATA_S3_PATH}/yellow_tripdata_2022-0.parquet ${INPUT_DATA_S3_PATH}/yellow_tripdata_2022-${i}.parquet &
+  pids+=($!)
 done
 
-aws s3 sync "input/" ${INPUT_DATA_S3_PATH}
+for pid in "${pids[@]}"; do
+  wait $pid
+done
 
 # Delete a local input folder
 rm -rf input
diff --git a/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/ebs-storage-dynamic-pvc/taxi-trip-execute.sh b/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/ebs-storage-dynamic-pvc/taxi-trip-execute.sh
deleted file mode 100644
index 31432838b..000000000
--- a/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/ebs-storage-dynamic-pvc/taxi-trip-execute.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-# This job copies Sample PySpark script and some test data to your S3 bucket which will enable you to run the following Spark Operator script
-
-# Prerequisites for running this shell script
-# 1/ Execute the shell script with the required arguments
-#    ./your_script.sh <S3_BUCKET> <REGION>
-# 2/ Ensure <S3_BUCKET> is replaced in "ebs-storage-dynamic-pvc.yaml" file
-# 3/ Execute the shell script which creates the input data in your S3 bucket
-# 4/ Run `kubectl apply -f ebs-storage-dynamic-pvc.yaml` to trigger the Spark job
-# 5/ Monitor the Spark job using "kubectl get pods -n spark-team-a -w"
-
-# Script usage ./taxi-trip-execute my-s3-bucket us-west-2
-
-if [ $# -ne 3 ]; then
-  echo "Usage: $0 <S3_BUCKET> <REGION>"
-  exit 1
-fi
-
-S3_BUCKET="$1"
-REGION="$2"
-
-INPUT_DATA_S3_PATH="s3://${S3_BUCKET}/taxi-trip/input/"
-
-# Create a local input folder
-mkdir input
-
-# Copy PySpark Script to S3 bucket
-aws s3 cp pyspark-taxi-trip.py s3://${S3_BUCKET}/taxi-trip/scripts/ --region ${REGION}
-
-# Copy Test Input data to S3 bucket
-wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet -O "input/yellow_tripdata_2022-0.parquet"
-
-# Making duplicate copies to increase the size of the data.
-max=100
-for (( i=1; i <= $max; ++i ))
-do
-  cp -rf "input/yellow_tripdata_2022-0.parquet" "input/yellow_tripdata_2022-${i}.parquet"
-done
-
-aws s3 sync "input/" ${INPUT_DATA_S3_PATH}
-
-# Delete a local input folder
-rm -rf input
diff --git a/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-yunikorn-gang-scheduling/taxi-trip-execute.sh b/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-yunikorn-gang-scheduling/taxi-trip-execute.sh
deleted file mode 100644
index ac6d8ceb6..000000000
--- a/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-yunikorn-gang-scheduling/taxi-trip-execute.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-# This job copies Sample PySpark script and some test data to your S3 bucket which will enable you to run the following Spark Operator script
-
-# Prerequisites for running this shell script
-# 1/ Execute the shell script with the required arguments
-#    ./your_script.sh <S3_BUCKET> <REGION>
-# 2/ Ensure <S3_BUCKET> is replaced in "nvme-storage-yunikorn-gang-scheduling.yaml" file
-# 3/ Execute the shell script which creates the input data in your S3 bucket
-# 4/ Run `kubectl apply -f nvme-storage-yunikorn-gang-scheduling.yaml` to trigger the Spark job
-# 5/ Monitor the Spark job using "kubectl get pods -n spark-team-a -w"
-
-# Script usage ./taxi-trip-execute my-s3-bucket us-west-2
-
-if [ $# -ne 3 ]; then
-  echo "Usage: $0 <S3_BUCKET> <REGION>"
-  exit 1
-fi
-
-S3_BUCKET="$1"
-REGION="$2"
-
-INPUT_DATA_S3_PATH="s3://${S3_BUCKET}/taxi-trip/input/"
-
-# Create a local input folder
-mkdir input
-
-# Copy PySpark Script to S3 bucket
-aws s3 cp pyspark-taxi-trip.py s3://${S3_BUCKET}/taxi-trip/scripts/ --region ${REGION}
-
-# Copy Test Input data to S3 bucket
-wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet -O "input/yellow_tripdata_2022-0.parquet"
-
-# Making duplicate copies to increase the size of the data.
-max=100
-for (( i=1; i <= $max; ++i ))
-do
-  cp -rf "input/yellow_tripdata_2022-0.parquet" "input/yellow_tripdata_2022-${i}.parquet"
-done
-
-aws s3 sync "input/" ${INPUT_DATA_S3_PATH}
-
-# Delete a local input folder
-rm -rf input
diff --git a/analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc/taxi-trip-execute.sh b/analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc/taxi-trip-execute.sh
deleted file mode 100644
index 4ebf25143..000000000
--- a/analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc/taxi-trip-execute.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-# This job copies Sample PySpark script and some test data to your S3 bucket which will enable you to run the following Spark Operator script
-
-# Prerequisites for running this shell script
-# 1/ Execute the shell script with the required arguments
-#    ./your_script.sh <S3_BUCKET> <REGION>
-# 2/ Ensure <S3_BUCKET> is replaced in "ebs-storage-dynamic-pvc.yaml" file
-# 3/ Execute the shell script which creates the input data in your S3 bucket
-# 4/ Run `kubectl apply -f ebs-storage-dynamic-pvc.yaml` to trigger the Spark job
-# 5/ Monitor the Spark job using "kubectl get pods -n spark-team-a -w"
-
-# Script usage ./taxi-trip-execute my-s3-bucket us-west-2
-
-if [ $# -ne 2 ]; then
-  echo "Usage: $0 <S3_BUCKET> <REGION>"
-  exit 1
-fi
-
-S3_BUCKET="$1"
-REGION="$2"
-
-INPUT_DATA_S3_PATH="s3://${S3_BUCKET}/taxi-trip/input/"
-
-# Create a local input folder
-mkdir input
-
-# Copy PySpark Script to S3 bucket
-aws s3 cp pyspark-taxi-trip.py s3://${S3_BUCKET}/taxi-trip/scripts/ --region ${REGION}
-
-# Copy Test Input data to S3 bucket
-wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet -O "input/yellow_tripdata_2022-0.parquet"
-
-# Making duplicate copies to increase the size of the data.
-max=100
-for (( i=1; i <= $max; ++i ))
-do
-  cp -rf "input/yellow_tripdata_2022-0.parquet" "input/yellow_tripdata_2022-${i}.parquet"
-done
-
-aws s3 sync "input/" ${INPUT_DATA_S3_PATH}
-
-# Delete a local input folder
-rm -rf input
diff --git a/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/taxi-trip-execute.sh b/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/taxi-trip-execute.sh
deleted file mode 100755
index b3a9dcfb5..000000000
--- a/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/taxi-trip-execute.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-# This job copies Sample PySpark script and some test data to your S3 bucket which will enable you to run the following Spark Operator script
-
-# Prerequisites for running this shell script
-# 1/ Execute the shell script with the required arguments
-#    ./your_script.sh <S3_BUCKET> <REGION>
-# 2/ Ensure <S3_BUCKET> is replaced in "nvme-ephemeral-storage.yaml" file
-# 3/ Execute the shell script which creates the input data in your S3 bucket
-# 4/ Run `kubectl apply -f nvme-ephemeral-storage.yaml` to trigger the Spark job
-# 5/ Monitor the Spark job using "kubectl get pods -n spark-team-a -w"
-
-# Script usage ./taxi-trip-execute my-s3-bucket us-west-2
-
-if [ $# -ne 2 ]; then
-  echo "Usage: $0 <S3_BUCKET> <REGION>"
-  exit 1
-fi
-
-S3_BUCKET="$1"
-REGION="$2"
-
-INPUT_DATA_S3_PATH="s3://${S3_BUCKET}/taxi-trip/input/"
-
-# Create a local input folder
-mkdir input
-
-# Copy PySpark Script to S3 bucket
-aws s3 cp pyspark-taxi-trip.py s3://${S3_BUCKET}/taxi-trip/scripts/ --region ${REGION}
-
-# Copy Test Input data to S3 bucket
-wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet -O "input/yellow_tripdata_2022-0.parquet"
-
-# Making duplicate copies to increase the size of the data.
-max=100
-for (( i=1; i <= $max; ++i ))
-do
-  cp -rf "input/yellow_tripdata_2022-0.parquet" "input/yellow_tripdata_2022-${i}.parquet"
-done
-
-aws s3 sync "input/" ${INPUT_DATA_S3_PATH}
-
-# Delete a local input folder
-rm -rf input
diff --git a/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-yunikorn-gang-scheduling/taxi-trip-execute.sh b/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-yunikorn-gang-scheduling/taxi-trip-execute.sh
deleted file mode 100755
index 25707e596..000000000
--- a/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-yunikorn-gang-scheduling/taxi-trip-execute.sh
+++ /dev/null
@@ -1,43 +0,0 @@
-#!/bin/bash
-# This job copies Sample PySpark script and some test data to your S3 bucket which will enable you to run the following Spark Operator script
-
-# Prerequisites for running this shell script
-# 1/ Execute the shell script with the required arguments
-#    ./your_script.sh <S3_BUCKET> <REGION>
-# 2/ Ensure <S3_BUCKET> is replaced in "nvme-storage-yunikorn-gang-scheduling.yaml" file
-# 3/ Execute the shell script which creates the input data in your S3 bucket
-# 4/ Run `kubectl apply -f nvme-storage-yunikorn-gang-scheduling.yaml` to trigger the Spark job
-# 5/ Monitor the Spark job using "kubectl get pods -n spark-team-a -w"
-
-# Script usage ./taxi-trip-execute my-s3-bucket us-west-2
-
-if [ $# -ne 2 ]; then
-  echo "Usage: $0 <S3_BUCKET> <REGION>"
-  exit 1
-fi
-
-S3_BUCKET="$1"
-REGION="$2"
-
-INPUT_DATA_S3_PATH="s3://${S3_BUCKET}/taxi-trip/input/"
-
-# Create a local input folder
-mkdir input
-
-# Copy PySpark Script to S3 bucket
-aws s3 cp pyspark-taxi-trip.py s3://${S3_BUCKET}/taxi-trip/scripts/ --region ${REGION}
-
-# Copy Test Input data to S3 bucket
-wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet -O "input/yellow_tripdata_2022-0.parquet"
-
-# Making duplicate copies to increase the size of the data.
-max=100
-for (( i=1; i <= $max; ++i ))
-do
-  cp -rf "input/yellow_tripdata_2022-0.parquet" "input/yellow_tripdata_2022-${i}.parquet"
-done
-
-aws s3 sync "input/" ${INPUT_DATA_S3_PATH}
-
-# Delete a local input folder
-rm -rf input
diff --git a/website/docs/blueprints/data-analytics/_taxi_trip_exec.md b/website/docs/blueprints/data-analytics/_taxi_trip_exec.md
index f6edf2c88..5352f9a52 100644
--- a/website/docs/blueprints/data-analytics/_taxi_trip_exec.md
+++ b/website/docs/blueprints/data-analytics/_taxi_trip_exec.md
@@ -5,5 +5,5 @@ it in order to increase the size a bit.
 This will take a bit of time and will require a relatively fast internet connection.
 
 ```bash
-./taxi-trip-execute.sh ${S3_BUCKET} YOUR_REGION_HERE
+${DOEKS_HOME}/analytics/scripts/taxi-trip-execute.sh ${S3_BUCKET} YOUR_REGION_HERE
 ```
diff --git a/website/docs/blueprints/data-analytics/observability-spark-on-eks.md b/website/docs/blueprints/data-analytics/observability-spark-on-eks.md
index 2b5371a71..83b7226be 100644
--- a/website/docs/blueprints/data-analytics/observability-spark-on-eks.md
+++ b/website/docs/blueprints/data-analytics/observability-spark-on-eks.md
@@ -2,6 +2,9 @@
 sidebar_position: 3
 sidebar_label: Observability Spark on EKS
 ---
+
+import TaxiTripExec from './_taxi_trip_exec.md';
+
 # Observability Spark on EKS
 
 ## Introduction
@@ -15,11 +18,10 @@ We will reuse the previous Spark on Operator example. Please follow [this link](
 
 let's navigate to one example folder under spark-k8s-operator and run the shell script to upload data and py script to the S3 bucket created by terraform above.
 ```bash
 cd data-on-eks/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-ephemeral-storage
-
-# replace \<S3_BUCKET\> with your S3 bucket and \<REGION\> with your region, then run
-./taxi-trip-execute.sh
 ```
+
+<TaxiTripExec />
 ## Spark Web UI
 When you submit a Spark application, Spark context is created which ideally gives you [Spark Web UI](https://sparkbyexamples.com/spark/spark-web-ui-understanding/) to monitor the execution of the application. Monitoring includes the following.
 - Spark configurations used
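Reviewer note: the relocated script above replaces the local `cp` loop plus `aws s3 sync` with server-side `aws s3 cp` duplications launched in the background and collected with `wait`. The sketch below illustrates that fan-out pattern in isolation; the bucket name, prefix, seed object, and copy count are placeholder values for illustration only, not taken from this change.

```bash
#!/usr/bin/env bash
# Minimal sketch of the parallel S3 duplication pattern (assumed placeholder values).
set -euo pipefail

S3_BUCKET="my-example-bucket"                 # assumption: replace with your bucket
PREFIX="s3://${S3_BUCKET}/taxi-trip/input"
SEED="yellow_tripdata_2022-0.parquet"         # one object already uploaded to ${PREFIX}
COPIES=5                                      # keep small for a quick test

pids=()
for (( i = 1; i <= COPIES; ++i )); do
  # S3-to-S3 copy is performed server side, so no data is re-downloaded or re-uploaded.
  aws s3 cp "${PREFIX}/${SEED}" "${PREFIX}/yellow_tripdata_2022-${i}.parquet" &
  pids+=("$!")
done

# Wait on each background copy; with `set -e`, a failed copy aborts the script.
for pid in "${pids[@]}"; do
  wait "${pid}"
done
echo "All ${COPIES} copies completed."
```

Because each `aws s3 cp` between two S3 URIs is a server-side copy, running several in parallel inflates the dataset quickly without the local disk and network round trip of the old `cp`-then-`sync` approach.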