From 9e284c287eac07abe6732490729ad24af9a802c5 Mon Sep 17 00:00:00 2001 From: Ray Krueger Date: Mon, 1 Apr 2024 13:05:08 +0000 Subject: [PATCH 01/11] WIP update spark operator docs --- .../data-analytics/spark-operator-yunikorn.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md index c00bd8b41..75397feb2 100644 --- a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md +++ b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md @@ -260,12 +260,14 @@ Clone the repository ```bash git clone https://github.com/awslabs/data-on-eks.git +cd data-on-eks +export DOEKS_HOME=$(pwd) ``` Navigate into one of the example directories and run `install.sh` script ```bash -cd data-on-eks/analytics/terraform/spark-k8s-operator +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator chmod +x install.sh ./install.sh ``` @@ -277,7 +279,7 @@ chmod +x install.sh Navigate to example directory and submit the Spark job. ```bash -cd data-on-eks/analytics/terraform/spark-k8s-operator/examples/karpenter +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter kubectl apply -f pyspark-pi-job.yaml ``` @@ -295,7 +297,7 @@ You can try the following examples to leverage multiple Karpenter Nodepools, EBS Example PySpark job that uses NVMe based ephemeral SSD disk for Driver and Executor shuffle storage ```bash - cd analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/ + cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/ ``` Update the variables in Shell script and execute @@ -314,7 +316,7 @@ Update YAML file and run the below command Example PySpark job that uses EBS ON_DEMAND volumes using Dynamic PVCs for Driver and Executor shuffle storage ```bash - cd analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc/ + cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc/ ``` Update the variables in Shell script and execute @@ -355,7 +357,7 @@ Update YAML file and run the below command Navigate to example directory and submit the Spark job. ```bash -cd data-on-eks/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler kubectl apply -f pyspark-pi-job.yaml ``` From 3d7956739ab0ca23d26f1fafaf8701db047c9c30 Mon Sep 17 00:00:00 2001 From: Ray Krueger Date: Mon, 1 Apr 2024 13:05:08 +0000 Subject: [PATCH 02/11] Create a DOEKS_HOME environment variable Adding a DOEKS_HOME varaible set to the data-on-eks directory means that each command in the spark on eks docs can be copied and pasted directly. Originally the docs simply said `cd analytics/...` repeastedly and that path was never correct. --- .../data-analytics/spark-operator-yunikorn.md | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md index 75397feb2..dd743408d 100644 --- a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md +++ b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md @@ -256,7 +256,7 @@ Ensure that you have installed the following tools on your machine. ### Deploy -Clone the repository +Clone the repository. 
```bash git clone https://github.com/awslabs/data-on-eks.git @@ -264,7 +264,10 @@ cd data-on-eks export DOEKS_HOME=$(pwd) ``` -Navigate into one of the example directories and run `install.sh` script +If DOEKS_HOME is ever unset, you can always set it manually using `export +DATA_ON_EKS=$(pwd)` from your data-on-eks directory. + +Navigate into one of the example directories and run `install.sh` script. ```bash cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator @@ -335,7 +338,7 @@ Update YAML file and run the below command Gang Scheduling Spark jobs using Apache YuniKorn and Spark Operator ```bash - cd analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-yunikorn-gang-scheduling/ + cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-yunikorn-gang-scheduling/ ``` Update the variables in Shell script and execute @@ -373,7 +376,7 @@ kubectl get pods -n spark-team-a -w Example PySpark job that uses NVMe based ephemeral SSD disk for Driver and Executor shuffle storage ```bash - cd analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-ephemeral-storage + cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-ephemeral-storage ``` Update the variables in Shell script and execute @@ -392,7 +395,7 @@ Update YAML file and run the below command Example PySpark job that uses EBS ON_DEMAND volumes using Dynamic PVCs for Driver and Executor shuffle storage ```bash - cd analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/ebs-storage-dynamic-pvc + cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/ebs-storage-dynamic-pvc ``` Update the variables in Shell script and execute @@ -411,7 +414,7 @@ Update YAML file and run the below command Gang Scheduling Spark jobs using Apache YuniKorn and Spark Operator ```bash - cd analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-yunikorn-gang-scheduling + cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-yunikorn-gang-scheduling ``` Update the variables in Shell script and execute @@ -433,7 +436,7 @@ Update YAML file and run the below command Check the pre-requisites in yaml file before running this job. ```bash -cd analytics/terraform/spark-k8s-operator/examples/benchmark +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/benchmark ``` Step1: Benchmark test data generation @@ -615,7 +618,7 @@ Intel Nodepool (AMD): Set the weight of the Intel Nodepool to `50`. This ensures This script will cleanup the environment using `-target` option to ensure all the resources are deleted in correct order. 
```bash -cd analytics/terraform/spark-k8s-operator && chmod +x cleanup.sh +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator && chmod +x cleanup.sh ./cleanup.sh ``` From caf5c2fec73b9fab92579de8aa33bba7f55ce521 Mon Sep 17 00:00:00 2001 From: Ray Krueger Date: Mon, 1 Apr 2024 15:28:17 +0200 Subject: [PATCH 03/11] Fix leading white space on some of the commands --- .../data-analytics/spark-operator-yunikorn.md | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md index dd743408d..81b85c60b 100644 --- a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md +++ b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md @@ -300,13 +300,13 @@ You can try the following examples to leverage multiple Karpenter Nodepools, EBS Example PySpark job that uses NVMe based ephemeral SSD disk for Driver and Executor shuffle storage ```bash - cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/ +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/ ``` Update the variables in Shell script and execute ```bash - ./taxi-trip-execute.sh +./taxi-trip-execute.sh ``` Update YAML file and run the below command @@ -319,38 +319,38 @@ Update YAML file and run the below command Example PySpark job that uses EBS ON_DEMAND volumes using Dynamic PVCs for Driver and Executor shuffle storage ```bash - cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc/ +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc/ ``` Update the variables in Shell script and execute ```bash - ./taxi-trip-execute.sh +./taxi-trip-execute.sh ``` Update YAML file and run the below command ```bash - kubectl apply -f ebs-storage-dynamic-pvc.yaml +kubectl apply -f ebs-storage-dynamic-pvc.yaml ``` ## Apache YuniKorn Gang Scheduling with NVMe based SSD disk for shuffle storage Gang Scheduling Spark jobs using Apache YuniKorn and Spark Operator ```bash - cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-yunikorn-gang-scheduling/ +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-yunikorn-gang-scheduling/ ``` Update the variables in Shell script and execute ```bash - ./taxi-trip-execute.sh +./taxi-trip-execute.sh ``` Update YAML file and run the below command ```bash - kubectl apply -f nvme-storage-yunikorn-gang-scheduling.yaml +kubectl apply -f nvme-storage-yunikorn-gang-scheduling.yaml ``` @@ -376,57 +376,57 @@ kubectl get pods -n spark-team-a -w Example PySpark job that uses NVMe based ephemeral SSD disk for Driver and Executor shuffle storage ```bash - cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-ephemeral-storage +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-ephemeral-storage ``` Update the variables in Shell script and execute ```bash - ./taxi-trip-execute.sh +./taxi-trip-execute.sh ``` Update YAML file and run the below command ```bash - kubectl apply -f nvme-ephemeral-storage.yaml +kubectl apply -f nvme-ephemeral-storage.yaml ``` ## EBS Dynamic PVC for shuffle storage Example PySpark job that uses EBS ON_DEMAND volumes using Dynamic PVCs for Driver and Executor shuffle storage ```bash - cd 
${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/ebs-storage-dynamic-pvc +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/ebs-storage-dynamic-pvc ``` Update the variables in Shell script and execute ```bash - ./taxi-trip-execute.sh +./taxi-trip-execute.sh ``` Update YAML file and run the below command ```bash - kubectl apply -f ebs-storage-dynamic-pvc.yaml +kubectl apply -f ebs-storage-dynamic-pvc.yaml ``` ## Apache YuniKorn Gang Scheduling with NVMe based SSD disk for shuffle storage Gang Scheduling Spark jobs using Apache YuniKorn and Spark Operator ```bash - cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-yunikorn-gang-scheduling +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-yunikorn-gang-scheduling ``` Update the variables in Shell script and execute ```bash - ./taxi-trip-execute.sh +./taxi-trip-execute.sh ``` Update YAML file and run the below command ```bash - kubectl apply -f nvme-storage-yunikorn-gang-scheduling.yaml +kubectl apply -f nvme-storage-yunikorn-gang-scheduling.yaml ``` @@ -447,7 +447,7 @@ kubectl apply -f tpcds-benchmark-data-generation-1t Step2: Execute Benchmark test ```bash - kubectl apply -f tpcds-benchmark-1t.yaml +kubectl apply -f tpcds-benchmark-1t.yaml ``` From 22da57e024a90623c95a08b59ec1a283a0c65d01 Mon Sep 17 00:00:00 2001 From: Ray Krueger Date: Mon, 1 Apr 2024 16:30:39 +0200 Subject: [PATCH 04/11] Rename ENTER_S3_BUCKET to S3_BUCKET --- .../nvme-ephemeral-storage.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/nvme-ephemeral-storage.yaml b/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/nvme-ephemeral-storage.yaml index 0a4f3cf8e..1b6546c93 100644 --- a/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/nvme-ephemeral-storage.yaml +++ b/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/nvme-ephemeral-storage.yaml @@ -1,6 +1,6 @@ # Pre-requisite before running this job -# 1/ Open taxi-trip-execute.sh and update and -# 2/ Replace with your S3 bucket created by this blueprint(Check Terraform outputs) +# 1/ Open taxi-trip-execute.sh and update and +# 2/ Replace with your S3 bucket created by this blueprint(Check Terraform outputs) # 3/ execute taxi-trip-execute.sh --- @@ -30,10 +30,10 @@ spec: mode: cluster image: public.ecr.aws/data-on-eks/spark3.3.1-hadoop3.2-aws-java-sdk-bundle-1.12.647 imagePullPolicy: IfNotPresent - mainApplicationFile: "s3a:///taxi-trip/scripts/pyspark-taxi-trip.py" # MainFile is the path to a bundled JAR, Python, or R file of the application + mainApplicationFile: "s3a:///taxi-trip/scripts/pyspark-taxi-trip.py" # MainFile is the path to a bundled JAR, Python, or R file of the application arguments: - - "s3a:///taxi-trip/input/" - - "s3a:///taxi-trip/output/" + - "s3a:///taxi-trip/input/" + - "s3a:///taxi-trip/output/" hadoopConf: "fs.s3a.aws.credentials.provider": "com.amazonaws.auth.WebIdentityTokenCredentialsProvider" "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem" @@ -55,7 +55,7 @@ spec: # Spark Event logs "spark.eventLog.enabled": "true" - "spark.eventLog.dir": "s3a:///spark-event-logs" + "spark.eventLog.dir": "s3a:///spark-event-logs" "spark.eventLog.rolling.enabled": "true" "spark.eventLog.rolling.maxFileSize": "64m" # 
"spark.history.fs.eventLog.rolling.maxFilesToRetain": 100 From 4be92d1b3d40183573d7c44785cb7ec15088dcda Mon Sep 17 00:00:00 2001 From: Ray Krueger Date: Mon, 1 Apr 2024 16:57:49 +0200 Subject: [PATCH 05/11] Provide an example of using sed for bucket names I am unifying all the examples to use as the placeholder in files. I have added documentation to show how to use sed to do the varaiable replaement rather than putting the user through manual efforts. --- .gitignore | 1 + .../nvme-ephemeral-storage.yaml | 4 +-- .../data-analytics/spark-operator-yunikorn.md | 32 ++++++++++++++++--- 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 095c4d90c..b112a3d20 100755 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ .DS_Store .idea .build +*.old # ignore the lock file website/package-lock.json diff --git a/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/nvme-ephemeral-storage.yaml b/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/nvme-ephemeral-storage.yaml index 1b6546c93..6dc4407f7 100644 --- a/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/nvme-ephemeral-storage.yaml +++ b/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/nvme-ephemeral-storage.yaml @@ -1,7 +1,5 @@ # Pre-requisite before running this job -# 1/ Open taxi-trip-execute.sh and update and -# 2/ Replace with your S3 bucket created by this blueprint(Check Terraform outputs) -# 3/ execute taxi-trip-execute.sh +# Replace with your S3 bucket created by this blueprint(Check Terraform outputs) --- apiVersion: "sparkoperator.k8s.io/v1beta2" diff --git a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md index 81b85c60b..895d98039 100644 --- a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md +++ b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md @@ -275,6 +275,15 @@ chmod +x install.sh ./install.sh ``` +Now create an S3_BUCKET variable that holds the name of the bucket created +during the install. This bucket will be used in later examples to store output +data. If S3_BUCKET is ever unset, you can run the following commands again. + +```bash +export S3_BUCKET=$(terraform output -raw s3_bucket_id_spark_history_server) +echo $S3_BUCKET +``` + Execute Sample Spark job with Karpenter}> @@ -303,16 +312,31 @@ Example PySpark job that uses NVMe based ephemeral SSD disk for Driver and Execu cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/ ``` -Update the variables in Shell script and execute +Run the *taxi-trip-execute.sh* script with the following input. You will use the S3_BUCKET variable created earlier. Additionally, you must change YOUR_REGION_HERE with the region of your choice, *us-west-2* for example. + +This script will download some example taxi trip data and create duplicates of +it in order to increase the size a bit. This will take a bit of time and will +require a relatively fast internet connection. ```bash -./taxi-trip-execute.sh +./taxi-trip-execute.sh ${S3_BUCKET} YOUR_REGION_HERE ``` -Update YAML file and run the below command +Once our sample data is uploaded you can run the Spark job. You will need to +replace the ** placeholders in this file with the name of the bucket +created earlier. You can get that value by running `echo $S3_BUCKET`. 
+ +To do this automatically you can run the following, which will create a .old +backup file and do the rename for you. ```bash - kubectl apply -f nvme-ephemeral-storage.yaml +sed -i.old s/\/${S3_BUCKET}/g ./nvme-ephemeral-storage.yaml +``` + +Now that the bucket name is in place you can create the Spark job. + +```bash +kubectl apply -f nvme-ephemeral-storage.yaml ``` ## EBS Dynamic PVC for shuffle storage From ee99a02ebac2488ec566c4fa0109646061aeaa5c Mon Sep 17 00:00:00 2001 From: Ray Krueger Date: Mon, 1 Apr 2024 19:41:57 +0200 Subject: [PATCH 06/11] Do the same for EBS Dynamic PVC --- .../data-analytics/spark-operator-yunikorn.md | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md index 895d98039..df0395731 100644 --- a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md +++ b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md @@ -323,11 +323,11 @@ require a relatively fast internet connection. ``` Once our sample data is uploaded you can run the Spark job. You will need to -replace the ** placeholders in this file with the name of the bucket +replace the *\* placeholders in this file with the name of the bucket created earlier. You can get that value by running `echo $S3_BUCKET`. To do this automatically you can run the following, which will create a .old -backup file and do the rename for you. +backup file and do the replacement for you. ```bash sed -i.old s/\/${S3_BUCKET}/g ./nvme-ephemeral-storage.yaml @@ -343,16 +343,31 @@ kubectl apply -f nvme-ephemeral-storage.yaml Example PySpark job that uses EBS ON_DEMAND volumes using Dynamic PVCs for Driver and Executor shuffle storage ```bash -cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc/ +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc ``` -Update the variables in Shell script and execute +Run the *taxi-trip-execute.sh* script with the following input. You will use the S3_BUCKET variable created earlier. Additionally, you must change YOUR_REGION_HERE with the region of your choice, *us-west-2* for example. + +This script will download some example taxi trip data and create duplicates of +it in order to increase the size a bit. This will take a bit of time and will +require a relatively fast internet connection. ```bash -./taxi-trip-execute.sh +./taxi-trip-execute.sh ${S3_BUCKET} YOUR_REGION_HERE ``` -Update YAML file and run the below command +Once our sample data is uploaded you can run the Spark job. You will need to +replace the *\* placeholders in this file with the name of the bucket +created earlier. You can get that value by running `echo $S3_BUCKET`. + +To do this automatically you can run the following, which will create a .old +backup file and do the replacement for you. + +```bash +sed -i.old s/\/${S3_BUCKET}/g ./ebs-storage-dynamic-pvc.yaml +``` + +Now that the bucket name is in place you can create the Spark job. ```bash kubectl apply -f ebs-storage-dynamic-pvc.yaml From 8731ef8eabe4950a7208eca42a8b26be8efecf6d Mon Sep 17 00:00:00 2001 From: Ray Krueger Date: Mon, 1 Apr 2024 19:54:19 +0200 Subject: [PATCH 07/11] Update spark operator benchmark example The benchmark docs for spark operator point to the wrong file names. 
Unify the Bucket place holder to Update the benchmark docs with clear instructions --- .../benchmark/tpcds-benchmark-3t.yaml | 4 ++-- .../tpcds-benchmark-data-generation-3t.yaml | 4 ++-- .../data-analytics/spark-operator-yunikorn.md | 19 ++++++++++++++----- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-3t.yaml b/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-3t.yaml index 51b3f2a8b..9e1d37685 100644 --- a/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-3t.yaml +++ b/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-3t.yaml @@ -1,6 +1,6 @@ # NOTE: This example requires the following prerequisites before executing the jobs # 1. Ensure spark-team-a name space exists -# 2. replace with your bucket name +# 2. replace with your bucket name # 3. Ensure you run "analytics/spark-k8s-operator/spark-samples/tpcds-benchmark-data-generation-1t.yaml" which generates 3 TB input data --- @@ -26,7 +26,7 @@ spec: # TPC-DS data location - "s3://blogpost-sparkoneks-us-east-1/blog/BLOG_TPCDS-TEST-3T-partitioned" # results location - - "s3:///TPCDS-TEST-3T-RESULT" + - "s3:///TPCDS-TEST-3T-RESULT" # Path to kit in the docker image - "/opt/tpcds-kit/tools" # Data Format diff --git a/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-data-generation-3t.yaml b/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-data-generation-3t.yaml index 14eda2f98..d6d66fdba 100644 --- a/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-data-generation-3t.yaml +++ b/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-data-generation-3t.yaml @@ -1,6 +1,6 @@ # NOTE: This example requires the following prerequisites before executing the jobs # 1. Ensure spark-team-a name space exists -# 2. replace with your bucket name +# 2. replace with your bucket name --- apiVersion: "sparkoperator.k8s.io/v1beta2" @@ -23,7 +23,7 @@ spec: mainApplicationFile: local:///opt/spark/examples/jars/eks-spark-benchmark-assembly-1.0.jar arguments: # TPC-DS data location - - "s3a:///TPCDS-TEST-3T" + - "s3a:///TPCDS-TEST-3T" # Path to kit in the docker image - "/opt/tpcds-kit/tools" # Data Format diff --git a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md index df0395731..b342a7d7b 100644 --- a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md +++ b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md @@ -472,21 +472,30 @@ kubectl apply -f nvme-storage-yunikorn-gang-scheduling.yaml Example for TPCDS Benchmark test}> -Check the pre-requisites in yaml file before running this job. +Be sure that the S3_BUCKET variable is set in the terminal session. If it is +not, see the Deployment documentation above. ```bash -cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/benchmark +if [ -z "$S3_BUCKET" ] ; then + printf "\nS3_BUCKET is NOT set." +else + printf "\nS3_BUCKET is set, rock on." +fi ``` -Step1: Benchmark test data generation +If S3_HOME is set we can proceed into our example. 
+ +```bash +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/benchmark +``` ```bash -kubectl apply -f tpcds-benchmark-data-generation-1t +kubectl apply -f tpcds-benchmark-data-generation-3t.yaml ``` Step2: Execute Benchmark test ```bash -kubectl apply -f tpcds-benchmark-1t.yaml +kubectl apply -f tpcds-benchmark-3t.yaml ``` From e04bf95410f057eec47c5e8850ea2647dad464d9 Mon Sep 17 00:00:00 2001 From: Ray Krueger Date: Tue, 2 Apr 2024 14:38:14 -0500 Subject: [PATCH 08/11] Update Yunikorn and Karpenter spark operator docs This closes out the Karpenter updates. --- .../data-analytics/spark-operator-yunikorn.md | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md index b342a7d7b..3339c4af2 100644 --- a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md +++ b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md @@ -374,19 +374,35 @@ kubectl apply -f ebs-storage-dynamic-pvc.yaml ``` ## Apache YuniKorn Gang Scheduling with NVMe based SSD disk for shuffle storage + Gang Scheduling Spark jobs using Apache YuniKorn and Spark Operator ```bash cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-yunikorn-gang-scheduling/ ``` -Update the variables in Shell script and execute +Run the *taxi-trip-execute.sh* script with the following input. You will use the S3_BUCKET variable created earlier. Additionally, you must change YOUR_REGION_HERE with the region of your choice, *us-west-2* for example. + +This script will download some example taxi trip data and create duplicates of +it in order to increase the size a bit. This will take a bit of time and will +require a relatively fast internet connection. ```bash -./taxi-trip-execute.sh +./taxi-trip-execute.sh ${S3_BUCKET} YOUR_REGION_HERE ``` -Update YAML file and run the below command +Once our sample data is uploaded you can run the Spark job. You will need to +replace the *\* placeholders in this file with the name of the bucket +created earlier. You can get that value by running `echo $S3_BUCKET`. + +To do this automatically you can run the following, which will create a .old +backup file and do the replacement for you. + +```bash +sed -i.old s/\/${S3_BUCKET}/g ./nvme-storage-yunikorn-gang-scheduling.yaml +``` + +Now that the bucket name is in place you can create the Spark job. 
```bash kubectl apply -f nvme-storage-yunikorn-gang-scheduling.yaml From 71d7c0126e27126578f660667bfe90fa4bcf9605 Mon Sep 17 00:00:00 2001 From: Ray Krueger Date: Tue, 2 Apr 2024 15:39:34 -0500 Subject: [PATCH 09/11] Move big yaml blocks to includes --- .../data-analytics/_graviton_nodepool.md | 71 +++++++++ .../_memory_optimized_nodepool.md | 70 +++++++++ .../data-analytics/spark-operator-yunikorn.md | 147 +----------------- 3 files changed, 147 insertions(+), 141 deletions(-) create mode 100644 website/docs/blueprints/data-analytics/_graviton_nodepool.md create mode 100644 website/docs/blueprints/data-analytics/_memory_optimized_nodepool.md diff --git a/website/docs/blueprints/data-analytics/_graviton_nodepool.md b/website/docs/blueprints/data-analytics/_graviton_nodepool.md new file mode 100644 index 000000000..a783e3608 --- /dev/null +++ b/website/docs/blueprints/data-analytics/_graviton_nodepool.md @@ -0,0 +1,71 @@ +```yaml + name: spark-graviton-memory-optimized + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + tags: + Name: "${module.eks.cluster_name}-private*" + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + cat <<-EOF > /etc/profile.d/bootstrap.sh + #!/bin/sh + + + # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. + # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 + # This will create a RAID volume and mount it at /mnt/k8s-disks/0 + # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods + # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes + export LOCAL_DISKS='raid0' + EOF + + # Source extra environment variables in bootstrap script + sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh + + --BOUNDARY-- + + + nodePool: + labels: + - type: karpenter + - NodeGroupType: SparkGravitonMemoryOptimized + - multiArch: Spark + requirements: + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + - key: "kubernetes.io/arch" + operator: In + values: ["arm64"] + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["r"] + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["r6gd"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "32"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "karpenter.k8s.aws/instance-generation" + operator: Gt + values: ["2"] + limits: + cpu: 1000 + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s + expireAfter: 720h + weight: 50 +``` \ No newline at end of file diff --git a/website/docs/blueprints/data-analytics/_memory_optimized_nodepool.md b/website/docs/blueprints/data-analytics/_memory_optimized_nodepool.md new file mode 100644 index 000000000..4117c7bbe --- /dev/null +++ b/website/docs/blueprints/data-analytics/_memory_optimized_nodepool.md @@ -0,0 +1,70 @@ +```yaml + name: spark-memory-optimized + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + tags: + Name: 
"${module.eks.cluster_name}-private*" + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + cat <<-EOF > /etc/profile.d/bootstrap.sh + #!/bin/sh + + + # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. + # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 + # This will create a RAID volume and mount it at /mnt/k8s-disks/0 + # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods + # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes + export LOCAL_DISKS='raid0' + EOF + + # Source extra environment variables in bootstrap script + sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh + + --BOUNDARY-- + + nodePool: + labels: + - type: karpenter + - NodeGroupType: SparkComputeOptimized + - multiArch: Spark + requirements: + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["r"] + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["r5d"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "32"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "karpenter.k8s.aws/instance-generation" + operator: Gt + values: ["2"] + limits: + cpu: 1000 + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s + expireAfter: 720h + weight: 100 +``` \ No newline at end of file diff --git a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md index 3339c4af2..652f4f288 100644 --- a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md +++ b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md @@ -6,6 +6,9 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import CollapsibleContent from '../../../src/components/CollapsibleContent'; +import GravitonNodepool from './_graviton_nodepool.md' +import MemoryOptimizedNodepool from './_memory_optimized_nodepool.md' + import CodeBlock from '@theme/CodeBlock'; # Spark Operator with YuniKorn @@ -29,76 +32,8 @@ In this tutorial, you will use Karpenter Nodepools that uses memory optimized in
To view Karpenter Nodepool for memory optimized instances, Click to toggle content! -```yaml - name: spark-memory-optimized - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} - subnetSelectorTerms: - tags: - Name: "${module.eks.cluster_name}-private*" - securityGroupSelectorTerms: - tags: - Name: ${module.eks.cluster_name}-node - userData: | - MIME-Version: 1.0 - Content-Type: multipart/mixed; boundary="BOUNDARY" - - --BOUNDARY - Content-Type: text/x-shellscript; charset="us-ascii" - - cat <<-EOF > /etc/profile.d/bootstrap.sh - #!/bin/sh - - - # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. - # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 - # This will create a RAID volume and mount it at /mnt/k8s-disks/0 - # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods - # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes - export LOCAL_DISKS='raid0' - EOF - - # Source extra environment variables in bootstrap script - sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh - - --BOUNDARY-- - - nodePool: - labels: - - type: karpenter - - NodeGroupType: SparkComputeOptimized - - multiArch: Spark - requirements: - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] - - key: "karpenter.k8s.aws/instance-category" - operator: In - values: ["r"] - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["r5d"] - - key: "karpenter.k8s.aws/instance-cpu" - operator: In - values: ["4", "8", "16", "32"] - - key: "karpenter.k8s.aws/instance-hypervisor" - operator: In - values: ["nitro"] - - key: "karpenter.k8s.aws/instance-generation" - operator: Gt - values: ["2"] - limits: - cpu: 1000 - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 30s - expireAfter: 720h - weight: 100 -``` + +
@@ -111,77 +46,7 @@ In this yaml, you will use Karpenter Nodepool that uses Graviton memory optimize
To view Karpenter Nodepool for Graviton memory optimized instances, Click to toggle content! -```yaml - name: spark-graviton-memory-optimized - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} - subnetSelectorTerms: - tags: - Name: "${module.eks.cluster_name}-private*" - securityGroupSelectorTerms: - tags: - Name: ${module.eks.cluster_name}-node - userData: | - MIME-Version: 1.0 - Content-Type: multipart/mixed; boundary="BOUNDARY" - - --BOUNDARY - Content-Type: text/x-shellscript; charset="us-ascii" - - cat <<-EOF > /etc/profile.d/bootstrap.sh - #!/bin/sh - - - # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. - # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 - # This will create a RAID volume and mount it at /mnt/k8s-disks/0 - # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods - # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes - export LOCAL_DISKS='raid0' - EOF - - # Source extra environment variables in bootstrap script - sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh - - --BOUNDARY-- - - - nodePool: - labels: - - type: karpenter - - NodeGroupType: SparkGravitonMemoryOptimized - - multiArch: Spark - requirements: - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - - key: "kubernetes.io/arch" - operator: In - values: ["arm64"] - - key: "karpenter.k8s.aws/instance-category" - operator: In - values: ["r"] - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["r6gd"] - - key: "karpenter.k8s.aws/instance-cpu" - operator: In - values: ["4", "8", "16", "32"] - - key: "karpenter.k8s.aws/instance-hypervisor" - operator: In - values: ["nitro"] - - key: "karpenter.k8s.aws/instance-generation" - operator: Gt - values: ["2"] - limits: - cpu: 1000 - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 30s - expireAfter: 720h - weight: 50 -``` +
From 2519999921ddcd9421292e7d6a1968df7822abae Mon Sep 17 00:00:00 2001 From: Ray Krueger Date: Tue, 2 Apr 2024 16:26:15 -0500 Subject: [PATCH 10/11] Move repetative paragraphs to partial includes --- .../_compute_optimized_nodepool.md | 141 +++++++++++ .../_replace_s3_bucket_placeholders.mdx | 6 + .../data-analytics/_taxi_trip_exec.md | 9 + .../data-analytics/spark-operator-yunikorn.md | 226 +++--------------- 4 files changed, 184 insertions(+), 198 deletions(-) create mode 100644 website/docs/blueprints/data-analytics/_compute_optimized_nodepool.md create mode 100644 website/docs/blueprints/data-analytics/_replace_s3_bucket_placeholders.mdx create mode 100644 website/docs/blueprints/data-analytics/_taxi_trip_exec.md diff --git a/website/docs/blueprints/data-analytics/_compute_optimized_nodepool.md b/website/docs/blueprints/data-analytics/_compute_optimized_nodepool.md new file mode 100644 index 000000000..385683085 --- /dev/null +++ b/website/docs/blueprints/data-analytics/_compute_optimized_nodepool.md @@ -0,0 +1,141 @@ +```yaml + # spark-compute-optimized + name: spark-compute-optimized + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + tags: + Name: "${module.eks.cluster_name}-private*" + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + cat <<-EOF > /etc/profile.d/bootstrap.sh + #!/bin/sh + + + # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. + # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 + # This will create a RAID volume and mount it at /mnt/k8s-disks/0 + # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods + # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes + export LOCAL_DISKS='raid0' + EOF + + # Source extra environment variables in bootstrap script + sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh + + --BOUNDARY-- + + nodePool: + labels: + - type: karpenter + - NodeGroupType: SparkComputeOptimized + - multiArch: Spark + requirements: + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["c"] + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["c5d"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "36"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "karpenter.k8s.aws/instance-generation" + operator: Gt + values: ["2"] + limits: + cpu: 20 # Change this to 1000 or more for production according to your needs + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s + expireAfter: 720h + weight: 100 + + # spark-graviton-memory-optimized Nodepool + + name: spark-graviton-memory-optimized + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + tags: + Name: "${module.eks.cluster_name}-private*" + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + userData: | + 
MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + cat <<-EOF > /etc/profile.d/bootstrap.sh + #!/bin/sh + + + # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. + # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 + # This will create a RAID volume and mount it at /mnt/k8s-disks/0 + # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods + # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes + export LOCAL_DISKS='raid0' + EOF + + # Source extra environment variables in bootstrap script + sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh + + --BOUNDARY-- + nodePool: + labels: + - type: karpenter + - NodeGroupType: SparkGravitonMemoryOptimized + - multiArch: Spark + requirements: + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + - key: "kubernetes.io/arch" + operator: In + values: ["arm64"] + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["r"] + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["r6gd"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "32"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "karpenter.k8s.aws/instance-generation" + operator: Gt + values: ["2"] + limits: + cpu: 1000 + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s + expireAfter: 720h + weight: 50 +``` \ No newline at end of file diff --git a/website/docs/blueprints/data-analytics/_replace_s3_bucket_placeholders.mdx b/website/docs/blueprints/data-analytics/_replace_s3_bucket_placeholders.mdx new file mode 100644 index 000000000..e7888bc98 --- /dev/null +++ b/website/docs/blueprints/data-analytics/_replace_s3_bucket_placeholders.mdx @@ -0,0 +1,6 @@ +Once our sample data is uploaded you can run the Spark job. You will need to +replace the *\* placeholders in this file with the name of the bucket +created earlier. You can get that value by running `echo $S3_BUCKET`. + +To do this automatically you can run the following, which will create a .old +backup file and do the replacement for you. \ No newline at end of file diff --git a/website/docs/blueprints/data-analytics/_taxi_trip_exec.md b/website/docs/blueprints/data-analytics/_taxi_trip_exec.md new file mode 100644 index 000000000..ebbdfcc31 --- /dev/null +++ b/website/docs/blueprints/data-analytics/_taxi_trip_exec.md @@ -0,0 +1,9 @@ +Run the *taxi-trip-execute.sh* script with the following input. You will use the *S3_BUCKET* variable created earlier. Additionally, you must change YOUR_REGION_HERE with the region of your choice, *us-west-2* for example. + +This script will download some example taxi trip data and create duplicates of +it in order to increase the size a bit. This will take a bit of time and will +require a relatively fast internet connection. 
+ +```bash +./taxi-trip-execute.sh ${S3_BUCKET} YOUR_REGION_HERE +``` \ No newline at end of file diff --git a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md index 652f4f288..a0cfef8b9 100644 --- a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md +++ b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md @@ -8,6 +8,9 @@ import CollapsibleContent from '../../../src/components/CollapsibleContent'; import GravitonNodepool from './_graviton_nodepool.md' import MemoryOptimizedNodepool from './_memory_optimized_nodepool.md' +import ComputeOptimizedNodepool from './_compute_optimized_nodepool.md' +import TaxiTripExecute from './_taxi_trip_exec.md' +import ReplaceS3BucketPlaceholders from './_replace_s3_bucket_placeholders.mdx'; import CodeBlock from '@theme/CodeBlock'; @@ -177,23 +180,10 @@ Example PySpark job that uses NVMe based ephemeral SSD disk for Driver and Execu cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/ ``` -Run the *taxi-trip-execute.sh* script with the following input. You will use the S3_BUCKET variable created earlier. Additionally, you must change YOUR_REGION_HERE with the region of your choice, *us-west-2* for example. - -This script will download some example taxi trip data and create duplicates of -it in order to increase the size a bit. This will take a bit of time and will -require a relatively fast internet connection. - -```bash -./taxi-trip-execute.sh ${S3_BUCKET} YOUR_REGION_HERE -``` - -Once our sample data is uploaded you can run the Spark job. You will need to -replace the *\* placeholders in this file with the name of the bucket -created earlier. You can get that value by running `echo $S3_BUCKET`. - -To do this automatically you can run the following, which will create a .old -backup file and do the replacement for you. + + + ```bash sed -i.old s/\/${S3_BUCKET}/g ./nvme-ephemeral-storage.yaml ``` @@ -211,23 +201,10 @@ Example PySpark job that uses EBS ON_DEMAND volumes using Dynamic PVCs for Drive cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc ``` -Run the *taxi-trip-execute.sh* script with the following input. You will use the S3_BUCKET variable created earlier. Additionally, you must change YOUR_REGION_HERE with the region of your choice, *us-west-2* for example. - -This script will download some example taxi trip data and create duplicates of -it in order to increase the size a bit. This will take a bit of time and will -require a relatively fast internet connection. - -```bash -./taxi-trip-execute.sh ${S3_BUCKET} YOUR_REGION_HERE -``` - -Once our sample data is uploaded you can run the Spark job. You will need to -replace the *\* placeholders in this file with the name of the bucket -created earlier. You can get that value by running `echo $S3_BUCKET`. - -To do this automatically you can run the following, which will create a .old -backup file and do the replacement for you. + + + ```bash sed -i.old s/\/${S3_BUCKET}/g ./ebs-storage-dynamic-pvc.yaml ``` @@ -246,23 +223,10 @@ Gang Scheduling Spark jobs using Apache YuniKorn and Spark Operator cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-yunikorn-gang-scheduling/ ``` -Run the *taxi-trip-execute.sh* script with the following input. You will use the S3_BUCKET variable created earlier. 
Additionally, you must change YOUR_REGION_HERE with the region of your choice, *us-west-2* for example. - -This script will download some example taxi trip data and create duplicates of -it in order to increase the size a bit. This will take a bit of time and will -require a relatively fast internet connection. - -```bash -./taxi-trip-execute.sh ${S3_BUCKET} YOUR_REGION_HERE -``` - -Once our sample data is uploaded you can run the Spark job. You will need to -replace the *\* placeholders in this file with the name of the bucket -created earlier. You can get that value by running `echo $S3_BUCKET`. - -To do this automatically you can run the following, which will create a .old -backup file and do the replacement for you. + + + ```bash sed -i.old s/\/${S3_BUCKET}/g ./nvme-storage-yunikorn-gang-scheduling.yaml ``` @@ -299,13 +263,15 @@ Example PySpark job that uses NVMe based ephemeral SSD disk for Driver and Execu cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-ephemeral-storage ``` -Update the variables in Shell script and execute + + + ```bash -./taxi-trip-execute.sh +sed -i.old s/\/${S3_BUCKET}/g ./nvme-ephemeral-storage.yaml ``` -Update YAML file and run the below command +Now that the bucket name is in place you can create the Spark job. ```bash kubectl apply -f nvme-ephemeral-storage.yaml @@ -318,13 +284,15 @@ Example PySpark job that uses EBS ON_DEMAND volumes using Dynamic PVCs for Drive cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/ebs-storage-dynamic-pvc ``` -Update the variables in Shell script and execute + + + ```bash -./taxi-trip-execute.sh +sed -i.old s/\/${S3_BUCKET}/g ./ebs-storage-dynamic-pvc.yaml ``` -Update YAML file and run the below command +Now that the bucket name is in place you can create the Spark job. ```bash kubectl apply -f ebs-storage-dynamic-pvc.yaml @@ -337,13 +305,15 @@ Gang Scheduling Spark jobs using Apache YuniKorn and Spark Operator cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-yunikorn-gang-scheduling ``` -Update the variables in Shell script and execute + + + ```bash -./taxi-trip-execute.sh +sed -i.old s/\/${S3_BUCKET}/g ./nvme-storage-yunikorn-gang-scheduling.yaml ``` -Update YAML file and run the below command +Now that the bucket name is in place you can create the Spark job. ```bash kubectl apply -f nvme-storage-yunikorn-gang-scheduling.yaml @@ -397,147 +367,7 @@ Graviton Nodepool (ARM): Set the weight of the Graviton Nodepool to `100`. This Intel Nodepool (AMD): Set the weight of the Intel Nodepool to `50`. This ensures that Karpenter will fall back to the Intel Nodepool when Graviton instances are either unavailable or reach their maximum CPU capacity. -```yaml - # spark-compute-optimized - name: spark-compute-optimized - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} - subnetSelectorTerms: - tags: - Name: "${module.eks.cluster_name}-private*" - securityGroupSelectorTerms: - tags: - Name: ${module.eks.cluster_name}-node - userData: | - MIME-Version: 1.0 - Content-Type: multipart/mixed; boundary="BOUNDARY" - - --BOUNDARY - Content-Type: text/x-shellscript; charset="us-ascii" - - cat <<-EOF > /etc/profile.d/bootstrap.sh - #!/bin/sh - - - # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. 
- # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 - # This will create a RAID volume and mount it at /mnt/k8s-disks/0 - # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods - # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes - export LOCAL_DISKS='raid0' - EOF - - # Source extra environment variables in bootstrap script - sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh - - --BOUNDARY-- - - nodePool: - labels: - - type: karpenter - - NodeGroupType: SparkComputeOptimized - - multiArch: Spark - requirements: - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] - - key: "karpenter.k8s.aws/instance-category" - operator: In - values: ["c"] - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["c5d"] - - key: "karpenter.k8s.aws/instance-cpu" - operator: In - values: ["4", "8", "16", "36"] - - key: "karpenter.k8s.aws/instance-hypervisor" - operator: In - values: ["nitro"] - - key: "karpenter.k8s.aws/instance-generation" - operator: Gt - values: ["2"] - limits: - cpu: 20 # Change this to 1000 or more for production according to your needs - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 30s - expireAfter: 720h - weight: 100 - - # spark-graviton-memory-optimized Nodepool - - name: spark-graviton-memory-optimized - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} - subnetSelectorTerms: - tags: - Name: "${module.eks.cluster_name}-private*" - securityGroupSelectorTerms: - tags: - Name: ${module.eks.cluster_name}-node - userData: | - MIME-Version: 1.0 - Content-Type: multipart/mixed; boundary="BOUNDARY" - - --BOUNDARY - Content-Type: text/x-shellscript; charset="us-ascii" - - cat <<-EOF > /etc/profile.d/bootstrap.sh - #!/bin/sh - - - # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. - # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 - # This will create a RAID volume and mount it at /mnt/k8s-disks/0 - # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods - # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes - export LOCAL_DISKS='raid0' - EOF - - # Source extra environment variables in bootstrap script - sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh - - --BOUNDARY-- - nodePool: - labels: - - type: karpenter - - NodeGroupType: SparkGravitonMemoryOptimized - - multiArch: Spark - requirements: - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - - key: "kubernetes.io/arch" - operator: In - values: ["arm64"] - - key: "karpenter.k8s.aws/instance-category" - operator: In - values: ["r"] - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["r6gd"] - - key: "karpenter.k8s.aws/instance-cpu" - operator: In - values: ["4", "8", "16", "32"] - - key: "karpenter.k8s.aws/instance-hypervisor" - operator: In - values: ["nitro"] - - key: "karpenter.k8s.aws/instance-generation" - operator: Gt - values: ["2"] - limits: - cpu: 1000 - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 30s - expireAfter: 720h - weight: 50 -``` +
From 912d47f5fd3881eba652a9a24dc24e0c6f318a82 Mon Sep 17 00:00:00 2001 From: Ray Krueger Date: Tue, 2 Apr 2024 17:22:15 -0500 Subject: [PATCH 11/11] S3_BUCKET not S3_HOME --- .../docs/blueprints/data-analytics/spark-operator-yunikorn.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md index a0cfef8b9..1ee3076da 100644 --- a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md +++ b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md @@ -334,7 +334,7 @@ else fi ``` -If S3_HOME is set we can proceed into our example. +If *S3_BUCKET* is set we can proceed into our example. ```bash cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/benchmark
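
For reference, a condensed sketch of the workflow these docs describe, from clone through a first example job. This is a hedged sketch under stated assumptions, not part of any patch above: the region (`us-west-2`), the choice of the `nvme-ephemeral-storage` Karpenter example, and the `<S3_BUCKET>` placeholder token are assumptions — the angle-bracket token is stripped in this copy, but the commit messages ("Rename ENTER_S3_BUCKET to S3_BUCKET") indicate the YAML files use `<S3_BUCKET>` as the placeholder.

```bash
# Clone the repo and record its location so later commands can be copy/pasted.
git clone https://github.com/awslabs/data-on-eks.git
cd data-on-eks
export DOEKS_HOME=$(pwd)

# Provision the blueprint.
cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator
chmod +x install.sh && ./install.sh

# Capture the S3 bucket created by the blueprint (Terraform output name taken from the docs above).
export S3_BUCKET=$(terraform output -raw s3_bucket_id_spark_history_server)
echo "${S3_BUCKET}"

# Stage the sample taxi-trip data and run one example job (NVMe ephemeral storage, Karpenter).
cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage
./taxi-trip-execute.sh "${S3_BUCKET}" us-west-2   # us-west-2 is an assumed region; substitute your own

# Replace the <S3_BUCKET> placeholder (assumed token, see lead-in) and submit the job.
sed -i.old "s|<S3_BUCKET>|${S3_BUCKET}|g" nvme-ephemeral-storage.yaml
kubectl apply -f nvme-ephemeral-storage.yaml

# Watch the driver and executor pods come up.
kubectl get pods -n spark-team-a -w
```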