From 9e284c287eac07abe6732490729ad24af9a802c5 Mon Sep 17 00:00:00 2001 From: Ray Krueger Date: Mon, 1 Apr 2024 13:05:08 +0000 Subject: [PATCH 01/11] WIP update spark operator docs --- .../data-analytics/spark-operator-yunikorn.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md index c00bd8b41..75397feb2 100644 --- a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md +++ b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md @@ -260,12 +260,14 @@ Clone the repository ```bash git clone https://github.com/awslabs/data-on-eks.git +cd data-on-eks +export DOEKS_HOME=$(pwd) ``` Navigate into one of the example directories and run `install.sh` script ```bash -cd data-on-eks/analytics/terraform/spark-k8s-operator +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator chmod +x install.sh ./install.sh ``` @@ -277,7 +279,7 @@ chmod +x install.sh Navigate to example directory and submit the Spark job. ```bash -cd data-on-eks/analytics/terraform/spark-k8s-operator/examples/karpenter +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter kubectl apply -f pyspark-pi-job.yaml ``` @@ -295,7 +297,7 @@ You can try the following examples to leverage multiple Karpenter Nodepools, EBS Example PySpark job that uses NVMe based ephemeral SSD disk for Driver and Executor shuffle storage ```bash - cd analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/ + cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/ ``` Update the variables in Shell script and execute @@ -314,7 +316,7 @@ Update YAML file and run the below command Example PySpark job that uses EBS ON_DEMAND volumes using Dynamic PVCs for Driver and Executor shuffle storage ```bash - cd analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc/ + cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc/ ``` Update the variables in Shell script and execute @@ -355,7 +357,7 @@ Update YAML file and run the below command Navigate to example directory and submit the Spark job. ```bash -cd data-on-eks/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler kubectl apply -f pyspark-pi-job.yaml ``` From 3d7956739ab0ca23d26f1fafaf8701db047c9c30 Mon Sep 17 00:00:00 2001 From: Ray Krueger Date: Mon, 1 Apr 2024 13:05:08 +0000 Subject: [PATCH 02/11] Create a DOEKS_HOME environment variable Adding a DOEKS_HOME varaible set to the data-on-eks directory means that each command in the spark on eks docs can be copied and pasted directly. Originally the docs simply said `cd analytics/...` repeastedly and that path was never correct. --- .../data-analytics/spark-operator-yunikorn.md | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md index 75397feb2..dd743408d 100644 --- a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md +++ b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md @@ -256,7 +256,7 @@ Ensure that you have installed the following tools on your machine. ### Deploy -Clone the repository +Clone the repository. 
```bash git clone https://github.com/awslabs/data-on-eks.git @@ -264,7 +264,10 @@ cd data-on-eks export DOEKS_HOME=$(pwd) ``` -Navigate into one of the example directories and run `install.sh` script +If DOEKS_HOME is ever unset, you can always set it manually using `export +DATA_ON_EKS=$(pwd)` from your data-on-eks directory. + +Navigate into one of the example directories and run `install.sh` script. ```bash cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator @@ -335,7 +338,7 @@ Update YAML file and run the below command Gang Scheduling Spark jobs using Apache YuniKorn and Spark Operator ```bash - cd analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-yunikorn-gang-scheduling/ + cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-yunikorn-gang-scheduling/ ``` Update the variables in Shell script and execute @@ -373,7 +376,7 @@ kubectl get pods -n spark-team-a -w Example PySpark job that uses NVMe based ephemeral SSD disk for Driver and Executor shuffle storage ```bash - cd analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-ephemeral-storage + cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-ephemeral-storage ``` Update the variables in Shell script and execute @@ -392,7 +395,7 @@ Update YAML file and run the below command Example PySpark job that uses EBS ON_DEMAND volumes using Dynamic PVCs for Driver and Executor shuffle storage ```bash - cd analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/ebs-storage-dynamic-pvc + cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/ebs-storage-dynamic-pvc ``` Update the variables in Shell script and execute @@ -411,7 +414,7 @@ Update YAML file and run the below command Gang Scheduling Spark jobs using Apache YuniKorn and Spark Operator ```bash - cd analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-yunikorn-gang-scheduling + cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-yunikorn-gang-scheduling ``` Update the variables in Shell script and execute @@ -433,7 +436,7 @@ Update YAML file and run the below command Check the pre-requisites in yaml file before running this job. ```bash -cd analytics/terraform/spark-k8s-operator/examples/benchmark +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/benchmark ``` Step1: Benchmark test data generation @@ -615,7 +618,7 @@ Intel Nodepool (AMD): Set the weight of the Intel Nodepool to `50`. This ensures This script will cleanup the environment using `-target` option to ensure all the resources are deleted in correct order. 
```bash -cd analytics/terraform/spark-k8s-operator && chmod +x cleanup.sh +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator && chmod +x cleanup.sh ./cleanup.sh ``` From caf5c2fec73b9fab92579de8aa33bba7f55ce521 Mon Sep 17 00:00:00 2001 From: Ray Krueger Date: Mon, 1 Apr 2024 15:28:17 +0200 Subject: [PATCH 03/11] Fix leading white space on some of the commands --- .../data-analytics/spark-operator-yunikorn.md | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md index dd743408d..81b85c60b 100644 --- a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md +++ b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md @@ -300,13 +300,13 @@ You can try the following examples to leverage multiple Karpenter Nodepools, EBS Example PySpark job that uses NVMe based ephemeral SSD disk for Driver and Executor shuffle storage ```bash - cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/ +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/ ``` Update the variables in Shell script and execute ```bash - ./taxi-trip-execute.sh +./taxi-trip-execute.sh ``` Update YAML file and run the below command @@ -319,38 +319,38 @@ Update YAML file and run the below command Example PySpark job that uses EBS ON_DEMAND volumes using Dynamic PVCs for Driver and Executor shuffle storage ```bash - cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc/ +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc/ ``` Update the variables in Shell script and execute ```bash - ./taxi-trip-execute.sh +./taxi-trip-execute.sh ``` Update YAML file and run the below command ```bash - kubectl apply -f ebs-storage-dynamic-pvc.yaml +kubectl apply -f ebs-storage-dynamic-pvc.yaml ``` ## Apache YuniKorn Gang Scheduling with NVMe based SSD disk for shuffle storage Gang Scheduling Spark jobs using Apache YuniKorn and Spark Operator ```bash - cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-yunikorn-gang-scheduling/ +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-yunikorn-gang-scheduling/ ``` Update the variables in Shell script and execute ```bash - ./taxi-trip-execute.sh +./taxi-trip-execute.sh ``` Update YAML file and run the below command ```bash - kubectl apply -f nvme-storage-yunikorn-gang-scheduling.yaml +kubectl apply -f nvme-storage-yunikorn-gang-scheduling.yaml ``` @@ -376,57 +376,57 @@ kubectl get pods -n spark-team-a -w Example PySpark job that uses NVMe based ephemeral SSD disk for Driver and Executor shuffle storage ```bash - cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-ephemeral-storage +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-ephemeral-storage ``` Update the variables in Shell script and execute ```bash - ./taxi-trip-execute.sh +./taxi-trip-execute.sh ``` Update YAML file and run the below command ```bash - kubectl apply -f nvme-ephemeral-storage.yaml +kubectl apply -f nvme-ephemeral-storage.yaml ``` ## EBS Dynamic PVC for shuffle storage Example PySpark job that uses EBS ON_DEMAND volumes using Dynamic PVCs for Driver and Executor shuffle storage ```bash - cd 
${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/ebs-storage-dynamic-pvc +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/ebs-storage-dynamic-pvc ``` Update the variables in Shell script and execute ```bash - ./taxi-trip-execute.sh +./taxi-trip-execute.sh ``` Update YAML file and run the below command ```bash - kubectl apply -f ebs-storage-dynamic-pvc.yaml +kubectl apply -f ebs-storage-dynamic-pvc.yaml ``` ## Apache YuniKorn Gang Scheduling with NVMe based SSD disk for shuffle storage Gang Scheduling Spark jobs using Apache YuniKorn and Spark Operator ```bash - cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-yunikorn-gang-scheduling +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-yunikorn-gang-scheduling ``` Update the variables in Shell script and execute ```bash - ./taxi-trip-execute.sh +./taxi-trip-execute.sh ``` Update YAML file and run the below command ```bash - kubectl apply -f nvme-storage-yunikorn-gang-scheduling.yaml +kubectl apply -f nvme-storage-yunikorn-gang-scheduling.yaml ``` @@ -447,7 +447,7 @@ kubectl apply -f tpcds-benchmark-data-generation-1t Step2: Execute Benchmark test ```bash - kubectl apply -f tpcds-benchmark-1t.yaml +kubectl apply -f tpcds-benchmark-1t.yaml ``` From 22da57e024a90623c95a08b59ec1a283a0c65d01 Mon Sep 17 00:00:00 2001 From: Ray Krueger Date: Mon, 1 Apr 2024 16:30:39 +0200 Subject: [PATCH 04/11] Rename ENTER_S3_BUCKET to S3_BUCKET --- .../nvme-ephemeral-storage.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/nvme-ephemeral-storage.yaml b/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/nvme-ephemeral-storage.yaml index 0a4f3cf8e..1b6546c93 100644 --- a/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/nvme-ephemeral-storage.yaml +++ b/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/nvme-ephemeral-storage.yaml @@ -1,6 +1,6 @@ # Pre-requisite before running this job -# 1/ Open taxi-trip-execute.sh and update and -# 2/ Replace with your S3 bucket created by this blueprint(Check Terraform outputs) +# 1/ Open taxi-trip-execute.sh and update and +# 2/ Replace with your S3 bucket created by this blueprint(Check Terraform outputs) # 3/ execute taxi-trip-execute.sh --- @@ -30,10 +30,10 @@ spec: mode: cluster image: public.ecr.aws/data-on-eks/spark3.3.1-hadoop3.2-aws-java-sdk-bundle-1.12.647 imagePullPolicy: IfNotPresent - mainApplicationFile: "s3a:///taxi-trip/scripts/pyspark-taxi-trip.py" # MainFile is the path to a bundled JAR, Python, or R file of the application + mainApplicationFile: "s3a:///taxi-trip/scripts/pyspark-taxi-trip.py" # MainFile is the path to a bundled JAR, Python, or R file of the application arguments: - - "s3a:///taxi-trip/input/" - - "s3a:///taxi-trip/output/" + - "s3a:///taxi-trip/input/" + - "s3a:///taxi-trip/output/" hadoopConf: "fs.s3a.aws.credentials.provider": "com.amazonaws.auth.WebIdentityTokenCredentialsProvider" "fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem" @@ -55,7 +55,7 @@ spec: # Spark Event logs "spark.eventLog.enabled": "true" - "spark.eventLog.dir": "s3a:///spark-event-logs" + "spark.eventLog.dir": "s3a:///spark-event-logs" "spark.eventLog.rolling.enabled": "true" "spark.eventLog.rolling.maxFileSize": "64m" # 
"spark.history.fs.eventLog.rolling.maxFilesToRetain": 100 From 4be92d1b3d40183573d7c44785cb7ec15088dcda Mon Sep 17 00:00:00 2001 From: Ray Krueger Date: Mon, 1 Apr 2024 16:57:49 +0200 Subject: [PATCH 05/11] Provide an example of using sed for bucket names I am unifying all the examples to use as the placeholder in files. I have added documentation to show how to use sed to do the varaiable replaement rather than putting the user through manual efforts. --- .gitignore | 1 + .../nvme-ephemeral-storage.yaml | 4 +-- .../data-analytics/spark-operator-yunikorn.md | 32 ++++++++++++++++--- 3 files changed, 30 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 095c4d90c..b112a3d20 100755 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ .DS_Store .idea .build +*.old # ignore the lock file website/package-lock.json diff --git a/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/nvme-ephemeral-storage.yaml b/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/nvme-ephemeral-storage.yaml index 1b6546c93..6dc4407f7 100644 --- a/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/nvme-ephemeral-storage.yaml +++ b/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/nvme-ephemeral-storage.yaml @@ -1,7 +1,5 @@ # Pre-requisite before running this job -# 1/ Open taxi-trip-execute.sh and update and -# 2/ Replace with your S3 bucket created by this blueprint(Check Terraform outputs) -# 3/ execute taxi-trip-execute.sh +# Replace with your S3 bucket created by this blueprint(Check Terraform outputs) --- apiVersion: "sparkoperator.k8s.io/v1beta2" diff --git a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md index 81b85c60b..895d98039 100644 --- a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md +++ b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md @@ -275,6 +275,15 @@ chmod +x install.sh ./install.sh ``` +Now create an S3_BUCKET variable that holds the name of the bucket created +during the install. This bucket will be used in later examples to store output +data. If S3_BUCKET is ever unset, you can run the following commands again. + +```bash +export S3_BUCKET=$(terraform output -raw s3_bucket_id_spark_history_server) +echo $S3_BUCKET +``` + Execute Sample Spark job with Karpenter}> @@ -303,16 +312,31 @@ Example PySpark job that uses NVMe based ephemeral SSD disk for Driver and Execu cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/ ``` -Update the variables in Shell script and execute +Run the *taxi-trip-execute.sh* script with the following input. You will use the S3_BUCKET variable created earlier. Additionally, you must change YOUR_REGION_HERE with the region of your choice, *us-west-2* for example. + +This script will download some example taxi trip data and create duplicates of +it in order to increase the size a bit. This will take a bit of time and will +require a relatively fast internet connection. ```bash -./taxi-trip-execute.sh +./taxi-trip-execute.sh ${S3_BUCKET} YOUR_REGION_HERE ``` -Update YAML file and run the below command +Once our sample data is uploaded you can run the Spark job. You will need to +replace the ** placeholders in this file with the name of the bucket +created earlier. You can get that value by running `echo $S3_BUCKET`. 
+ +To do this automatically you can run the following, which will create a .old +backup file and do the rename for you. ```bash - kubectl apply -f nvme-ephemeral-storage.yaml +sed -i.old s/\/${S3_BUCKET}/g ./nvme-ephemeral-storage.yaml +``` + +Now that the bucket name is in place you can create the Spark job. + +```bash +kubectl apply -f nvme-ephemeral-storage.yaml ``` ## EBS Dynamic PVC for shuffle storage From ee99a02ebac2488ec566c4fa0109646061aeaa5c Mon Sep 17 00:00:00 2001 From: Ray Krueger Date: Mon, 1 Apr 2024 19:41:57 +0200 Subject: [PATCH 06/11] Do the same for EBS Dynamic PVC --- .../data-analytics/spark-operator-yunikorn.md | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md index 895d98039..df0395731 100644 --- a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md +++ b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md @@ -323,11 +323,11 @@ require a relatively fast internet connection. ``` Once our sample data is uploaded you can run the Spark job. You will need to -replace the ** placeholders in this file with the name of the bucket +replace the *\* placeholders in this file with the name of the bucket created earlier. You can get that value by running `echo $S3_BUCKET`. To do this automatically you can run the following, which will create a .old -backup file and do the rename for you. +backup file and do the replacement for you. ```bash sed -i.old s/\/${S3_BUCKET}/g ./nvme-ephemeral-storage.yaml @@ -343,16 +343,31 @@ kubectl apply -f nvme-ephemeral-storage.yaml Example PySpark job that uses EBS ON_DEMAND volumes using Dynamic PVCs for Driver and Executor shuffle storage ```bash -cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc/ +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc ``` -Update the variables in Shell script and execute +Run the *taxi-trip-execute.sh* script with the following input. You will use the S3_BUCKET variable created earlier. Additionally, you must change YOUR_REGION_HERE with the region of your choice, *us-west-2* for example. + +This script will download some example taxi trip data and create duplicates of +it in order to increase the size a bit. This will take a bit of time and will +require a relatively fast internet connection. ```bash -./taxi-trip-execute.sh +./taxi-trip-execute.sh ${S3_BUCKET} YOUR_REGION_HERE ``` -Update YAML file and run the below command +Once our sample data is uploaded you can run the Spark job. You will need to +replace the *\* placeholders in this file with the name of the bucket +created earlier. You can get that value by running `echo $S3_BUCKET`. + +To do this automatically you can run the following, which will create a .old +backup file and do the replacement for you. + +```bash +sed -i.old s/\/${S3_BUCKET}/g ./ebs-storage-dynamic-pvc.yaml +``` + +Now that the bucket name is in place you can create the Spark job. ```bash kubectl apply -f ebs-storage-dynamic-pvc.yaml From 8731ef8eabe4950a7208eca42a8b26be8efecf6d Mon Sep 17 00:00:00 2001 From: Ray Krueger Date: Mon, 1 Apr 2024 19:54:19 +0200 Subject: [PATCH 07/11] Update spark operator benchmark example The benchmark docs for spark operator point to the wrong file names. 
Unify the Bucket place holder to Update the benchmark docs with clear instructions --- .../benchmark/tpcds-benchmark-3t.yaml | 4 ++-- .../tpcds-benchmark-data-generation-3t.yaml | 4 ++-- .../data-analytics/spark-operator-yunikorn.md | 19 ++++++++++++++----- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-3t.yaml b/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-3t.yaml index 51b3f2a8b..9e1d37685 100644 --- a/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-3t.yaml +++ b/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-3t.yaml @@ -1,6 +1,6 @@ # NOTE: This example requires the following prerequisites before executing the jobs # 1. Ensure spark-team-a name space exists -# 2. replace with your bucket name +# 2. replace with your bucket name # 3. Ensure you run "analytics/spark-k8s-operator/spark-samples/tpcds-benchmark-data-generation-1t.yaml" which generates 3 TB input data --- @@ -26,7 +26,7 @@ spec: # TPC-DS data location - "s3://blogpost-sparkoneks-us-east-1/blog/BLOG_TPCDS-TEST-3T-partitioned" # results location - - "s3:///TPCDS-TEST-3T-RESULT" + - "s3:///TPCDS-TEST-3T-RESULT" # Path to kit in the docker image - "/opt/tpcds-kit/tools" # Data Format diff --git a/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-data-generation-3t.yaml b/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-data-generation-3t.yaml index 14eda2f98..d6d66fdba 100644 --- a/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-data-generation-3t.yaml +++ b/analytics/terraform/spark-k8s-operator/examples/benchmark/tpcds-benchmark-data-generation-3t.yaml @@ -1,6 +1,6 @@ # NOTE: This example requires the following prerequisites before executing the jobs # 1. Ensure spark-team-a name space exists -# 2. replace with your bucket name +# 2. replace with your bucket name --- apiVersion: "sparkoperator.k8s.io/v1beta2" @@ -23,7 +23,7 @@ spec: mainApplicationFile: local:///opt/spark/examples/jars/eks-spark-benchmark-assembly-1.0.jar arguments: # TPC-DS data location - - "s3a:///TPCDS-TEST-3T" + - "s3a:///TPCDS-TEST-3T" # Path to kit in the docker image - "/opt/tpcds-kit/tools" # Data Format diff --git a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md index df0395731..b342a7d7b 100644 --- a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md +++ b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md @@ -472,21 +472,30 @@ kubectl apply -f nvme-storage-yunikorn-gang-scheduling.yaml Example for TPCDS Benchmark test}> -Check the pre-requisites in yaml file before running this job. +Be sure that the S3_BUCKET variable is set in the terminal session. If it is +not, see the Deployment documentation above. ```bash -cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/benchmark +if [ -z "$S3_BUCKET" ] ; then + printf "\nS3_BUCKET is NOT set." +else + printf "\nS3_BUCKET is set, rock on." +fi ``` -Step1: Benchmark test data generation +If S3_HOME is set we can proceed into our example. 
+ +```bash +cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/benchmark +``` ```bash -kubectl apply -f tpcds-benchmark-data-generation-1t +kubectl apply -f tpcds-benchmark-data-generation-3t.yaml ``` Step2: Execute Benchmark test ```bash -kubectl apply -f tpcds-benchmark-1t.yaml +kubectl apply -f tpcds-benchmark-3t.yaml ``` From e04bf95410f057eec47c5e8850ea2647dad464d9 Mon Sep 17 00:00:00 2001 From: Ray Krueger Date: Tue, 2 Apr 2024 14:38:14 -0500 Subject: [PATCH 08/11] Update Yunikorn and Karpenter spark operator docs This closes out the Karpenter updates. --- .../data-analytics/spark-operator-yunikorn.md | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md index b342a7d7b..3339c4af2 100644 --- a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md +++ b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md @@ -374,19 +374,35 @@ kubectl apply -f ebs-storage-dynamic-pvc.yaml ``` ## Apache YuniKorn Gang Scheduling with NVMe based SSD disk for shuffle storage + Gang Scheduling Spark jobs using Apache YuniKorn and Spark Operator ```bash cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-yunikorn-gang-scheduling/ ``` -Update the variables in Shell script and execute +Run the *taxi-trip-execute.sh* script with the following input. You will use the S3_BUCKET variable created earlier. Additionally, you must change YOUR_REGION_HERE with the region of your choice, *us-west-2* for example. + +This script will download some example taxi trip data and create duplicates of +it in order to increase the size a bit. This will take a bit of time and will +require a relatively fast internet connection. ```bash -./taxi-trip-execute.sh +./taxi-trip-execute.sh ${S3_BUCKET} YOUR_REGION_HERE ``` -Update YAML file and run the below command +Once our sample data is uploaded you can run the Spark job. You will need to +replace the *\* placeholders in this file with the name of the bucket +created earlier. You can get that value by running `echo $S3_BUCKET`. + +To do this automatically you can run the following, which will create a .old +backup file and do the replacement for you. + +```bash +sed -i.old s/\/${S3_BUCKET}/g ./nvme-storage-yunikorn-gang-scheduling.yaml +``` + +Now that the bucket name is in place you can create the Spark job. 
```bash kubectl apply -f nvme-storage-yunikorn-gang-scheduling.yaml From 71d7c0126e27126578f660667bfe90fa4bcf9605 Mon Sep 17 00:00:00 2001 From: Ray Krueger Date: Tue, 2 Apr 2024 15:39:34 -0500 Subject: [PATCH 09/11] Move big yaml blocks to includes --- .../data-analytics/_graviton_nodepool.md | 71 +++++++++ .../_memory_optimized_nodepool.md | 70 +++++++++ .../data-analytics/spark-operator-yunikorn.md | 147 +----------------- 3 files changed, 147 insertions(+), 141 deletions(-) create mode 100644 website/docs/blueprints/data-analytics/_graviton_nodepool.md create mode 100644 website/docs/blueprints/data-analytics/_memory_optimized_nodepool.md diff --git a/website/docs/blueprints/data-analytics/_graviton_nodepool.md b/website/docs/blueprints/data-analytics/_graviton_nodepool.md new file mode 100644 index 000000000..a783e3608 --- /dev/null +++ b/website/docs/blueprints/data-analytics/_graviton_nodepool.md @@ -0,0 +1,71 @@ +```yaml + name: spark-graviton-memory-optimized + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + tags: + Name: "${module.eks.cluster_name}-private*" + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + cat <<-EOF > /etc/profile.d/bootstrap.sh + #!/bin/sh + + + # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. + # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 + # This will create a RAID volume and mount it at /mnt/k8s-disks/0 + # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods + # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes + export LOCAL_DISKS='raid0' + EOF + + # Source extra environment variables in bootstrap script + sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh + + --BOUNDARY-- + + + nodePool: + labels: + - type: karpenter + - NodeGroupType: SparkGravitonMemoryOptimized + - multiArch: Spark + requirements: + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + - key: "kubernetes.io/arch" + operator: In + values: ["arm64"] + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["r"] + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["r6gd"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "32"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "karpenter.k8s.aws/instance-generation" + operator: Gt + values: ["2"] + limits: + cpu: 1000 + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s + expireAfter: 720h + weight: 50 +``` \ No newline at end of file diff --git a/website/docs/blueprints/data-analytics/_memory_optimized_nodepool.md b/website/docs/blueprints/data-analytics/_memory_optimized_nodepool.md new file mode 100644 index 000000000..4117c7bbe --- /dev/null +++ b/website/docs/blueprints/data-analytics/_memory_optimized_nodepool.md @@ -0,0 +1,70 @@ +```yaml + name: spark-memory-optimized + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + tags: + Name: 
"${module.eks.cluster_name}-private*" + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + cat <<-EOF > /etc/profile.d/bootstrap.sh + #!/bin/sh + + + # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. + # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 + # This will create a RAID volume and mount it at /mnt/k8s-disks/0 + # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods + # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes + export LOCAL_DISKS='raid0' + EOF + + # Source extra environment variables in bootstrap script + sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh + + --BOUNDARY-- + + nodePool: + labels: + - type: karpenter + - NodeGroupType: SparkComputeOptimized + - multiArch: Spark + requirements: + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["r"] + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["r5d"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "32"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "karpenter.k8s.aws/instance-generation" + operator: Gt + values: ["2"] + limits: + cpu: 1000 + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s + expireAfter: 720h + weight: 100 +``` \ No newline at end of file diff --git a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md index 3339c4af2..652f4f288 100644 --- a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md +++ b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md @@ -6,6 +6,9 @@ import Tabs from '@theme/Tabs'; import TabItem from '@theme/TabItem'; import CollapsibleContent from '../../../src/components/CollapsibleContent'; +import GravitonNodepool from './_graviton_nodepool.md' +import MemoryOptimizedNodepool from './_memory_optimized_nodepool.md' + import CodeBlock from '@theme/CodeBlock'; # Spark Operator with YuniKorn @@ -29,76 +32,8 @@ In this tutorial, you will use Karpenter Nodepools that uses memory optimized in
To view Karpenter Nodepool for memory optimized instances, Click to toggle content! -```yaml - name: spark-memory-optimized - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} - subnetSelectorTerms: - tags: - Name: "${module.eks.cluster_name}-private*" - securityGroupSelectorTerms: - tags: - Name: ${module.eks.cluster_name}-node - userData: | - MIME-Version: 1.0 - Content-Type: multipart/mixed; boundary="BOUNDARY" - - --BOUNDARY - Content-Type: text/x-shellscript; charset="us-ascii" - - cat <<-EOF > /etc/profile.d/bootstrap.sh - #!/bin/sh - - - # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. - # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 - # This will create a RAID volume and mount it at /mnt/k8s-disks/0 - # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods - # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes - export LOCAL_DISKS='raid0' - EOF - - # Source extra environment variables in bootstrap script - sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh - - --BOUNDARY-- - - nodePool: - labels: - - type: karpenter - - NodeGroupType: SparkComputeOptimized - - multiArch: Spark - requirements: - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] - - key: "karpenter.k8s.aws/instance-category" - operator: In - values: ["r"] - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["r5d"] - - key: "karpenter.k8s.aws/instance-cpu" - operator: In - values: ["4", "8", "16", "32"] - - key: "karpenter.k8s.aws/instance-hypervisor" - operator: In - values: ["nitro"] - - key: "karpenter.k8s.aws/instance-generation" - operator: Gt - values: ["2"] - limits: - cpu: 1000 - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 30s - expireAfter: 720h - weight: 100 -``` + +
@@ -111,77 +46,7 @@ In this yaml, you will use Karpenter Nodepool that uses Graviton memory optimize
To view Karpenter Nodepool for Graviton memory optimized instances, Click to toggle content! -```yaml - name: spark-graviton-memory-optimized - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} - subnetSelectorTerms: - tags: - Name: "${module.eks.cluster_name}-private*" - securityGroupSelectorTerms: - tags: - Name: ${module.eks.cluster_name}-node - userData: | - MIME-Version: 1.0 - Content-Type: multipart/mixed; boundary="BOUNDARY" - - --BOUNDARY - Content-Type: text/x-shellscript; charset="us-ascii" - - cat <<-EOF > /etc/profile.d/bootstrap.sh - #!/bin/sh - - - # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. - # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 - # This will create a RAID volume and mount it at /mnt/k8s-disks/0 - # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods - # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes - export LOCAL_DISKS='raid0' - EOF - - # Source extra environment variables in bootstrap script - sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh - - --BOUNDARY-- - - - nodePool: - labels: - - type: karpenter - - NodeGroupType: SparkGravitonMemoryOptimized - - multiArch: Spark - requirements: - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - - key: "kubernetes.io/arch" - operator: In - values: ["arm64"] - - key: "karpenter.k8s.aws/instance-category" - operator: In - values: ["r"] - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["r6gd"] - - key: "karpenter.k8s.aws/instance-cpu" - operator: In - values: ["4", "8", "16", "32"] - - key: "karpenter.k8s.aws/instance-hypervisor" - operator: In - values: ["nitro"] - - key: "karpenter.k8s.aws/instance-generation" - operator: Gt - values: ["2"] - limits: - cpu: 1000 - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 30s - expireAfter: 720h - weight: 50 -``` +
From 2519999921ddcd9421292e7d6a1968df7822abae Mon Sep 17 00:00:00 2001 From: Ray Krueger Date: Tue, 2 Apr 2024 16:26:15 -0500 Subject: [PATCH 10/11] Move repetative paragraphs to partial includes --- .../_compute_optimized_nodepool.md | 141 +++++++++++ .../_replace_s3_bucket_placeholders.mdx | 6 + .../data-analytics/_taxi_trip_exec.md | 9 + .../data-analytics/spark-operator-yunikorn.md | 226 +++--------------- 4 files changed, 184 insertions(+), 198 deletions(-) create mode 100644 website/docs/blueprints/data-analytics/_compute_optimized_nodepool.md create mode 100644 website/docs/blueprints/data-analytics/_replace_s3_bucket_placeholders.mdx create mode 100644 website/docs/blueprints/data-analytics/_taxi_trip_exec.md diff --git a/website/docs/blueprints/data-analytics/_compute_optimized_nodepool.md b/website/docs/blueprints/data-analytics/_compute_optimized_nodepool.md new file mode 100644 index 000000000..385683085 --- /dev/null +++ b/website/docs/blueprints/data-analytics/_compute_optimized_nodepool.md @@ -0,0 +1,141 @@ +```yaml + # spark-compute-optimized + name: spark-compute-optimized + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + tags: + Name: "${module.eks.cluster_name}-private*" + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + userData: | + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + cat <<-EOF > /etc/profile.d/bootstrap.sh + #!/bin/sh + + + # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. + # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 + # This will create a RAID volume and mount it at /mnt/k8s-disks/0 + # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods + # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes + export LOCAL_DISKS='raid0' + EOF + + # Source extra environment variables in bootstrap script + sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh + + --BOUNDARY-- + + nodePool: + labels: + - type: karpenter + - NodeGroupType: SparkComputeOptimized + - multiArch: Spark + requirements: + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + - key: "kubernetes.io/arch" + operator: In + values: ["amd64"] + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["c"] + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["c5d"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "36"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "karpenter.k8s.aws/instance-generation" + operator: Gt + values: ["2"] + limits: + cpu: 20 # Change this to 1000 or more for production according to your needs + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s + expireAfter: 720h + weight: 100 + + # spark-graviton-memory-optimized Nodepool + + name: spark-graviton-memory-optimized + clusterName: ${module.eks.cluster_name} + ec2NodeClass: + karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} + subnetSelectorTerms: + tags: + Name: "${module.eks.cluster_name}-private*" + securityGroupSelectorTerms: + tags: + Name: ${module.eks.cluster_name}-node + userData: | + 
MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/x-shellscript; charset="us-ascii" + + cat <<-EOF > /etc/profile.d/bootstrap.sh + #!/bin/sh + + + # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. + # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 + # This will create a RAID volume and mount it at /mnt/k8s-disks/0 + # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods + # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes + export LOCAL_DISKS='raid0' + EOF + + # Source extra environment variables in bootstrap script + sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh + + --BOUNDARY-- + nodePool: + labels: + - type: karpenter + - NodeGroupType: SparkGravitonMemoryOptimized + - multiArch: Spark + requirements: + - key: "karpenter.sh/capacity-type" + operator: In + values: ["spot", "on-demand"] + - key: "kubernetes.io/arch" + operator: In + values: ["arm64"] + - key: "karpenter.k8s.aws/instance-category" + operator: In + values: ["r"] + - key: "karpenter.k8s.aws/instance-family" + operator: In + values: ["r6gd"] + - key: "karpenter.k8s.aws/instance-cpu" + operator: In + values: ["4", "8", "16", "32"] + - key: "karpenter.k8s.aws/instance-hypervisor" + operator: In + values: ["nitro"] + - key: "karpenter.k8s.aws/instance-generation" + operator: Gt + values: ["2"] + limits: + cpu: 1000 + disruption: + consolidationPolicy: WhenEmpty + consolidateAfter: 30s + expireAfter: 720h + weight: 50 +``` \ No newline at end of file diff --git a/website/docs/blueprints/data-analytics/_replace_s3_bucket_placeholders.mdx b/website/docs/blueprints/data-analytics/_replace_s3_bucket_placeholders.mdx new file mode 100644 index 000000000..e7888bc98 --- /dev/null +++ b/website/docs/blueprints/data-analytics/_replace_s3_bucket_placeholders.mdx @@ -0,0 +1,6 @@ +Once our sample data is uploaded you can run the Spark job. You will need to +replace the *\* placeholders in this file with the name of the bucket +created earlier. You can get that value by running `echo $S3_BUCKET`. + +To do this automatically you can run the following, which will create a .old +backup file and do the replacement for you. \ No newline at end of file diff --git a/website/docs/blueprints/data-analytics/_taxi_trip_exec.md b/website/docs/blueprints/data-analytics/_taxi_trip_exec.md new file mode 100644 index 000000000..ebbdfcc31 --- /dev/null +++ b/website/docs/blueprints/data-analytics/_taxi_trip_exec.md @@ -0,0 +1,9 @@ +Run the *taxi-trip-execute.sh* script with the following input. You will use the *S3_BUCKET* variable created earlier. Additionally, you must change YOUR_REGION_HERE with the region of your choice, *us-west-2* for example. + +This script will download some example taxi trip data and create duplicates of +it in order to increase the size a bit. This will take a bit of time and will +require a relatively fast internet connection. 
+ +```bash +./taxi-trip-execute.sh ${S3_BUCKET} YOUR_REGION_HERE +``` \ No newline at end of file diff --git a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md index 652f4f288..a0cfef8b9 100644 --- a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md +++ b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md @@ -8,6 +8,9 @@ import CollapsibleContent from '../../../src/components/CollapsibleContent'; import GravitonNodepool from './_graviton_nodepool.md' import MemoryOptimizedNodepool from './_memory_optimized_nodepool.md' +import ComputeOptimizedNodepool from './_compute_optimized_nodepool.md' +import TaxiTripExecute from './_taxi_trip_exec.md' +import ReplaceS3BucketPlaceholders from './_replace_s3_bucket_placeholders.mdx'; import CodeBlock from '@theme/CodeBlock'; @@ -177,23 +180,10 @@ Example PySpark job that uses NVMe based ephemeral SSD disk for Driver and Execu cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage/ ``` -Run the *taxi-trip-execute.sh* script with the following input. You will use the S3_BUCKET variable created earlier. Additionally, you must change YOUR_REGION_HERE with the region of your choice, *us-west-2* for example. - -This script will download some example taxi trip data and create duplicates of -it in order to increase the size a bit. This will take a bit of time and will -require a relatively fast internet connection. - -```bash -./taxi-trip-execute.sh ${S3_BUCKET} YOUR_REGION_HERE -``` - -Once our sample data is uploaded you can run the Spark job. You will need to -replace the *\* placeholders in this file with the name of the bucket -created earlier. You can get that value by running `echo $S3_BUCKET`. - -To do this automatically you can run the following, which will create a .old -backup file and do the replacement for you. + + + ```bash sed -i.old s/\/${S3_BUCKET}/g ./nvme-ephemeral-storage.yaml ``` @@ -211,23 +201,10 @@ Example PySpark job that uses EBS ON_DEMAND volumes using Dynamic PVCs for Drive cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/ebs-storage-dynamic-pvc ``` -Run the *taxi-trip-execute.sh* script with the following input. You will use the S3_BUCKET variable created earlier. Additionally, you must change YOUR_REGION_HERE with the region of your choice, *us-west-2* for example. - -This script will download some example taxi trip data and create duplicates of -it in order to increase the size a bit. This will take a bit of time and will -require a relatively fast internet connection. - -```bash -./taxi-trip-execute.sh ${S3_BUCKET} YOUR_REGION_HERE -``` - -Once our sample data is uploaded you can run the Spark job. You will need to -replace the *\* placeholders in this file with the name of the bucket -created earlier. You can get that value by running `echo $S3_BUCKET`. - -To do this automatically you can run the following, which will create a .old -backup file and do the replacement for you. + + + ```bash sed -i.old s/\/${S3_BUCKET}/g ./ebs-storage-dynamic-pvc.yaml ``` @@ -246,23 +223,10 @@ Gang Scheduling Spark jobs using Apache YuniKorn and Spark Operator cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-yunikorn-gang-scheduling/ ``` -Run the *taxi-trip-execute.sh* script with the following input. You will use the S3_BUCKET variable created earlier. 
Additionally, you must change YOUR_REGION_HERE with the region of your choice, *us-west-2* for example. - -This script will download some example taxi trip data and create duplicates of -it in order to increase the size a bit. This will take a bit of time and will -require a relatively fast internet connection. - -```bash -./taxi-trip-execute.sh ${S3_BUCKET} YOUR_REGION_HERE -``` - -Once our sample data is uploaded you can run the Spark job. You will need to -replace the *\* placeholders in this file with the name of the bucket -created earlier. You can get that value by running `echo $S3_BUCKET`. - -To do this automatically you can run the following, which will create a .old -backup file and do the replacement for you. + + + ```bash sed -i.old s/\/${S3_BUCKET}/g ./nvme-storage-yunikorn-gang-scheduling.yaml ``` @@ -299,13 +263,15 @@ Example PySpark job that uses NVMe based ephemeral SSD disk for Driver and Execu cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-ephemeral-storage ``` -Update the variables in Shell script and execute + + + ```bash -./taxi-trip-execute.sh +sed -i.old s/\/${S3_BUCKET}/g ./nvme-ephemeral-storage.yaml ``` -Update YAML file and run the below command +Now that the bucket name is in place you can create the Spark job. ```bash kubectl apply -f nvme-ephemeral-storage.yaml @@ -318,13 +284,15 @@ Example PySpark job that uses EBS ON_DEMAND volumes using Dynamic PVCs for Drive cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/ebs-storage-dynamic-pvc ``` -Update the variables in Shell script and execute + + + ```bash -./taxi-trip-execute.sh +sed -i.old s/\/${S3_BUCKET}/g ./ebs-storage-dynamic-pvc.yaml ``` -Update YAML file and run the below command +Now that the bucket name is in place you can create the Spark job. ```bash kubectl apply -f ebs-storage-dynamic-pvc.yaml @@ -337,13 +305,15 @@ Gang Scheduling Spark jobs using Apache YuniKorn and Spark Operator cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/cluster-autoscaler/nvme-yunikorn-gang-scheduling ``` -Update the variables in Shell script and execute + + + ```bash -./taxi-trip-execute.sh +sed -i.old s/\/${S3_BUCKET}/g ./nvme-storage-yunikorn-gang-scheduling.yaml ``` -Update YAML file and run the below command +Now that the bucket name is in place you can create the Spark job. ```bash kubectl apply -f nvme-storage-yunikorn-gang-scheduling.yaml @@ -397,147 +367,7 @@ Graviton Nodepool (ARM): Set the weight of the Graviton Nodepool to `100`. This Intel Nodepool (AMD): Set the weight of the Intel Nodepool to `50`. This ensures that Karpenter will fall back to the Intel Nodepool when Graviton instances are either unavailable or reach their maximum CPU capacity. -```yaml - # spark-compute-optimized - name: spark-compute-optimized - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} - subnetSelectorTerms: - tags: - Name: "${module.eks.cluster_name}-private*" - securityGroupSelectorTerms: - tags: - Name: ${module.eks.cluster_name}-node - userData: | - MIME-Version: 1.0 - Content-Type: multipart/mixed; boundary="BOUNDARY" - - --BOUNDARY - Content-Type: text/x-shellscript; charset="us-ascii" - - cat <<-EOF > /etc/profile.d/bootstrap.sh - #!/bin/sh - - - # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. 
- # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 - # This will create a RAID volume and mount it at /mnt/k8s-disks/0 - # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods - # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes - export LOCAL_DISKS='raid0' - EOF - - # Source extra environment variables in bootstrap script - sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh - - --BOUNDARY-- - - nodePool: - labels: - - type: karpenter - - NodeGroupType: SparkComputeOptimized - - multiArch: Spark - requirements: - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - - key: "kubernetes.io/arch" - operator: In - values: ["amd64"] - - key: "karpenter.k8s.aws/instance-category" - operator: In - values: ["c"] - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["c5d"] - - key: "karpenter.k8s.aws/instance-cpu" - operator: In - values: ["4", "8", "16", "36"] - - key: "karpenter.k8s.aws/instance-hypervisor" - operator: In - values: ["nitro"] - - key: "karpenter.k8s.aws/instance-generation" - operator: Gt - values: ["2"] - limits: - cpu: 20 # Change this to 1000 or more for production according to your needs - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 30s - expireAfter: 720h - weight: 100 - - # spark-graviton-memory-optimized Nodepool - - name: spark-graviton-memory-optimized - clusterName: ${module.eks.cluster_name} - ec2NodeClass: - karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]} - subnetSelectorTerms: - tags: - Name: "${module.eks.cluster_name}-private*" - securityGroupSelectorTerms: - tags: - Name: ${module.eks.cluster_name}-node - userData: | - MIME-Version: 1.0 - Content-Type: multipart/mixed; boundary="BOUNDARY" - - --BOUNDARY - Content-Type: text/x-shellscript; charset="us-ascii" - - cat <<-EOF > /etc/profile.d/bootstrap.sh - #!/bin/sh - - - # Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call. - # https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35 - # This will create a RAID volume and mount it at /mnt/k8s-disks/0 - # then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods - # this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes - export LOCAL_DISKS='raid0' - EOF - - # Source extra environment variables in bootstrap script - sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh - - --BOUNDARY-- - nodePool: - labels: - - type: karpenter - - NodeGroupType: SparkGravitonMemoryOptimized - - multiArch: Spark - requirements: - - key: "karpenter.sh/capacity-type" - operator: In - values: ["spot", "on-demand"] - - key: "kubernetes.io/arch" - operator: In - values: ["arm64"] - - key: "karpenter.k8s.aws/instance-category" - operator: In - values: ["r"] - - key: "karpenter.k8s.aws/instance-family" - operator: In - values: ["r6gd"] - - key: "karpenter.k8s.aws/instance-cpu" - operator: In - values: ["4", "8", "16", "32"] - - key: "karpenter.k8s.aws/instance-hypervisor" - operator: In - values: ["nitro"] - - key: "karpenter.k8s.aws/instance-generation" - operator: Gt - values: ["2"] - limits: - cpu: 1000 - disruption: - consolidationPolicy: WhenEmpty - consolidateAfter: 30s - expireAfter: 720h - weight: 50 -``` +
From 912d47f5fd3881eba652a9a24dc24e0c6f318a82 Mon Sep 17 00:00:00 2001 From: Ray Krueger Date: Tue, 2 Apr 2024 17:22:15 -0500 Subject: [PATCH 11/11] S3_BUCKET not S3_HOME --- .../docs/blueprints/data-analytics/spark-operator-yunikorn.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md index a0cfef8b9..1ee3076da 100644 --- a/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md +++ b/website/docs/blueprints/data-analytics/spark-operator-yunikorn.md @@ -334,7 +334,7 @@ else fi ``` -If S3_HOME is set we can proceed into our example. +If *S3_BUCKET* is set we can proceed into our example. ```bash cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/benchmark
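
For reference, a condensed sketch of the workflow these docs describe, from clone through a first example job. This is a hedged sketch under stated assumptions, not part of any patch above: the region (`us-west-2`), the choice of the `nvme-ephemeral-storage` Karpenter example, and the `<S3_BUCKET>` placeholder token are assumptions — the angle-bracket token is stripped in this copy, but the commit messages ("Rename ENTER_S3_BUCKET to S3_BUCKET") indicate the YAML files use `<S3_BUCKET>` as the placeholder.

```bash
# Clone the repo and record its location so later commands can be copy/pasted.
git clone https://github.com/awslabs/data-on-eks.git
cd data-on-eks
export DOEKS_HOME=$(pwd)

# Provision the blueprint.
cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator
chmod +x install.sh && ./install.sh

# Capture the S3 bucket created by the blueprint (Terraform output name taken from the docs above).
export S3_BUCKET=$(terraform output -raw s3_bucket_id_spark_history_server)
echo "${S3_BUCKET}"

# Stage the sample taxi-trip data and run one example job (NVMe ephemeral storage, Karpenter).
cd ${DOEKS_HOME}/analytics/terraform/spark-k8s-operator/examples/karpenter/nvme-ephemeral-storage
./taxi-trip-execute.sh "${S3_BUCKET}" us-west-2   # us-west-2 is an assumed region; substitute your own

# Replace the <S3_BUCKET> placeholder (assumed token, see lead-in) and submit the job.
sed -i.old "s|<S3_BUCKET>|${S3_BUCKET}|g" nvme-ephemeral-storage.yaml
kubectl apply -f nvme-ephemeral-storage.yaml

# Watch the driver and executor pods come up.
kubectl get pods -n spark-team-a -w
```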