docs: Spark operator updates (awslabs#484)
askulkarni2 authored Apr 3, 2024
2 parents 4c3c6de + 912d47f commit d0ae18b
Showing 10 changed files with 394 additions and 334 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,6 +1,7 @@
.DS_Store
.idea
.build
*.old

# ignore the lock file
website/package-lock.json
@@ -1,6 +1,6 @@
# NOTE: This example requires the following prerequisites before executing the jobs
# 1. Ensure the spark-team-a namespace exists
# 2. replace <REPLACE-WITH-YOUR-S3BUCKET-NAME> with your bucket name
# 2. replace <S3_BUCKET> with your bucket name
# 3. Ensure you have run "analytics/spark-k8s-operator/spark-samples/tpcds-benchmark-data-generation-1t.yaml", which generates 3 TB of input data

---
@@ -26,7 +26,7 @@ spec:
# TPC-DS data location
- "s3://blogpost-sparkoneks-us-east-1/blog/BLOG_TPCDS-TEST-3T-partitioned"
# results location
- "s3://<REPLACE-WITH-YOUR-S3BUCKET-NAME>/TPCDS-TEST-3T-RESULT"
- "s3://<S3_BUCKET>/TPCDS-TEST-3T-RESULT"
# Path to kit in the docker image
- "/opt/tpcds-kit/tools"
# Data Format
@@ -1,6 +1,6 @@
# NOTE: This example requires the following prerequisites before executing the jobs
# 1. Ensure the spark-team-a namespace exists
# 2. replace <REPLACE-WITH-YOUR-S3BUCKET-NAME> with your bucket name
# 2. replace <S3_BUCKET> with your bucket name

---
apiVersion: "sparkoperator.k8s.io/v1beta2"
@@ -23,7 +23,7 @@ spec:
mainApplicationFile: local:///opt/spark/examples/jars/eks-spark-benchmark-assembly-1.0.jar
arguments:
# TPC-DS data location
- "s3a://<REPLACE-WITH-YOUR-S3BUCKET-NAME>/TPCDS-TEST-3T"
- "s3a://<S3_BUCKET>/TPCDS-TEST-3T"
# Path to kit in the docker image
- "/opt/tpcds-kit/tools"
# Data Format
@@ -1,7 +1,5 @@
# Pre-requisite before running this job
# 1/ Open taxi-trip-execute.sh and update <ENTER_S3_BUCKET> and <REGION>
# 2/ Replace <ENTER_S3_BUCKET> with your S3 bucket created by this blueprint(Check Terraform outputs)
# 3/ execute taxi-trip-execute.sh
# Replace <S3_BUCKET> with your S3 bucket created by this blueprint (check Terraform outputs)

---
apiVersion: "sparkoperator.k8s.io/v1beta2"
@@ -30,10 +28,10 @@ spec:
mode: cluster
image: public.ecr.aws/data-on-eks/spark3.3.1-hadoop3.2-aws-java-sdk-bundle-1.12.647
imagePullPolicy: IfNotPresent
mainApplicationFile: "s3a://<ENTER_S3_BUCKET>/taxi-trip/scripts/pyspark-taxi-trip.py" # MainFile is the path to a bundled JAR, Python, or R file of the application
mainApplicationFile: "s3a://<S3_BUCKET>/taxi-trip/scripts/pyspark-taxi-trip.py" # MainFile is the path to a bundled JAR, Python, or R file of the application
arguments:
- "s3a://<ENTER_S3_BUCKET>/taxi-trip/input/"
- "s3a://<ENTER_S3_BUCKET>/taxi-trip/output/"
- "s3a://<S3_BUCKET>/taxi-trip/input/"
- "s3a://<S3_BUCKET>/taxi-trip/output/"
hadoopConf:
"fs.s3a.aws.credentials.provider": "com.amazonaws.auth.WebIdentityTokenCredentialsProvider"
"fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem"
@@ -55,7 +53,7 @@

# Spark Event logs
"spark.eventLog.enabled": "true"
"spark.eventLog.dir": "s3a://<ENTER_S3_BUCKET>/spark-event-logs"
"spark.eventLog.dir": "s3a://<S3_BUCKET>/spark-event-logs"
"spark.eventLog.rolling.enabled": "true"
"spark.eventLog.rolling.maxFileSize": "64m"
# "spark.history.fs.eventLog.rolling.maxFilesToRetain": 100
141 changes: 141 additions & 0 deletions website/docs/blueprints/data-analytics/_compute_optimized_nodepool.md
@@ -0,0 +1,141 @@
```yaml
# spark-compute-optimized
name: spark-compute-optimized
clusterName: ${module.eks.cluster_name}
ec2NodeClass:
karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
subnetSelectorTerms:
tags:
Name: "${module.eks.cluster_name}-private*"
securityGroupSelectorTerms:
tags:
Name: ${module.eks.cluster_name}-node
userData: |
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="BOUNDARY"
--BOUNDARY
Content-Type: text/x-shellscript; charset="us-ascii"
cat <<-EOF > /etc/profile.d/bootstrap.sh
#!/bin/sh
# Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call.
# https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35
# This will create a RAID volume and mount it at /mnt/k8s-disks/0
# then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods
# this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes
export LOCAL_DISKS='raid0'
EOF
# Source extra environment variables in bootstrap script
sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh
--BOUNDARY--
nodePool:
labels:
- type: karpenter
- NodeGroupType: SparkComputeOptimized
- multiArch: Spark
requirements:
- key: "karpenter.sh/capacity-type"
operator: In
values: ["spot", "on-demand"]
- key: "kubernetes.io/arch"
operator: In
values: ["amd64"]
- key: "karpenter.k8s.aws/instance-category"
operator: In
values: ["c"]
- key: "karpenter.k8s.aws/instance-family"
operator: In
values: ["c5d"]
- key: "karpenter.k8s.aws/instance-cpu"
operator: In
values: ["4", "8", "16", "36"]
- key: "karpenter.k8s.aws/instance-hypervisor"
operator: In
values: ["nitro"]
- key: "karpenter.k8s.aws/instance-generation"
operator: Gt
values: ["2"]
limits:
cpu: 20 # Change this to 1000 or more for production according to your needs
disruption:
consolidationPolicy: WhenEmpty
consolidateAfter: 30s
expireAfter: 720h
weight: 100

# spark-graviton-memory-optimized Nodepool

name: spark-graviton-memory-optimized
clusterName: ${module.eks.cluster_name}
ec2NodeClass:
karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
subnetSelectorTerms:
tags:
Name: "${module.eks.cluster_name}-private*"
securityGroupSelectorTerms:
tags:
Name: ${module.eks.cluster_name}-node
userData: |
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="BOUNDARY"
--BOUNDARY
Content-Type: text/x-shellscript; charset="us-ascii"
cat <<-EOF > /etc/profile.d/bootstrap.sh
#!/bin/sh
# Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call.
# https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35
# This will create a RAID volume and mount it at /mnt/k8s-disks/0
# then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods
# this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes
export LOCAL_DISKS='raid0'
EOF
# Source extra environment variables in bootstrap script
sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh
--BOUNDARY--
nodePool:
labels:
- type: karpenter
- NodeGroupType: SparkGravitonMemoryOptimized
- multiArch: Spark
requirements:
- key: "karpenter.sh/capacity-type"
operator: In
values: ["spot", "on-demand"]
- key: "kubernetes.io/arch"
operator: In
values: ["arm64"]
- key: "karpenter.k8s.aws/instance-category"
operator: In
values: ["r"]
- key: "karpenter.k8s.aws/instance-family"
operator: In
values: ["r6gd"]
- key: "karpenter.k8s.aws/instance-cpu"
operator: In
values: ["4", "8", "16", "32"]
- key: "karpenter.k8s.aws/instance-hypervisor"
operator: In
values: ["nitro"]
- key: "karpenter.k8s.aws/instance-generation"
operator: Gt
values: ["2"]
limits:
cpu: 1000
disruption:
consolidationPolicy: WhenEmpty
consolidateAfter: 30s
expireAfter: 720h
weight: 50
```
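For reference, once the blueprint that consumes these values has been applied, you can confirm that Karpenter registered the resources with a couple of `kubectl` queries. This is a quick sanity-check sketch, assuming the cluster uses the Karpenter NodePool/EC2NodeClass APIs shown above; the resource names come from the values in this file.

```bash
# List the Karpenter NodePools and EC2NodeClasses known to the cluster
kubectl get nodepools.karpenter.sh
kubectl get ec2nodeclasses.karpenter.k8s.aws

# Inspect the compute-optimized pool defined above
# (requirements, CPU limits, and disruption policy)
kubectl describe nodepools.karpenter.sh spark-compute-optimized
```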
71 changes: 71 additions & 0 deletions website/docs/blueprints/data-analytics/_graviton_nodepool.md
@@ -0,0 +1,71 @@
```yaml
name: spark-graviton-memory-optimized
clusterName: ${module.eks.cluster_name}
ec2NodeClass:
karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
subnetSelectorTerms:
tags:
Name: "${module.eks.cluster_name}-private*"
securityGroupSelectorTerms:
tags:
Name: ${module.eks.cluster_name}-node
userData: |
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="BOUNDARY"
--BOUNDARY
Content-Type: text/x-shellscript; charset="us-ascii"
cat <<-EOF > /etc/profile.d/bootstrap.sh
#!/bin/sh
# Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call.
# https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35
# This will create a RAID volume and mount it at /mnt/k8s-disks/0
# then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods
# this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes
export LOCAL_DISKS='raid0'
EOF
# Source extra environment variables in bootstrap script
sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh
--BOUNDARY--
nodePool:
labels:
- type: karpenter
- NodeGroupType: SparkGravitonMemoryOptimized
- multiArch: Spark
requirements:
- key: "karpenter.sh/capacity-type"
operator: In
values: ["spot", "on-demand"]
- key: "kubernetes.io/arch"
operator: In
values: ["arm64"]
- key: "karpenter.k8s.aws/instance-category"
operator: In
values: ["r"]
- key: "karpenter.k8s.aws/instance-family"
operator: In
values: ["r6gd"]
- key: "karpenter.k8s.aws/instance-cpu"
operator: In
values: ["4", "8", "16", "32"]
- key: "karpenter.k8s.aws/instance-hypervisor"
operator: In
values: ["nitro"]
- key: "karpenter.k8s.aws/instance-generation"
operator: Gt
values: ["2"]
limits:
cpu: 1000
disruption:
consolidationPolicy: WhenEmpty
consolidateAfter: 30s
expireAfter: 720h
weight: 50
```
@@ -0,0 +1,70 @@
```yaml
name: spark-memory-optimized
clusterName: ${module.eks.cluster_name}
ec2NodeClass:
karpenterRole: ${split("/", module.eks_blueprints_addons.karpenter.node_iam_role_arn)[1]}
subnetSelectorTerms:
tags:
Name: "${module.eks.cluster_name}-private*"
securityGroupSelectorTerms:
tags:
Name: ${module.eks.cluster_name}-node
userData: |
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="BOUNDARY"
--BOUNDARY
Content-Type: text/x-shellscript; charset="us-ascii"
cat <<-EOF > /etc/profile.d/bootstrap.sh
#!/bin/sh
# Configure the NVMe volumes in RAID0 configuration in the bootstrap.sh call.
# https://github.com/awslabs/amazon-eks-ami/blob/master/files/bootstrap.sh#L35
# This will create a RAID volume and mount it at /mnt/k8s-disks/0
# then mount that volume to /var/lib/kubelet, /var/lib/containerd, and /var/log/pods
# this allows the container daemons and pods to write to the RAID0 by default without needing PersistentVolumes
export LOCAL_DISKS='raid0'
EOF
# Source extra environment variables in bootstrap script
sed -i '/^set -o errexit/a\\nsource /etc/profile.d/bootstrap.sh' /etc/eks/bootstrap.sh
--BOUNDARY--
nodePool:
labels:
- type: karpenter
- NodeGroupType: SparkMemoryOptimized
- multiArch: Spark
requirements:
- key: "karpenter.sh/capacity-type"
operator: In
values: ["spot", "on-demand"]
- key: "kubernetes.io/arch"
operator: In
values: ["amd64"]
- key: "karpenter.k8s.aws/instance-category"
operator: In
values: ["r"]
- key: "karpenter.k8s.aws/instance-family"
operator: In
values: ["r5d"]
- key: "karpenter.k8s.aws/instance-cpu"
operator: In
values: ["4", "8", "16", "32"]
- key: "karpenter.k8s.aws/instance-hypervisor"
operator: In
values: ["nitro"]
- key: "karpenter.k8s.aws/instance-generation"
operator: Gt
values: ["2"]
limits:
cpu: 1000
disruption:
consolidationPolicy: WhenEmpty
consolidateAfter: 30s
expireAfter: 720h
weight: 100
```
@@ -0,0 +1,6 @@
Once the sample data is uploaded, you can run the Spark job. You will need to
replace the *\<S3_BUCKET\>* placeholders in this file with the name of the bucket
created earlier. You can get that value by running `echo $S3_BUCKET`.

To do this automatically, you can run the following command, which creates a .old
backup file and performs the replacement for you.
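The replacement command itself is not part of this hunk; as a minimal sketch, a `sed` one-liner along these lines would do it, assuming a GNU or BSD `sed` and a manifest named `taxi-trip.yaml` (the file name here is illustrative, not taken from this commit):

```bash
# Substitute the bucket placeholder in place, keeping a .old backup of the original file.
# The | delimiter avoids clashes with any / characters in the substituted value.
sed -i.old "s|<S3_BUCKET>|${S3_BUCKET}|g" taxi-trip.yaml
```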
9 changes: 9 additions & 0 deletions website/docs/blueprints/data-analytics/_taxi_trip_exec.md
@@ -0,0 +1,9 @@
Run the *taxi-trip-execute.sh* script with the following input. You will use the *S3_BUCKET* variable created earlier. Additionally, you must replace YOUR_REGION_HERE with the region of your choice, for example *us-west-2*.

This script downloads sample taxi trip data and creates duplicates of it to
increase the dataset size. This will take some time and requires a relatively
fast internet connection.

```bash
./taxi-trip-execute.sh ${S3_BUCKET} YOUR_REGION_HERE
```