Merge pull request #196 from treeverse/databricks-ci-cd-demo-v1.0
Added Databricks CI/CD demo
Showing 7 changed files with 523 additions and 0 deletions.

101 changes: 101 additions & 0 deletions
...andalone_examples/databricks-ci-cd/.github/workflows/pr_commit_run_databricks_etl_job.yml
```yaml
name: Run Databricks ETL jobs in an isolated environment by using lakeFS

on:
  pull_request:

env:
  DATABRICKS_HOST: ${{ vars.DATABRICKS_HOST }}
  DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }}

jobs:
  run-etl-jobs-in-isolated-environment:
    runs-on: ubuntu-latest

    steps:
      - name: Check out the repo
        uses: actions/checkout@v4
      - name: Set additional environment variables
        run: |
          echo "WORKSPACE_NOTEBOOK_PATH=${{ vars.DATABRICKS_WORKSPACE_NOTEBOOK_PATH }}/pr-${{ github.event.number }}-${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV
          echo "LOCAL_NOTEBOOK_PATH=/home/runner/work/${{ github.event.pull_request.head.repo.name }}/${{ github.event.pull_request.head.repo.name }}/databricks-notebooks" >> $GITHUB_ENV
          echo "LAKFES_BRANCH_NAME=pr-${{ github.event.number }}-${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV
      - name: Create Databricks Workspace directory and import Databricks notebooks
        run: |
          curl -F path="${{ env.WORKSPACE_NOTEBOOK_PATH }}" \
            ${{ vars.DATABRICKS_HOST }}/api/2.0/workspace/mkdirs --header "Authorization: Bearer ${{ env.DATABRICKS_TOKEN }}"
          cd ${{ env.LOCAL_NOTEBOOK_PATH }}
          for file in *.py
          do
            curl -F path="${{ env.WORKSPACE_NOTEBOOK_PATH }}/$file" \
              -F language=PYTHON -F overwrite=true -F content=@"$file" \
              ${{ vars.DATABRICKS_HOST }}/api/2.0/workspace/import --header "Authorization: Bearer ${{ env.DATABRICKS_TOKEN }}"
          done
      - name: Trigger Databricks job to create sample Delta tables
        uses: databricks/run-notebook@v0
        id: trigger_databricks_notebook_create_sample_delta_tables
        with:
          run-name: "GitHub Action - PR ${{ github.event.number }} - Create Sample Delta Tables"
          workspace-notebook-path: "${{ env.WORKSPACE_NOTEBOOK_PATH }}/Create Sample Delta Tables.py"
          notebook-params-json: >
            {
              "data_source_storage_namespace": "${{ vars.DATA_SOURCE_STORAGE_NAMESPACE }}"
            }
          existing-cluster-id: "${{ vars.DATABRICKS_CLUSTER_ID }}"
      - name: Trigger Databricks job to create lakeFS repo and import data
        uses: databricks/run-notebook@v0
        id: trigger_databricks_notebook_create_lakefs_repo
        with:
          run-name: "GitHub Action - PR ${{ github.event.number }} - Create lakeFS Repo and Import Data"
          workspace-notebook-path: "${{ env.WORKSPACE_NOTEBOOK_PATH }}/Create lakeFS Repo and Import Data.py"
          notebook-params-json: >
            {
              "databricks_secret_scope": "${{ vars.DATABRICKS_SECRET_SCOPE }}",
              "lakefs_end_point": "${{ vars.LAKEFS_END_POINT }}",
              "lakefs_repo": "${{ vars.LAKFES_REPO_NAME }}",
              "lakefs_repo_storage_namespace": "${{ vars.LAKEFS_REPO_STORAGE_NAMESPACE }}",
              "lakefs_branch": "${{ env.LAKFES_BRANCH_NAME }}",
              "data_source_storage_namespace": "${{ vars.DATA_SOURCE_STORAGE_NAMESPACE }}"
            }
          existing-cluster-id: "${{ vars.DATABRICKS_CLUSTER_ID }}"
      - name: Trigger Databricks ETL Job
        uses: databricks/run-notebook@v0
        id: trigger_databricks_notebook_etl_job
        with:
          run-name: "GitHub Action - PR ${{ github.event.number }} - ETL Job"
          workspace-notebook-path: "${{ env.WORKSPACE_NOTEBOOK_PATH }}/ETL Job.py"
          notebook-params-json: >
            {
              "environment": "dev",
              "data_source_storage_namespace": "${{ vars.DATA_SOURCE_STORAGE_NAMESPACE }}",
              "lakefs_repo": "${{ vars.LAKFES_REPO_NAME }}",
              "lakefs_branch": "${{ env.LAKFES_BRANCH_NAME }}"
            }
          existing-cluster-id: "${{ vars.DATABRICKS_CLUSTER_ID }}"
      - name: Trigger Databricks job to run validations
        uses: databricks/run-notebook@v0
        id: trigger_databricks_notebook_run_validations
        with:
          run-name: "GitHub Action - PR ${{ github.event.number }} - Run Validations"
          workspace-notebook-path: "${{ env.WORKSPACE_NOTEBOOK_PATH }}/Run Validations.py"
          notebook-params-json: >
            {
              "databricks_secret_scope": "${{ vars.DATABRICKS_SECRET_SCOPE }}",
              "lakefs_end_point": "${{ vars.LAKEFS_END_POINT }}",
              "lakefs_repo": "${{ vars.LAKFES_REPO_NAME }}",
              "lakefs_branch": "${{ env.LAKFES_BRANCH_NAME }}"
            }
          existing-cluster-id: "${{ vars.DATABRICKS_CLUSTER_ID }}"
        # The run-notebook action exposes the notebook's dbutils.notebook.exit() value
        # as the step output `notebook-output`, which the next step reads.
      - name: Check for failed validations
        run: |
          echo "Validation Output: ${{ steps.trigger_databricks_notebook_run_validations.outputs.notebook-output }}"
          if [[ "${{ steps.trigger_databricks_notebook_run_validations.outputs.notebook-output }}" == "Success" ]]
          then
            echo "## ✅ No validation failures found"
          else
            echo "## 🚨👆🏻👆🏻 Validation checks failed 👆🏻👆🏻🚨"
            exit 1
          fi
```
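
Each pull request run imports the notebooks into a PR-specific Workspace directory (`WORKSPACE_NOTEBOOK_PATH` above), so these directories accumulate over time. If you want to remove the directory after a run, here is a hedged sketch of an extra cleanup command using the same Databricks Workspace API the workflow already calls; it is not part of the committed workflow and could be added as a final `run:` step, e.g. guarded with `if: always()`.

```bash
# Hypothetical cleanup: recursively delete the PR-specific Workspace directory
# created by the "Create Databricks Workspace directory" step above.
curl -X POST "${DATABRICKS_HOST}/api/2.0/workspace/delete" \
  --header "Authorization: Bearer ${DATABRICKS_TOKEN}" \
  --header "Content-Type: application/json" \
  --data "{\"path\": \"${WORKSPACE_NOTEBOOK_PATH}\", \"recursive\": true}"
```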

# lakeFS-samples-ci-cd

Start by ⭐️ starring the [lakeFS open source](https://go.lakefs.io/oreilly-course) project.

**This sample captures a collection of Databricks notebooks and GitHub Actions code that demonstrates how to run Databricks ETL jobs in an isolated environment by using lakeFS.**

## Prerequisites
* lakeFS installed and running on a server or in the cloud. If you don't have lakeFS running yet, either use [lakeFS Cloud](https://demo.lakefs.io/), which provides a free lakeFS server on demand with a single click, or refer to the [lakeFS Quickstart](https://docs.lakefs.io/quickstart/) doc.
* A Databricks workspace with the ability to run compute clusters on top of it.
* A Databricks cluster configured to use the lakeFS Hadoop file system (a minimal configuration sketch follows this list). Read the blog [Databricks and lakeFS Integration: Step-by-Step Configuration Tutorial](https://lakefs.io/blog/databricks-lakefs-integration-tutorial/) or the [lakeFS documentation](https://docs.lakefs.io/integrations/spark.html#lakefs-hadoop-filesystem) for the full configuration.
* Permissions to manage the cluster configuration, including adding libraries.
* GitHub account.
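
For reference, here is a hedged sketch of a cluster pre-configured for the lakeFS Hadoop file system, expressed as a Databricks Clusters API call. The Spark configuration keys are the same ones used in the new-cluster example at the end of this README; the cluster name, node type, endpoint URL, and the **demos** secret scope are placeholder assumptions, and the cluster additionally needs the `io.lakefs:hadoop-lakefs-assembly` library installed.

```bash
# Hedged sketch: create a small cluster with the lakeFS Hadoop file system configured,
# reading the lakeFS credentials from the "demos" Databricks secret scope.
# All values below are placeholders taken from examples elsewhere in this README.
curl -X POST "$DATABRICKS_HOST/api/2.0/clusters/create" \
  --header "Authorization: Bearer $DATABRICKS_TOKEN" \
  --header "Content-Type: application/json" \
  --data '{
    "cluster_name": "lakefs-ci-cd-demo",
    "spark_version": "14.3.x-scala2.12",
    "node_type_id": "m5d.large",
    "num_workers": 1,
    "spark_conf": {
      "spark.hadoop.fs.lakefs.access.mode": "presigned",
      "spark.hadoop.fs.lakefs.impl": "io.lakefs.LakeFSFileSystem",
      "spark.hadoop.fs.lakefs.endpoint": "https://company.region.lakefscloud.io/api/v1",
      "spark.hadoop.fs.lakefs.access.key": "{{secrets/demos/lakefs_access_key_id}}",
      "spark.hadoop.fs.lakefs.secret.key": "{{secrets/demos/lakefs_secret_access_key}}"
    }
  }'
```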

## Setup

1. Create a [Databricks personal access token](https://docs.databricks.com/en/dev-tools/auth/pat.html).
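
   If you prefer a terminal over the UI, a token can also be minted with the Databricks CLI. A hedged sketch (flag names may differ slightly between CLI versions, and the CLI must already be authenticated to your workspace):

```bash
# Hedged sketch: create a personal access token valid for 7 days.
databricks tokens create --comment "lakeFS CI/CD demo" --lifetime-seconds 604800
```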

1. Create a Databricks secret scope, e.g. **demos**, or use an existing secret scope. Add the following secrets to that secret scope by following the [Secret management docs](https://docs.databricks.com/en/security/secrets/index.html):

   lakefs_access_key_id e.g. 'AKIAIOSFOLKFSSAMPLES'

   lakefs_secret_access_key e.g. 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'

   You can use the following Databricks CLI commands to create the secrets:

```bash
databricks secrets put-secret --json '{
  "scope": "demos",
  "key": "lakefs_access_key_id",
  "string_value": "AKIAIOSFOLKFSSAMPLES"
}'

databricks secrets put-secret --json '{
  "scope": "demos",
  "key": "lakefs_secret_access_key",
  "string_value": "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY"
}'
```
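
   To confirm that both secrets landed in the scope, you can list the keys (secret values are never displayed):

```bash
# List the secret keys stored in the "demos" scope.
databricks secrets list-secrets demos
```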

1. Create a Git repository. It can be named **lakeFS-samples-ci-cd**.

1. Clone this repository:

```bash
git clone https://github.com/treeverse/lakeFS-samples && cd lakeFS-samples/01_standalone_examples/databricks-ci-cd
```

1. Create the folders **.github/workflows** and **databricks-notebooks** in your Git repo.

1. Upload the **pr_commit_run_databricks_etl_job.yml** file from the **lakeFS-samples/01_standalone_examples/databricks-ci-cd/.github/workflows** folder to the **.github/workflows** folder in your Git repo.

1. Upload all files from the **lakeFS-samples/01_standalone_examples/databricks-ci-cd/databricks-notebooks** folder to the **databricks-notebooks** folder in your Git repo (these three steps can also be done with plain git, as sketched below).
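
   A hedged git sketch of those three upload steps, assuming your new repo is cloned next to **lakeFS-samples** and its default branch is **main**:

```bash
# Hypothetical sketch: copy the workflow and notebooks from lakeFS-samples into your
# own repo and push them. Assumes both repos are cloned under the same parent folder.
cd ../../../lakeFS-samples-ci-cd
mkdir -p .github/workflows databricks-notebooks
cp ../lakeFS-samples/01_standalone_examples/databricks-ci-cd/.github/workflows/pr_commit_run_databricks_etl_job.yml .github/workflows/
cp ../lakeFS-samples/01_standalone_examples/databricks-ci-cd/databricks-notebooks/*.py databricks-notebooks/
git add .github/workflows databricks-notebooks
git commit -m "Add lakeFS Databricks CI/CD demo workflow and notebooks"
git push origin main
```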

1. Add the following secret to your Git repo by following the [Creating secrets for a repository docs](https://docs.github.com/en/actions/security-guides/using-secrets-in-github-actions#creating-secrets-for-a-repository). This is the Databricks token created in the 1st step above. If you copy & paste the secret name, verify that there are no spaces before or after it.

   DATABRICKS_TOKEN
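
   If you prefer the GitHub CLI over the web UI, the same secret can be set from a terminal (a hedged sketch; replace the repository name with your own and make sure `gh` is authenticated):

```bash
# Store the Databricks token as a repository secret; gh prompts for the value,
# so the token never ends up in your shell history.
gh secret set DATABRICKS_TOKEN --repo your-github-user/lakeFS-samples-ci-cd
```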

1. Add the following variables to your Git repo by following the [Creating configuration variables for a repository docs](https://docs.github.com/en/actions/learn-github-actions/variables#creating-configuration-variables-for-a-repository). They can also be created with the GitHub CLI, as sketched after this list.
   * Variable to store your [Databricks host name or URL](https://docs.databricks.com/en/workspace/workspace-details.html#workspace-instance-names-urls-and-ids), e.g. https://cust-success.cloud.databricks.com

     DATABRICKS_HOST

   * Variable to store your [Databricks Cluster ID](https://docs.databricks.com/en/workspace/workspace-details.html#cluster-url-and-id), e.g. 1115-164516-often242

     DATABRICKS_CLUSTER_ID

   * Variable to store your [Databricks Workspace folder path](https://docs.databricks.com/en/workspace/workspace-details.html#folder-id), e.g. /Shared/lakefs_demos/ci_cd_demo or /Users/<your-email>/MyFolder/lakefs_demos/ci_cd_demo

     DATABRICKS_WORKSPACE_NOTEBOOK_PATH

   * Variable to store the Databricks secret scope created in the 2nd step, e.g. demos

     DATABRICKS_SECRET_SCOPE

   * Variable to store your lakeFS endpoint, e.g. https://company.region.lakefscloud.io

     LAKEFS_END_POINT

   * Variable to store your lakeFS repository name (which will be created by this demo), e.g. databricks-ci-cd-repo

     LAKFES_REPO_NAME

   * Variable to store the storage namespace for the lakeFS repo, i.e. the location in the underlying storage where the data for the lakeFS repository will be stored, e.g. s3://example

     LAKEFS_REPO_STORAGE_NAMESPACE

   * Variable to store the storage namespace where the Delta tables created by this demo will be stored, e.g. s3://data-source/delta-tables. Do NOT use the same storage namespace as above.

     If such a location does not exist yet, create a Databricks [External Location](https://docs.databricks.com/en/sql/language-manual/sql-ref-external-locations.html) that can write to the s3://data-source URL, and make sure you have **READ FILES** and **WRITE FILES** [permissions on the External Location](https://docs.databricks.com/en/connect/unity-catalog/manage-external-locations.html#grant-permissions-on-an-external-location).

     DATA_SOURCE_STORAGE_NAMESPACE
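
   The same variables can also be created from a terminal with the GitHub CLI. A hedged sketch, reusing the example values from the list above (replace every value, and the repository name, with your own):

```bash
# Hypothetical sketch: create the repository variables used by the workflow.
REPO="your-github-user/lakeFS-samples-ci-cd"
gh variable set DATABRICKS_HOST --repo "$REPO" --body "https://cust-success.cloud.databricks.com"
gh variable set DATABRICKS_CLUSTER_ID --repo "$REPO" --body "1115-164516-often242"
gh variable set DATABRICKS_WORKSPACE_NOTEBOOK_PATH --repo "$REPO" --body "/Shared/lakefs_demos/ci_cd_demo"
gh variable set DATABRICKS_SECRET_SCOPE --repo "$REPO" --body "demos"
gh variable set LAKEFS_END_POINT --repo "$REPO" --body "https://company.region.lakefscloud.io"
gh variable set LAKFES_REPO_NAME --repo "$REPO" --body "databricks-ci-cd-repo"
gh variable set LAKEFS_REPO_STORAGE_NAMESPACE --repo "$REPO" --body "s3://example"
gh variable set DATA_SOURCE_STORAGE_NAMESPACE --repo "$REPO" --body "s3://data-source/delta-tables"
```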

## Demo Instructions

1. Create a new branch in your Git repository and select the newly created branch.
1. Uncomment the last 5 lines of code in **ETL Job.py** inside the **databricks-notebooks** folder and commit your changes (the same flow is sketched with git commands after this list).
1. Go to the **Pull requests** tab in your Git repo and create a pull request.
1. Go to the **Actions** tab in your Git repo. The GitHub Action will start running automatically and the validation checks will fail.
1. Go back to the **Code** tab in your Git repo, select the branch created in the 1st step, comment out the last 5 lines of code in **ETL Job.py** again, and commit your changes.
1. Go back to the **Actions** tab in your Git repo. The GitHub Action will start running again and the validation checks will pass this time.
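
The same demo flow from a terminal, as a hedged sketch (branch name and commit messages are arbitrary; the edits to **ETL Job.py** are still made in your editor):

```bash
# Hypothetical sketch of the demo flow using git and the GitHub CLI.
git checkout -b demo-validation-failure
# ...uncomment the last 5 lines of "databricks-notebooks/ETL Job.py" in your editor...
git commit -am "Introduce a change that should fail validation"
git push -u origin demo-validation-failure
gh pr create --fill      # opening the PR triggers the workflow; validations should fail

# ...comment the same 5 lines out again...
git commit -am "Revert the failing change"
git push                 # the workflow runs again; validations should pass
```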

## Useful Information

1. Databricks [Continuous integration and delivery using GitHub Actions](https://docs.databricks.com/en/dev-tools/ci-cd/ci-cd-github.html).
1. Information on how to [run Databricks notebooks from a GitHub Action](https://github.com/databricks/run-notebook/tree/main).
1. See [action.yml](https://github.com/databricks/run-notebook/blob/main/action.yml) for the latest interface and docs for databricks/run-notebook.
1. [Databricks REST API reference](https://docs.databricks.com/api/workspace/introduction).
1. GitHub [Events that trigger workflows](https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows).
1. GitHub [Webhook events and payloads](https://docs.github.com/en/webhooks/webhook-events-and-payloads).
1. GitHub [Payloads for Pull Request](https://docs.github.com/en/webhooks/webhook-events-and-payloads?actionType=closed#pull_request).

## Additional Useful GitHub Action Code

1. Code to run the Action workflow only if a file changes in a specific folder, e.g. databricks-notebooks, so that changing the README file does not trigger the workflow:

```yaml
name: Run Databricks ETL jobs in an isolated environment by using lakeFS

on:
  pull_request:
    paths:
      - 'databricks-notebooks/**'
```
1. If you use Scala for your Databricks notebooks, this is the step to build the Scala job:

```yaml
- name: Build Scala job
  run: |
    cd etl_jobs
    sbt assembly
    ls -lh /home/runner/work/image-segmentation-repo/image-segmentation-repo/etl_jobs/target/scala-2.12/transform-assembly-0.1.0-SNAPSHOT.jar
```
1. Upload a file to S3, e.g. the JAR file built in the previous step:

```yaml
- name: Upload JAR to S3
  uses: hkusu/s3-upload-action@v2
  id: S3
  with:
    aws-access-key-id: ${{secrets.AWS_ACCESS_KEY}}
    aws-secret-access-key: ${{secrets.AWS_SECRET_KEY}}
    aws-region: 'us-east-1'
    aws-bucket: "treeverse-ort-simulation-bucket"
    bucket-root: "amit"
    destination-dir: "jars/pr-${{ github.event.number }}"
    file-path: "/home/runner/work/image-segmentation-repo/image-segmentation-repo/etl_jobs/target/scala-2.12/transform-assembly-0.1.0-SNAPSHOT.jar"
```
1. Upload a file to DBFS (Databricks FS), e.g. the JAR file built in the previous step:

```yaml
- name: Upload JAR to DBFS
  uses: databricks/upload-dbfs-temp@v0
  with:
    local-path: /home/runner/work/image-segmentation-repo/image-segmentation-repo/etl_jobs/target/scala-2.12/transform-assembly-0.1.0-SNAPSHOT.jar
  id: upload_jar
- name: Get JAR location
  run: |
    echo "JAR location: ${{ steps.upload_jar.outputs.dbfs-file-path }}"
```
1. Create a new Databricks cluster instead of using an existing cluster, install libraries, and trigger the Scala job:

```yaml
- name: Trigger Databricks Run Scala ETL Job
  uses: databricks/run-notebook@v0
  id: trigger_databricks_notebook_run_scala_etl_job
  with:
    run-name: "PR ${{ github.event.number }} GitHub Action - Run Scala ETL job"
    local-notebook-path: "Run Scala ETL Job.py"
    notebook-params-json: >
      {
        "env": "dev",
        "repo": "amit-pr-checks-repo",
        "branch": "pr-${{ github.event.number }}-${{ github.event.pull_request.head.sha }}",
        "etl_start_date": "2012-01-01",
        "etl_end_date": "2012-03-01"
      }
    new-cluster-json: >
      {
        "num_workers": 1,
        "spark_version": "14.3.x-scala2.12",
        "node_type_id": "m5d.large",
        "spark_conf": {
          "spark.hadoop.fs.lakefs.access.mode": "presigned",
          "spark.hadoop.fs.lakefs.impl": "io.lakefs.LakeFSFileSystem",
          "spark.hadoop.fs.lakefs.endpoint": "https://treeverse.us-east-1.lakefscloud.io/api/v1",
          "spark.hadoop.fs.lakefs.access.key": "${{secrets.LAKEFS_ACCESS_KEY}}",
          "spark.hadoop.fs.lakefs.secret.key": "${{secrets.LAKEFS_SECRET_KEY}}",
          "spark.hadoop.fs.s3a.access.key": "${{secrets.AWS_ACCESS_KEY}}",
          "spark.hadoop.fs.s3a.secret.key": "${{secrets.AWS_SECRET_KEY}}"
        }
      }
    libraries-json: >
      [
        { "jar": "s3://treeverse-ort-simulation-bucket/amit/jars/pr-${{ github.event.number }}/transform-assembly-0.1.0-SNAPSHOT.jar" },
        { "maven": {"coordinates": "io.lakefs:hadoop-lakefs-assembly:0.2.1"} },
        { "pypi": {"package": "lakefs==0.4.1"} }
      ]
```
1. Code to check out a single folder from the repo instead of the full repo:

```yaml
# Checkout project code
# Use sparse checkout to only select files in a directory
# Turning off cone mode ensures that files in the project root are not included during checkout
- name: Check out the repo
  uses: actions/checkout@v4
  with:
    sparse-checkout: 'etl_jobs/src'
    sparse-checkout-cone-mode: false
```
1. Get the branch's commit list and store it in a GitHub multi-line environment variable:

```yaml
- name: Get branch list
  run: |
    {
      echo 'PR_BRANCHES<<EOF'
      git log -${{ env.PR_FETCH_DEPTH }} --pretty=format:'%H'
      echo ''
      echo 'EOF'
    } >> $GITHUB_ENV
```

34 changes: 34 additions & 0 deletions
01_standalone_examples/databricks-ci-cd/databricks-notebooks/Create Sample Delta Tables.py
```python
# Databricks notebook source
#dbutils.widgets.text("data_source_storage_namespace", "s3://treeverse-ort-simulation-bucket/amit/data-source/delta-tables")

dataSourceStorageNamespace = getArgument('data_source_storage_namespace')

# COMMAND ----------

data = [
    (100, 'intelligence'),
    (200, 'music'),
    (300, 'entertainment'),
    (400, 'professional athlete'),
]
columns = ["category_id", "category"]

df = spark.createDataFrame(data=data, schema=columns)
df.write.format("delta").mode("overwrite").save(f"{dataSourceStorageNamespace}/category_raw")
df.display()

# COMMAND ----------

data = [
    ('James', 'Bond', 'England', 100),
    ('Robbie', 'Williams', 'England', 200),
    ('Hulk', 'Hogan', 'USA', 300),
    ('Mister', 'T', 'USA', 300),
    ('Rafael', 'Nadal', 'Spain', 400),
    ('Paul', 'Haver', 'Belgium', 200),
]
columns = ["firstname", "lastname", "country", "category"]

df = spark.createDataFrame(data=data, schema=columns)
df.write.format("delta").mode("overwrite").save(f"{dataSourceStorageNamespace}/famous_people_raw")
df.display()
```