From f96a57658fd3bf9307c6ddeb8589503f411d633d Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 9 May 2024 09:30:56 -0700 Subject: [PATCH 1/3] Added Databricks CI/CD demo --- .../pr_commit_run_databricks_etl_job.yml | 101 ++++++++ .../databricks-ci-cd/README.md | 222 ++++++++++++++++++ .../Create Sample Delta Tables.py | 34 +++ .../Create lakeFS Repo and Import Data.py | 66 ++++++ .../databricks-notebooks/ETL Job.py | 27 +++ .../databricks-notebooks/Run Validations.py | 55 +++++ 6 files changed, 505 insertions(+) create mode 100644 01_standalone_examples/databricks-ci-cd/.github/workflows/pr_commit_run_databricks_etl_job.yml create mode 100644 01_standalone_examples/databricks-ci-cd/README.md create mode 100644 01_standalone_examples/databricks-ci-cd/databricks-notebooks/Create Sample Delta Tables.py create mode 100644 01_standalone_examples/databricks-ci-cd/databricks-notebooks/Create lakeFS Repo and Import Data.py create mode 100644 01_standalone_examples/databricks-ci-cd/databricks-notebooks/ETL Job.py create mode 100644 01_standalone_examples/databricks-ci-cd/databricks-notebooks/Run Validations.py diff --git a/01_standalone_examples/databricks-ci-cd/.github/workflows/pr_commit_run_databricks_etl_job.yml b/01_standalone_examples/databricks-ci-cd/.github/workflows/pr_commit_run_databricks_etl_job.yml new file mode 100644 index 000000000..11ed3223b --- /dev/null +++ b/01_standalone_examples/databricks-ci-cd/.github/workflows/pr_commit_run_databricks_etl_job.yml @@ -0,0 +1,101 @@ +name: Run Databricks ETL jobs in an isolated environment by using lakeFS + +on: + pull_request: + +env: + DATABRICKS_HOST: ${{ vars.DATABRICKS_HOST }} + DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }} + +jobs: + run-etl-jobs-in-isolated-environment: + runs-on: ubuntu-latest + + steps: + - name: Check out the repo + uses: actions/checkout@v4 + - name: Set additional environment variables + run: | + echo "WORKSPACE_NOTEBOOK_PATH=${{ vars.DATABRICKS_WORKSPACE_NOTEBOOK_PATH }}/pr-${{ github.event.number }}-${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV + echo "LOCAL_NOTEBOOK_PATH=/home/runner/work/${{ github.event.pull_request.head.repo.name }}/${{ github.event.pull_request.head.repo.name }}/databricks-notebooks" >> $GITHUB_ENV + echo "LAKFES_BRANCH_NAME=pr-${{ github.event.number }}-${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV + - name: Create Databricks Workspace directory and import Databricks notebooks + run: | + curl -F path="${{ env.WORKSPACE_NOTEBOOK_PATH }}" \ + ${{ vars.DATABRICKS_HOST }}/api/2.0/workspace/mkdirs --header "Authorization: Bearer ${{ env.DATABRICKS_TOKEN }}" + + cd ${{ env.LOCAL_NOTEBOOK_PATH }} + + for file in *.py + do + curl -F path="${{ env.WORKSPACE_NOTEBOOK_PATH }}/$file" \ + -F language=PYTHON -F overwrite=true -F content=@"$file" \ + ${{ vars.DATABRICKS_HOST }}/api/2.0/workspace/import --header "Authorization: Bearer ${{ env.DATABRICKS_TOKEN }}" + done + - name: Trigger Databricks job to create sample Delta tables + uses: databricks/run-notebook@v0.0.3 + id: trigger_databricks_notebook_create_sample_delta_tables + with: + run-name: "GitHub Action - PR ${{ github.event.number }} - Create Sample Delta Tables" + workspace-notebook-path: "${{ env.WORKSPACE_NOTEBOOK_PATH }}/Create Sample Delta Tables.py" + notebook-params-json: > + { + "data_source_storage_namespace": "${{ vars.DATA_SOURCE_STORAGE_NAMESPACE }}" + } + existing-cluster-id: "${{ vars.DATABRICKS_CLUSTER_ID }}" + - name: Trigger Databricks job to create lakeFS repo and import data + uses: 
databricks/run-notebook@v0.0.3 + id: trigger_databricks_notebook_create_lakefs_repo + with: + run-name: "GitHub Action - PR ${{ github.event.number }} - Create lakeFS Repo and Import Data" + workspace-notebook-path: "${{ env.WORKSPACE_NOTEBOOK_PATH }}/Create lakeFS Repo and Import Data.py" + notebook-params-json: > + { + "databricks_secret_scope": "${{ vars.DATABRICKS_SECRET_SCOPE }}", + "lakefs_end_point": "${{ vars.LAKEFS_END_POINT }}", + "lakefs_repo": "${{ vars.LAKFES_REPO_NAME }}", + "lakefs_repo_storage_namespace": "${{ vars.LAKEFS_REPO_STORAGE_NAMESPACE }}", + "lakefs_branch": "${{ env.LAKFES_BRANCH_NAME }}", + "data_source_storage_namespace": "${{ vars.DATA_SOURCE_STORAGE_NAMESPACE }}" + } + existing-cluster-id: "${{ vars.DATABRICKS_CLUSTER_ID }}" + - name: Trigger Databricks ETL Job + uses: databricks/run-notebook@v0.0.3 + id: trigger_databricks_notebook_etl_job + with: + run-name: "GitHub Action - PR ${{ github.event.number }} - ETL Job" + workspace-notebook-path: "${{ env.WORKSPACE_NOTEBOOK_PATH }}/ETL Job.py" + notebook-params-json: > + { + "environment": "dev", + "data_source_storage_namespace": "${{ vars.DATA_SOURCE_STORAGE_NAMESPACE }}", + "lakefs_repo": "${{ vars.LAKFES_REPO_NAME }}", + "lakefs_branch": "${{ env.LAKFES_BRANCH_NAME }}" + } + existing-cluster-id: "${{ vars.DATABRICKS_CLUSTER_ID }}" + - name: Trigger Databricks job to run validations + uses: databricks/run-notebook@v0.0.3 + id: trigger_databricks_notebook_run_validations + with: + run-name: "GitHub Action - PR ${{ github.event.number }} - Run Validations" + workspace-notebook-path: "${{ env.WORKSPACE_NOTEBOOK_PATH }}/Run Validations.py" + notebook-params-json: > + { + "databricks_secret_scope": "${{ vars.DATABRICKS_SECRET_SCOPE }}", + "lakefs_end_point": "${{ vars.LAKEFS_END_POINT }}", + "lakefs_repo": "${{ vars.LAKFES_REPO_NAME }}", + "lakefs_branch": "${{ env.LAKFES_BRANCH_NAME }}" + } + existing-cluster-id: "${{ vars.DATABRICKS_CLUSTER_ID }}" + outputs: > + notebook-output >> "$GITHUB_OUTPUT" + - name: Check for failed validations + run: | + echo "Validation Output: ${{ steps.trigger_databricks_notebook_run_validations.outputs.notebook-output }}" + if [[ "${{ steps.trigger_databricks_notebook_run_validations.outputs.notebook-output }}" == "Success" ]] + then + echo "## ✅ No validation failures found" + else + echo "## 🚨👆🏻👆🏻 Validation checks failed 👆🏻👆🏻🚨" + exit 1 + fi \ No newline at end of file diff --git a/01_standalone_examples/databricks-ci-cd/README.md b/01_standalone_examples/databricks-ci-cd/README.md new file mode 100644 index 000000000..3ec81babd --- /dev/null +++ b/01_standalone_examples/databricks-ci-cd/README.md @@ -0,0 +1,222 @@ +# lakeFS-samples-ci-cd + +Start by ⭐️ starring [lakeFS open source](https://go.lakefs.io/oreilly-course) project. + +**This sample captures a collection of Databricks notebooks and GitHub Action code that demonstrate how to run Databricks ETL jobs in an isolated environment by using lakeFS.** + +## Prerequisites +* lakeFS installed and running on a server or in the cloud. If you don't have lakeFS already running then either use [lakeFS Cloud](https://demo.lakefs.io/) which provides free lakeFS server on-demand with a single click or refer to [lakeFS Quickstart](https://docs.lakefs.io/quickstart/) doc. +* Databricks server with the ability to run compute clusters on top of it. +* Configure your Databricks cluster to use lakeFS Hadoop file system. 
Read this blog [Databricks and lakeFS Integration: Step-by-Step Configuration Tutorial](https://lakefs.io/blog/databricks-lakefs-integration-tutorial/) or [lakeFS documentation](https://docs.lakefs.io/integrations/spark.html#lakefs-hadoop-filesystem) for the configuration. +* Permissions to manage the cluster configuration, including adding libraries. +* GitHub account. + +## Setup + +1. Create [Databricks personal access token](https://docs.databricks.com/en/dev-tools/auth/pat.html). + + +1. Create Databricks secret scope e.g. **demos** or use an existing secret scope. Add following secrets in that secret scope by following [Secret management docs](https://docs.databricks.com/en/security/secrets/index.html): + + lakefs_access_key_id e.g. 'AKIAIOSFOLKFSSAMPLES' + + lakefs_secret_access_key e.g. 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' + +1. Create a Git repository. It can be named **lakeFS-samples-ci-cd**. + +1. Clone this repository: + + ```bash + git clone https://github.com/treeverse/lakeFS-samples && cd lakeFS-samples/01_standalone_examples/aws-databricks/ci-cd-demo + ``` + +1. Create folders **.github/workflows** and **databricks-notebooks** in your Git repo. + +1. Upload **pr_commit_run_databricks_etl_job.yml** file in **lakeFS-samples/01_standalone_examples/aws-databricks/ci-cd-demo/.github/workflows** folder to **.github/workflows** folder in your Git repo. + +1. Upload all files in **lakeFS-samples/01_standalone_examples/aws-databricks/ci-cd-demo/databricks-notebooks** folder to **databricks-notebooks** folder in your Git repo. + +1. Add following secrets in your Git repo by following [Creating secrets for a repository docs](https://docs.github.com/en/actions/security-guides/using-secrets-in-github-actions#creating-secrets-for-a-repository). This is the Databricks token created in 1st step above. + + DATABRICKS_TOKEN + + +1. Add following variables in your Git repo by following [Creating configuration variables for a repository docs](https://docs.github.com/en/actions/learn-github-actions/variables#creating-configuration-variables-for-a-repository): +* Variable to store your [Databricks host name or URL](https://docs.databricks.com/en/workspace/workspace-details.html#workspace-instance-names-urls-and-ids) e.g. https://cust-success.cloud.databricks.com + + DATABRICKS_HOST + +* Variable to store your [Databricks Cluster ID](https://docs.databricks.com/en/workspace/workspace-details.html#cluster-url-and-id) e.g. 1115-164516-often242 + + DATABRICKS_CLUSTER_ID + +* Variable to store your [Databricks Workspace Folder path](https://docs.databricks.com/en/workspace/workspace-details.html#folder-id) e.g. /Shared/lakefs_demos/ci_cd_demo or /Users/me@example.com/MyFolder/lakefs_demos/ci_cd_demo + + DATABRICKS_WORKSPACE_NOTEBOOK_PATH + +* Variable to store your Databricks Secret Scope created in 2nd step e.g. demos + + DATABRICKS_SECRET_SCOPE + +* Variable to store your lakeFS End Point e.g. https://company.region.lakefscloud.io + + LAFEFS_END_POINT + +* Variable to store your lakeFS repository name (which will be created by this demo) e.g. databricks-ci-cd-repo + + LAKFES_REPO_NAME + +* Variable to store the storage namespace for the lakeFS repo. It is a location in the underlying storage where data for lakeFS repository will be stored. e.g. s3://example + + LAKEFS_REPO_STORAGE_NAMESPACE + +* Variable to store the storage namespace where Delta tables created by this demo will be stored e.g. s3://data-source/delta-tables. Do NOT use the same storage namespace as above. 
+ + DATA_SOURCE_STORAGE_NAMESPACE + +## Demo Instructions + +1. Create a new branch in your Git repository. Select newly created branch. +1. Remove the comment from the last 5 lines of code in **ETL Job.py** inside **databricks-notebooks** folder and Commit your changes. +1. Go to **Pull requests** tab in your Git repo, create Pull Request. +1. Go to **Actions** tab in your Git repo. Git Action will start running automaically and validation checks will fail. +1. Go back to **Code** tab in your Git repo and select the branch created in 1st step. Comment back the last 5 lines of code in **ETL Job.py** and Commit your changes. +1. Go back to **Actions** tab in your Git repo. Git Action will start running again and validation checks will pass this time. + +## Useful Information + +1. Databricks [Continuous integration and delivery using GitHub Actions](https://docs.databricks.com/en/dev-tools/ci-cd/ci-cd-github.html). +1. Information on how to [run Databricks notebooks from GitHub Action](https://github.com/databricks/run-notebook/tree/main). +1. See [action.yml](https://github.com/databricks/run-notebook/blob/main/action.yml) for the latest interface and docs for databricks/run-notebook. +1. [Databricks REST API reference](https://docs.databricks.com/api/workspace/introduction). +1. GitHub [Events that trigger workflows](https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows). +1. GitHub [Webhook events and payloads](https://docs.github.com/en/webhooks/webhook-events-and-payloads). +1. GitHub [Payloads for Pull Request](https://docs.github.com/en/webhooks/webhook-events-and-payloads?actionType=closed#pull_request). + +## Additional Useful GitHub Action Code + +1. If you use Scala for Databricks notebooks then this is the step to build Scala job: + + ```bash + - name: Build Scala job + run: | + cd etl_jobs + sbt assembly + ls -lh /home/runner/work/image-segmentation-repo/image-segmentation-repo/etl_jobs/target/scala-2.12/transform-assembly-0.1.0-SNAPSHOT.jar + ``` + +1. Upload a file to S3 e.g. upload JAR file built in the previous step: + + ```bash + - name: Upload JAR to S3 + uses: hkusu/s3-upload-action@v2 + id: S3 + with: + aws-access-key-id: ${{secrets.AWS_ACCESS_KEY}} + aws-secret-access-key: ${{secrets.AWS_SECRET_KEY}} + aws-region: 'us-east-1' + aws-bucket: "treeverse-ort-simulation-bucket" + bucket-root: "amit" + destination-dir: "jars/pr-${{ github.event.number }}" + file-path: "/home/runner/work/image-segmentation-repo/image-segmentation-repo/etl_jobs/target/scala-2.12/transform-assembly-0.1.0-SNAPSHOT.jar" + ``` + +1. Upload a file to DBFS (Databricks FS) e.g. upload JAR file built in the previous step: + + ```bash + - name: Upload JAR to DBFS + uses: databricks/upload-dbfs-temp@v0 + with: + local-path: /home/runner/work/image-segmentation-repo/image-segmentation-repo/etl_jobs/target/scala-2.12/transform-assembly-0.1.0-SNAPSHOT.jar + id: upload_jar + - name: Get JAR location + run: | + echo "JAR location: ${{ steps.upload_jar.outputs.dbfs-file-path }}" + ``` + +1. 
Create a new Databricks cluster instead of using an existing cluster, install libraries and trigger Scala job: + + ```bash + - name: Trigger Databricks Run Scala ETL Job + uses: databricks/run-notebook@v0.0.3 + id: trigger_databricks_notebook_run_scala_etl_job + with: + run-name: "PR ${{ github.event.number }} GitHub Action - Run Scala ETL job" + local-notebook-path: "Run Scala ETL Job.py" + notebook-params-json: > + { + "env": "dev", + "repo": "amit-pr-checks-repo", + "branch": "pr-${{ github.event.number }}-${{ github.event.pull_request.head.sha }}", + "etl_start_date": "2012-01-01", + "etl_end_date": "2012-03-01" + } + new-cluster-json: > + { + "num_workers": 1, + "spark_version": "14.3.x-scala2.12", + "node_type_id": "m5d.large", + "spark_conf": { + "spark.hadoop.fs.lakefs.access.mode": "presigned", + "spark.hadoop.fs.lakefs.impl": "io.lakefs.LakeFSFileSystem", + "spark.hadoop.fs.lakefs.endpoint": "https://treeverse.us-east-1.lakefscloud.io/api/v1", + "spark.hadoop.fs.lakefs.access.key": "${{secrets.LAKEFS_ACCESS_KEY}}", + "spark.hadoop.fs.lakefs.secret.key": "${{secrets.LAKEFS_SECRET_KEY}}", + "spark.hadoop.fs.s3a.access.key": "${{secrets.AWS_ACCESS_KEY}}", + "spark.hadoop.fs.s3a.secret.key": "${{secrets.AWS_SECRET_KEY}}" + } + } + libraries-json: > + [ + { "jar": "s3://treeverse-ort-simulation-bucket/amit/jars/pr-${{ github.event.number }}/transform-assembly-0.1.0-SNAPSHOT.jar" }, + { "maven": {"coordinates": "io.lakefs:hadoop-lakefs-assembly:0.2.1"} }, + { "pypi": {"package": "lakefs==0.4.1"} } + ] + ``` + +1. Code to run the workflow only if any file changes in a specific folder (etl_jobs in this case) : + + ```bash + name: Run Scala job for isolated testing by using lakeFS + + on: + pull_request: + paths: + - 'etl_jobs/**' + ``` + +1. Code to checkout a folder from the repo instead of full repo: + + ```bash + # Checkout project code + # Use sparse checkout to only select files in a directory + # Turning off cone mode ensures that files in the project root are not included during checkout + - name: Checks out the repo + uses: actions/checkout@v4 + with: + sparse-checkout: 'etl_jobs/src' + sparse-checkout-cone-mode: false + ``` + +1. Get branch list and store it in a GitHub multi-line environment variable: + + ```bash + - name: Get branch list + run: | + { + echo 'PR_BRANCHES<> $GITHUB_ENV + ``` + +1. If you use Scala for Databricks notebooks then this is the step to build Scala job: + + ```bash + ``` + +1. 
If you use Scala for Databricks notebooks then this is the step to build Scala job: + + ```bash + ``` diff --git a/01_standalone_examples/databricks-ci-cd/databricks-notebooks/Create Sample Delta Tables.py b/01_standalone_examples/databricks-ci-cd/databricks-notebooks/Create Sample Delta Tables.py new file mode 100644 index 000000000..de7f433bf --- /dev/null +++ b/01_standalone_examples/databricks-ci-cd/databricks-notebooks/Create Sample Delta Tables.py @@ -0,0 +1,34 @@ +# Databricks notebook source +#dbutils.widgets.text("data_source_storage_namespace", "s3://treeverse-ort-simulation-bucket/amit/data-source/delta-tables") + +dataSourceStorageNamespace = getArgument('data_source_storage_namespace') + +# COMMAND ---------- + +data = [ + (100,'intelligence'), + (200,'music'), + (300,'entertainment'), + (400,'professional athlete'), +] +columns = ["category_id", "category"] + +df = spark.createDataFrame(data=data, schema = columns) +df.write.format("delta").mode("overwrite").save(f"{dataSourceStorageNamespace}/category_raw") +df.display() + +# COMMAND ---------- + +data = [ + ('James','Bond','England',100), + ('Robbie','Williams','England',200), + ('Hulk','Hogan','USA',300), + ('Mister','T','USA',300), + ('Rafael','Nadal','Spain',400), + ('Paul','Haver','Belgium',200), +] +columns = ["firstname", "lastname", "country", "category"] + +df = spark.createDataFrame(data=data, schema = columns) +df.write.format("delta").mode("overwrite").save(f"{dataSourceStorageNamespace}/famous_people_raw") +df.display() \ No newline at end of file diff --git a/01_standalone_examples/databricks-ci-cd/databricks-notebooks/Create lakeFS Repo and Import Data.py b/01_standalone_examples/databricks-ci-cd/databricks-notebooks/Create lakeFS Repo and Import Data.py new file mode 100644 index 000000000..c2ed72f2f --- /dev/null +++ b/01_standalone_examples/databricks-ci-cd/databricks-notebooks/Create lakeFS Repo and Import Data.py @@ -0,0 +1,66 @@ +# Databricks notebook source +#dbutils.widgets.text("databricks_secret_scope", "demos") +#dbutils.widgets.text("lakefs_end_point", "https://treeverse.us-east-1.lakefscloud.io") +#dbutils.widgets.text("lakefs_repo", "amit-databricks-ci-cd-repo") +#dbutils.widgets.text("lakefs_repo_storage_namespace", "s3://treeverse-ort-simulation-bucket/amit") +#dbutils.widgets.text("lakefs_branch", "test") +#dbutils.widgets.text("data_source_storage_namespace", "s3://treeverse-ort-simulation-bucket/amit/data-source/delta-tables") + +databricksSecretScope = getArgument('databricks_secret_scope') +lakefsEndPoint = getArgument('lakefs_end_point') +repo_name = getArgument('lakefs_repo') +storageNamespace = getArgument('lakefs_repo_storage_namespace') + '/' + repo_name +newBranch = getArgument('lakefs_branch') +importSource = getArgument('data_source_storage_namespace') + +lakefsAccessKey = dbutils.secrets.get(databricksSecretScope, 'lakefs_access_key_id') +lakefsSecretKey = dbutils.secrets.get(databricksSecretScope, 'lakefs_secret_access_key') +sourceBranch = "main" +importDestination = "" + +# COMMAND ---------- + +import lakefs +from lakefs.client import Client + +clt = Client( + host=lakefsEndPoint, + username=lakefsAccessKey, + password=lakefsSecretKey, +) + +print("Verifying lakeFS credentials") +print(clt.version) +print("lakeFS credentials verified") + +# COMMAND ---------- + +repo = lakefs.Repository(repo_name, client=clt).create(storage_namespace=storageNamespace, default_branch=sourceBranch, exist_ok=True) +branchMain = repo.branch(sourceBranch) +print(repo) + +# COMMAND ---------- + 
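+# Import the source Delta tables into the repository's main branch. lakeFS import is a
+# metadata-only operation (objects are referenced in place in the source bucket, not copied);
+# the loop below polls the importer until it completes or reports an error.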
+import time + +importer = branchMain.import_data(commit_message="import objects", metadata={"key": "value"}) \ + .prefix(importSource, destination=importDestination) + +importer.start() +status = importer.status() +print(status) + +while not status.completed and status.error is None: + time.sleep(2) + status = importer.status() + print(status) + +if status.error: + raise Exception(status.error) + +print(f"\nImported a total of {status.ingested_objects} objects into branch {sourceBranch}") + +# COMMAND ---------- + +branchNew = repo.branch(newBranch).create(source_reference=sourceBranch) +print(f"{newBranch} ref:", branchNew.get_commit().id) \ No newline at end of file diff --git a/01_standalone_examples/databricks-ci-cd/databricks-notebooks/ETL Job.py b/01_standalone_examples/databricks-ci-cd/databricks-notebooks/ETL Job.py new file mode 100644 index 000000000..4c6ca8b34 --- /dev/null +++ b/01_standalone_examples/databricks-ci-cd/databricks-notebooks/ETL Job.py @@ -0,0 +1,27 @@ +# Databricks notebook source +#dbutils.widgets.text("environment", "dev") +#dbutils.widgets.text("data_source_storage_namespace", "s3://treeverse-ort-simulation-bucket/amit/data-source/delta-tables") +#dbutils.widgets.text("lakefs_repo", "amit-databricks-ci-cd-repo") +#dbutils.widgets.text("lakefs_branch", "test") + +ENVIRONMENT = getArgument('environment') + +if ENVIRONMENT == "prod": + DATA_SOURCE = getArgument('data_source_storage_namespace') +elif ENVIRONMENT == "dev": + DATA_SOURCE = f"lakefs://{getArgument('lakefs_repo')}/{getArgument('lakefs_branch')}/delta-tables" +print(DATA_SOURCE) + +# COMMAND ---------- + +df = spark.read.format("delta").load(f"{DATA_SOURCE}/famous_people_raw") +df.write.format("delta").partitionBy("country").save(f"{DATA_SOURCE}/famous_people") +df.display() + +# COMMAND ---------- + +# from pyspark.sql.functions import col +# df = spark.read.format("delta").load(f"{DATA_SOURCE}/category_raw") +# df_not_music = df.filter(col("category") != "music") +# df_not_music.write.format("delta").mode("overwrite").save(f"{DATA_SOURCE}/category_raw") +# df_not_music.display() \ No newline at end of file diff --git a/01_standalone_examples/databricks-ci-cd/databricks-notebooks/Run Validations.py b/01_standalone_examples/databricks-ci-cd/databricks-notebooks/Run Validations.py new file mode 100644 index 000000000..54d84d96a --- /dev/null +++ b/01_standalone_examples/databricks-ci-cd/databricks-notebooks/Run Validations.py @@ -0,0 +1,55 @@ +# Databricks notebook source +#dbutils.widgets.text("databricks_secret_scope", "demos") +#dbutils.widgets.text("lakefs_end_point", "https://treeverse.us-east-1.lakefscloud.io") +#dbutils.widgets.text("lakefs_repo", "amit-databricks-ci-cd-repo") +#dbutils.widgets.text("lakefs_branch", "test") + +databricksSecretScope = getArgument('databricks_secret_scope') +lakefsEndPoint = getArgument('lakefs_end_point') +repo_name = getArgument('lakefs_repo') +newBranch = getArgument('lakefs_branch') + +lakefsAccessKey = dbutils.secrets.get(databricksSecretScope, 'lakefs_access_key_id') +lakefsSecretKey = dbutils.secrets.get(databricksSecretScope, 'lakefs_secret_access_key') +sourceBranch = "main" +DATA_SOURCE = f"lakefs://{repo_name}/{newBranch}/delta-tables" + +# COMMAND ---------- + +import lakefs +from lakefs.client import Client + +clt = Client( + host=lakefsEndPoint, + username=lakefsAccessKey, + password=lakefsSecretKey, +) + +print("Verifying lakeFS credentials") +print(clt.version) +print("lakeFS credentials verified") + +# COMMAND ---------- + +repo = 
lakefs.Repository(repo_name, client=clt) +branchNew = repo.branch(newBranch) +branchNew.commit(message='Commit ETL job changes') + +# COMMAND ---------- + +df_category = spark.read.format("delta").load(f"{DATA_SOURCE}/category_raw") +df_category.display() + +# COMMAND ---------- + +df_famous_people = spark.read.format("delta").load(f"{DATA_SOURCE}/famous_people") +df_famous_people.groupby("category").count().display() + +# COMMAND ---------- + +# Check number of categories +number_of_categories = df_famous_people.groupby("category").count().count() +if number_of_categories == df_category.count(): + dbutils.notebook.exit("Success") +else: + dbutils.notebook.exit(f"Referential integrity issue. Number of categories in 'famous_people' table are {number_of_categories} while number of categories in parent 'category_raw' table are {df_category.count()}.") \ No newline at end of file From 6ba0fdadadd300911440192a67c8cd49dfcea751 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 9 May 2024 15:17:58 -0700 Subject: [PATCH 2/3] Fixed issues --- .../databricks-ci-cd/README.md | 69 ++++++++++++------- README.md | 1 + 2 files changed, 44 insertions(+), 26 deletions(-) diff --git a/01_standalone_examples/databricks-ci-cd/README.md b/01_standalone_examples/databricks-ci-cd/README.md index 3ec81babd..ef36dd8dc 100644 --- a/01_standalone_examples/databricks-ci-cd/README.md +++ b/01_standalone_examples/databricks-ci-cd/README.md @@ -18,61 +18,78 @@ Start by ⭐️ starring [lakeFS open source](https://go.lakefs.io/oreilly-cours 1. Create Databricks secret scope e.g. **demos** or use an existing secret scope. Add following secrets in that secret scope by following [Secret management docs](https://docs.databricks.com/en/security/secrets/index.html): - lakefs_access_key_id e.g. 'AKIAIOSFOLKFSSAMPLES' + lakefs_access_key_id e.g. 'AKIAIOSFOLKFSSAMPLES' - lakefs_secret_access_key e.g. 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' + lakefs_secret_access_key e.g. 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' + + You can use following Databricks commands to create secrets: + ```bash + databricks secrets put-secret --json '{ + "scope": "demos", + "key": "lakefs_access_key_id", + "string_value": "AKIAIOSFOLKFSSAMPLES" + }' + + databricks secrets put-secret --json '{ + "scope": "demos", + "key": "lakefs_secret_access_key", + "string_value": "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" + }' + ``` 1. Create a Git repository. It can be named **lakeFS-samples-ci-cd**. 1. Clone this repository: ```bash - git clone https://github.com/treeverse/lakeFS-samples && cd lakeFS-samples/01_standalone_examples/aws-databricks/ci-cd-demo + git clone https://github.com/treeverse/lakeFS-samples && cd lakeFS-samples/01_standalone_examples/databricks-ci-cd ``` 1. Create folders **.github/workflows** and **databricks-notebooks** in your Git repo. -1. Upload **pr_commit_run_databricks_etl_job.yml** file in **lakeFS-samples/01_standalone_examples/aws-databricks/ci-cd-demo/.github/workflows** folder to **.github/workflows** folder in your Git repo. +1. Upload **pr_commit_run_databricks_etl_job.yml** file in **lakeFS-samples/01_standalone_examples/databricks-ci-cd/.github/workflows** folder to **.github/workflows** folder in your Git repo. -1. Upload all files in **lakeFS-samples/01_standalone_examples/aws-databricks/ci-cd-demo/databricks-notebooks** folder to **databricks-notebooks** folder in your Git repo. +1. 
Upload all files in **lakeFS-samples/01_standalone_examples/databricks-ci-cd/databricks-notebooks** folder to **databricks-notebooks** folder in your Git repo. -1. Add following secrets in your Git repo by following [Creating secrets for a repository docs](https://docs.github.com/en/actions/security-guides/using-secrets-in-github-actions#creating-secrets-for-a-repository). This is the Databricks token created in 1st step above. +1. Add following secrets in your Git repo by following [Creating secrets for a repository docs](https://docs.github.com/en/actions/security-guides/using-secrets-in-github-actions#creating-secrets-for-a-repository). This is the Databricks token created in 1st step above. If you copy & paste the secret name then verify that there are no spaces before and after the secret name. - DATABRICKS_TOKEN + DATABRICKS_TOKEN 1. Add following variables in your Git repo by following [Creating configuration variables for a repository docs](https://docs.github.com/en/actions/learn-github-actions/variables#creating-configuration-variables-for-a-repository): * Variable to store your [Databricks host name or URL](https://docs.databricks.com/en/workspace/workspace-details.html#workspace-instance-names-urls-and-ids) e.g. https://cust-success.cloud.databricks.com - DATABRICKS_HOST + DATABRICKS_HOST * Variable to store your [Databricks Cluster ID](https://docs.databricks.com/en/workspace/workspace-details.html#cluster-url-and-id) e.g. 1115-164516-often242 - DATABRICKS_CLUSTER_ID + DATABRICKS_CLUSTER_ID * Variable to store your [Databricks Workspace Folder path](https://docs.databricks.com/en/workspace/workspace-details.html#folder-id) e.g. /Shared/lakefs_demos/ci_cd_demo or /Users/me@example.com/MyFolder/lakefs_demos/ci_cd_demo - DATABRICKS_WORKSPACE_NOTEBOOK_PATH + DATABRICKS_WORKSPACE_NOTEBOOK_PATH * Variable to store your Databricks Secret Scope created in 2nd step e.g. demos - DATABRICKS_SECRET_SCOPE + DATABRICKS_SECRET_SCOPE * Variable to store your lakeFS End Point e.g. https://company.region.lakefscloud.io - LAFEFS_END_POINT + LAKEFS_END_POINT * Variable to store your lakeFS repository name (which will be created by this demo) e.g. databricks-ci-cd-repo - LAKFES_REPO_NAME + LAKFES_REPO_NAME * Variable to store the storage namespace for the lakeFS repo. It is a location in the underlying storage where data for lakeFS repository will be stored. e.g. s3://example - LAKEFS_REPO_STORAGE_NAMESPACE + LAKEFS_REPO_STORAGE_NAMESPACE * Variable to store the storage namespace where Delta tables created by this demo will be stored e.g. s3://data-source/delta-tables. Do NOT use the same storage namespace as above. - DATA_SOURCE_STORAGE_NAMESPACE + If it is not there then create Databricks [External Location](https://docs.databricks.com/en/sql/language-manual/sql-ref-external-locations.html) to write to s3://data-source URL and you should have **READ FILES** and **WRITES FILES** [premissions on and External Location](https://docs.databricks.com/en/connect/unity-catalog/manage-external-locations.html#grant-permissions-on-an-external-location) + + DATA_SOURCE_STORAGE_NAMESPACE ## Demo Instructions @@ -95,6 +112,17 @@ Start by ⭐️ starring [lakeFS open source](https://go.lakefs.io/oreilly-cours ## Additional Useful GitHub Action Code +1. Code to run the Action workflow only if any file changes in a specific folder e.g. databricks-notebooks. 
So, changing README file will not run the workflow: + + ```bash + name: Run Databricks ETL jobs in an isolated environment by using lakeFS + + on: + pull_request: + paths: + - 'databricks-notebooks/**' + ``` + 1. If you use Scala for Databricks notebooks then this is the step to build Scala job: ```bash @@ -174,17 +202,6 @@ Start by ⭐️ starring [lakeFS open source](https://go.lakefs.io/oreilly-cours ] ``` -1. Code to run the workflow only if any file changes in a specific folder (etl_jobs in this case) : - - ```bash - name: Run Scala job for isolated testing by using lakeFS - - on: - pull_request: - paths: - - 'etl_jobs/**' - ``` - 1. Code to checkout a folder from the repo instead of full repo: ```bash diff --git a/README.md b/README.md index ac8694601..45b4e2693 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,7 @@ Under the [standalone_examples](./01_standalone_examples/) folder are a set of e * [Airflow (2)](./01_standalone_examples/airflow-02/) - lakeFS + Airflow * [Azure Databricks](./01_standalone_examples/azure-databricks/) * [AWS Databricks](./01_standalone_examples/aws-databricks/) +* [Databricks CI/CD](./01_standalone_examples/databricks-ci-cd/) * [AWS Glue and Athena](./01_standalone_examples/aws-glue-athena/) * [AWS Glue and Trino](./01_standalone_examples/aws-glue-trino/) * [lakeFS + Dagster](./01_standalone_examples/dagster-integration/) From 14c8c89f11156b3d3b5c5d9e52d1e6a645bcdc3d Mon Sep 17 00:00:00 2001 From: iddoavn <87393827+iddoavn@users.noreply.github.com> Date: Fri, 10 May 2024 11:15:59 -0400 Subject: [PATCH 3/3] Update notebooks_to_exclude.txt Added ./R.ipynb to excluded files --- .github/workflows/notebooks_to_exclude.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/notebooks_to_exclude.txt b/.github/workflows/notebooks_to_exclude.txt index c98398875..9303a65aa 100644 --- a/.github/workflows/notebooks_to_exclude.txt +++ b/.github/workflows/notebooks_to_exclude.txt @@ -31,6 +31,7 @@ # and not designed to run end-to-end ./R-client.ipynb ./R-weather.ipynb +./R.ipynb # Ignore any temporary notebook caches etc .ipynb_checkpoints