From f96a57658fd3bf9307c6ddeb8589503f411d633d Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 9 May 2024 09:30:56 -0700 Subject: [PATCH 1/3] Added Databricks CI/CD demo --- .../pr_commit_run_databricks_etl_job.yml | 101 ++++++++ .../databricks-ci-cd/README.md | 222 ++++++++++++++++++ .../Create Sample Delta Tables.py | 34 +++ .../Create lakeFS Repo and Import Data.py | 66 ++++++ .../databricks-notebooks/ETL Job.py | 27 +++ .../databricks-notebooks/Run Validations.py | 55 +++++ 6 files changed, 505 insertions(+) create mode 100644 01_standalone_examples/databricks-ci-cd/.github/workflows/pr_commit_run_databricks_etl_job.yml create mode 100644 01_standalone_examples/databricks-ci-cd/README.md create mode 100644 01_standalone_examples/databricks-ci-cd/databricks-notebooks/Create Sample Delta Tables.py create mode 100644 01_standalone_examples/databricks-ci-cd/databricks-notebooks/Create lakeFS Repo and Import Data.py create mode 100644 01_standalone_examples/databricks-ci-cd/databricks-notebooks/ETL Job.py create mode 100644 01_standalone_examples/databricks-ci-cd/databricks-notebooks/Run Validations.py diff --git a/01_standalone_examples/databricks-ci-cd/.github/workflows/pr_commit_run_databricks_etl_job.yml b/01_standalone_examples/databricks-ci-cd/.github/workflows/pr_commit_run_databricks_etl_job.yml new file mode 100644 index 000000000..11ed3223b --- /dev/null +++ b/01_standalone_examples/databricks-ci-cd/.github/workflows/pr_commit_run_databricks_etl_job.yml @@ -0,0 +1,101 @@ +name: Run Databricks ETL jobs in an isolated environment by using lakeFS + +on: + pull_request: + +env: + DATABRICKS_HOST: ${{ vars.DATABRICKS_HOST }} + DATABRICKS_TOKEN: ${{ secrets.DATABRICKS_TOKEN }} + +jobs: + run-etl-jobs-in-isolated-environment: + runs-on: ubuntu-latest + + steps: + - name: Check out the repo + uses: actions/checkout@v4 + - name: Set additional environment variables + run: | + echo "WORKSPACE_NOTEBOOK_PATH=${{ vars.DATABRICKS_WORKSPACE_NOTEBOOK_PATH }}/pr-${{ github.event.number }}-${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV + echo "LOCAL_NOTEBOOK_PATH=/home/runner/work/${{ github.event.pull_request.head.repo.name }}/${{ github.event.pull_request.head.repo.name }}/databricks-notebooks" >> $GITHUB_ENV + echo "LAKFES_BRANCH_NAME=pr-${{ github.event.number }}-${{ github.event.pull_request.head.sha }}" >> $GITHUB_ENV + - name: Create Databricks Workspace directory and import Databricks notebooks + run: | + curl -F path="${{ env.WORKSPACE_NOTEBOOK_PATH }}" \ + ${{ vars.DATABRICKS_HOST }}/api/2.0/workspace/mkdirs --header "Authorization: Bearer ${{ env.DATABRICKS_TOKEN }}" + + cd ${{ env.LOCAL_NOTEBOOK_PATH }} + + for file in *.py + do + curl -F path="${{ env.WORKSPACE_NOTEBOOK_PATH }}/$file" \ + -F language=PYTHON -F overwrite=true -F content=@"$file" \ + ${{ vars.DATABRICKS_HOST }}/api/2.0/workspace/import --header "Authorization: Bearer ${{ env.DATABRICKS_TOKEN }}" + done + - name: Trigger Databricks job to create sample Delta tables + uses: databricks/run-notebook@v0.0.3 + id: trigger_databricks_notebook_create_sample_delta_tables + with: + run-name: "GitHub Action - PR ${{ github.event.number }} - Create Sample Delta Tables" + workspace-notebook-path: "${{ env.WORKSPACE_NOTEBOOK_PATH }}/Create Sample Delta Tables.py" + notebook-params-json: > + { + "data_source_storage_namespace": "${{ vars.DATA_SOURCE_STORAGE_NAMESPACE }}" + } + existing-cluster-id: "${{ vars.DATABRICKS_CLUSTER_ID }}" + - name: Trigger Databricks job to create lakeFS repo and import data + uses: 
databricks/run-notebook@v0.0.3 + id: trigger_databricks_notebook_create_lakefs_repo + with: + run-name: "GitHub Action - PR ${{ github.event.number }} - Create lakeFS Repo and Import Data" + workspace-notebook-path: "${{ env.WORKSPACE_NOTEBOOK_PATH }}/Create lakeFS Repo and Import Data.py" + notebook-params-json: > + { + "databricks_secret_scope": "${{ vars.DATABRICKS_SECRET_SCOPE }}", + "lakefs_end_point": "${{ vars.LAKEFS_END_POINT }}", + "lakefs_repo": "${{ vars.LAKFES_REPO_NAME }}", + "lakefs_repo_storage_namespace": "${{ vars.LAKEFS_REPO_STORAGE_NAMESPACE }}", + "lakefs_branch": "${{ env.LAKFES_BRANCH_NAME }}", + "data_source_storage_namespace": "${{ vars.DATA_SOURCE_STORAGE_NAMESPACE }}" + } + existing-cluster-id: "${{ vars.DATABRICKS_CLUSTER_ID }}" + - name: Trigger Databricks ETL Job + uses: databricks/run-notebook@v0.0.3 + id: trigger_databricks_notebook_etl_job + with: + run-name: "GitHub Action - PR ${{ github.event.number }} - ETL Job" + workspace-notebook-path: "${{ env.WORKSPACE_NOTEBOOK_PATH }}/ETL Job.py" + notebook-params-json: > + { + "environment": "dev", + "data_source_storage_namespace": "${{ vars.DATA_SOURCE_STORAGE_NAMESPACE }}", + "lakefs_repo": "${{ vars.LAKFES_REPO_NAME }}", + "lakefs_branch": "${{ env.LAKFES_BRANCH_NAME }}" + } + existing-cluster-id: "${{ vars.DATABRICKS_CLUSTER_ID }}" + - name: Trigger Databricks job to run validations + uses: databricks/run-notebook@v0.0.3 + id: trigger_databricks_notebook_run_validations + with: + run-name: "GitHub Action - PR ${{ github.event.number }} - Run Validations" + workspace-notebook-path: "${{ env.WORKSPACE_NOTEBOOK_PATH }}/Run Validations.py" + notebook-params-json: > + { + "databricks_secret_scope": "${{ vars.DATABRICKS_SECRET_SCOPE }}", + "lakefs_end_point": "${{ vars.LAKEFS_END_POINT }}", + "lakefs_repo": "${{ vars.LAKFES_REPO_NAME }}", + "lakefs_branch": "${{ env.LAKFES_BRANCH_NAME }}" + } + existing-cluster-id: "${{ vars.DATABRICKS_CLUSTER_ID }}" + outputs: > + notebook-output >> "$GITHUB_OUTPUT" + - name: Check for failed validations + run: | + echo "Validation Output: ${{ steps.trigger_databricks_notebook_run_validations.outputs.notebook-output }}" + if [[ "${{ steps.trigger_databricks_notebook_run_validations.outputs.notebook-output }}" == "Success" ]] + then + echo "## ✅ No validation failures found" + else + echo "## 🚨👆🏻👆🏻 Validation checks failed 👆🏻👆🏻🚨" + exit 1 + fi \ No newline at end of file diff --git a/01_standalone_examples/databricks-ci-cd/README.md b/01_standalone_examples/databricks-ci-cd/README.md new file mode 100644 index 000000000..3ec81babd --- /dev/null +++ b/01_standalone_examples/databricks-ci-cd/README.md @@ -0,0 +1,222 @@ +# lakeFS-samples-ci-cd + +Start by ⭐️ starring [lakeFS open source](https://go.lakefs.io/oreilly-course) project. + +**This sample captures a collection of Databricks notebooks and GitHub Action code that demonstrate how to run Databricks ETL jobs in an isolated environment by using lakeFS.** + +## Prerequisites +* lakeFS installed and running on a server or in the cloud. If you don't have lakeFS already running then either use [lakeFS Cloud](https://demo.lakefs.io/) which provides free lakeFS server on-demand with a single click or refer to [lakeFS Quickstart](https://docs.lakefs.io/quickstart/) doc. +* Databricks server with the ability to run compute clusters on top of it. +* Configure your Databricks cluster to use lakeFS Hadoop file system. 
Read this blog [Databricks and lakeFS Integration: Step-by-Step Configuration Tutorial](https://lakefs.io/blog/databricks-lakefs-integration-tutorial/) or [lakeFS documentation](https://docs.lakefs.io/integrations/spark.html#lakefs-hadoop-filesystem) for the configuration. +* Permissions to manage the cluster configuration, including adding libraries. +* GitHub account. + +## Setup + +1. Create [Databricks personal access token](https://docs.databricks.com/en/dev-tools/auth/pat.html). + + +1. Create Databricks secret scope e.g. **demos** or use an existing secret scope. Add following secrets in that secret scope by following [Secret management docs](https://docs.databricks.com/en/security/secrets/index.html): + + lakefs_access_key_id e.g. 'AKIAIOSFOLKFSSAMPLES' + + lakefs_secret_access_key e.g. 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' + +1. Create a Git repository. It can be named **lakeFS-samples-ci-cd**. + +1. Clone this repository: + + ```bash + git clone https://github.com/treeverse/lakeFS-samples && cd lakeFS-samples/01_standalone_examples/aws-databricks/ci-cd-demo + ``` + +1. Create folders **.github/workflows** and **databricks-notebooks** in your Git repo. + +1. Upload **pr_commit_run_databricks_etl_job.yml** file in **lakeFS-samples/01_standalone_examples/aws-databricks/ci-cd-demo/.github/workflows** folder to **.github/workflows** folder in your Git repo. + +1. Upload all files in **lakeFS-samples/01_standalone_examples/aws-databricks/ci-cd-demo/databricks-notebooks** folder to **databricks-notebooks** folder in your Git repo. + +1. Add following secrets in your Git repo by following [Creating secrets for a repository docs](https://docs.github.com/en/actions/security-guides/using-secrets-in-github-actions#creating-secrets-for-a-repository). This is the Databricks token created in 1st step above. + + DATABRICKS_TOKEN + + +1. Add following variables in your Git repo by following [Creating configuration variables for a repository docs](https://docs.github.com/en/actions/learn-github-actions/variables#creating-configuration-variables-for-a-repository): +* Variable to store your [Databricks host name or URL](https://docs.databricks.com/en/workspace/workspace-details.html#workspace-instance-names-urls-and-ids) e.g. https://cust-success.cloud.databricks.com + + DATABRICKS_HOST + +* Variable to store your [Databricks Cluster ID](https://docs.databricks.com/en/workspace/workspace-details.html#cluster-url-and-id) e.g. 1115-164516-often242 + + DATABRICKS_CLUSTER_ID + +* Variable to store your [Databricks Workspace Folder path](https://docs.databricks.com/en/workspace/workspace-details.html#folder-id) e.g. /Shared/lakefs_demos/ci_cd_demo or /Users/me@example.com/MyFolder/lakefs_demos/ci_cd_demo + + DATABRICKS_WORKSPACE_NOTEBOOK_PATH + +* Variable to store your Databricks Secret Scope created in 2nd step e.g. demos + + DATABRICKS_SECRET_SCOPE + +* Variable to store your lakeFS End Point e.g. https://company.region.lakefscloud.io + + LAFEFS_END_POINT + +* Variable to store your lakeFS repository name (which will be created by this demo) e.g. databricks-ci-cd-repo + + LAKFES_REPO_NAME + +* Variable to store the storage namespace for the lakeFS repo. It is a location in the underlying storage where data for lakeFS repository will be stored. e.g. s3://example + + LAKEFS_REPO_STORAGE_NAMESPACE + +* Variable to store the storage namespace where Delta tables created by this demo will be stored e.g. s3://data-source/delta-tables. Do NOT use the same storage namespace as above. 
+ + DATA_SOURCE_STORAGE_NAMESPACE + +## Demo Instructions + +1. Create a new branch in your Git repository. Select newly created branch. +1. Remove the comment from the last 5 lines of code in **ETL Job.py** inside **databricks-notebooks** folder and Commit your changes. +1. Go to **Pull requests** tab in your Git repo, create Pull Request. +1. Go to **Actions** tab in your Git repo. Git Action will start running automaically and validation checks will fail. +1. Go back to **Code** tab in your Git repo and select the branch created in 1st step. Comment back the last 5 lines of code in **ETL Job.py** and Commit your changes. +1. Go back to **Actions** tab in your Git repo. Git Action will start running again and validation checks will pass this time. + +## Useful Information + +1. Databricks [Continuous integration and delivery using GitHub Actions](https://docs.databricks.com/en/dev-tools/ci-cd/ci-cd-github.html). +1. Information on how to [run Databricks notebooks from GitHub Action](https://github.com/databricks/run-notebook/tree/main). +1. See [action.yml](https://github.com/databricks/run-notebook/blob/main/action.yml) for the latest interface and docs for databricks/run-notebook. +1. [Databricks REST API reference](https://docs.databricks.com/api/workspace/introduction). +1. GitHub [Events that trigger workflows](https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows). +1. GitHub [Webhook events and payloads](https://docs.github.com/en/webhooks/webhook-events-and-payloads). +1. GitHub [Payloads for Pull Request](https://docs.github.com/en/webhooks/webhook-events-and-payloads?actionType=closed#pull_request). + +## Additional Useful GitHub Action Code + +1. If you use Scala for Databricks notebooks then this is the step to build Scala job: + + ```bash + - name: Build Scala job + run: | + cd etl_jobs + sbt assembly + ls -lh /home/runner/work/image-segmentation-repo/image-segmentation-repo/etl_jobs/target/scala-2.12/transform-assembly-0.1.0-SNAPSHOT.jar + ``` + +1. Upload a file to S3 e.g. upload JAR file built in the previous step: + + ```bash + - name: Upload JAR to S3 + uses: hkusu/s3-upload-action@v2 + id: S3 + with: + aws-access-key-id: ${{secrets.AWS_ACCESS_KEY}} + aws-secret-access-key: ${{secrets.AWS_SECRET_KEY}} + aws-region: 'us-east-1' + aws-bucket: "treeverse-ort-simulation-bucket" + bucket-root: "amit" + destination-dir: "jars/pr-${{ github.event.number }}" + file-path: "/home/runner/work/image-segmentation-repo/image-segmentation-repo/etl_jobs/target/scala-2.12/transform-assembly-0.1.0-SNAPSHOT.jar" + ``` + +1. Upload a file to DBFS (Databricks FS) e.g. upload JAR file built in the previous step: + + ```bash + - name: Upload JAR to DBFS + uses: databricks/upload-dbfs-temp@v0 + with: + local-path: /home/runner/work/image-segmentation-repo/image-segmentation-repo/etl_jobs/target/scala-2.12/transform-assembly-0.1.0-SNAPSHOT.jar + id: upload_jar + - name: Get JAR location + run: | + echo "JAR location: ${{ steps.upload_jar.outputs.dbfs-file-path }}" + ``` + +1. 
Create a new Databricks cluster instead of using an existing cluster, install libraries and trigger Scala job: + + ```bash + - name: Trigger Databricks Run Scala ETL Job + uses: databricks/run-notebook@v0.0.3 + id: trigger_databricks_notebook_run_scala_etl_job + with: + run-name: "PR ${{ github.event.number }} GitHub Action - Run Scala ETL job" + local-notebook-path: "Run Scala ETL Job.py" + notebook-params-json: > + { + "env": "dev", + "repo": "amit-pr-checks-repo", + "branch": "pr-${{ github.event.number }}-${{ github.event.pull_request.head.sha }}", + "etl_start_date": "2012-01-01", + "etl_end_date": "2012-03-01" + } + new-cluster-json: > + { + "num_workers": 1, + "spark_version": "14.3.x-scala2.12", + "node_type_id": "m5d.large", + "spark_conf": { + "spark.hadoop.fs.lakefs.access.mode": "presigned", + "spark.hadoop.fs.lakefs.impl": "io.lakefs.LakeFSFileSystem", + "spark.hadoop.fs.lakefs.endpoint": "https://treeverse.us-east-1.lakefscloud.io/api/v1", + "spark.hadoop.fs.lakefs.access.key": "${{secrets.LAKEFS_ACCESS_KEY}}", + "spark.hadoop.fs.lakefs.secret.key": "${{secrets.LAKEFS_SECRET_KEY}}", + "spark.hadoop.fs.s3a.access.key": "${{secrets.AWS_ACCESS_KEY}}", + "spark.hadoop.fs.s3a.secret.key": "${{secrets.AWS_SECRET_KEY}}" + } + } + libraries-json: > + [ + { "jar": "s3://treeverse-ort-simulation-bucket/amit/jars/pr-${{ github.event.number }}/transform-assembly-0.1.0-SNAPSHOT.jar" }, + { "maven": {"coordinates": "io.lakefs:hadoop-lakefs-assembly:0.2.1"} }, + { "pypi": {"package": "lakefs==0.4.1"} } + ] + ``` + +1. Code to run the workflow only if any file changes in a specific folder (etl_jobs in this case) : + + ```bash + name: Run Scala job for isolated testing by using lakeFS + + on: + pull_request: + paths: + - 'etl_jobs/**' + ``` + +1. Code to checkout a folder from the repo instead of full repo: + + ```bash + # Checkout project code + # Use sparse checkout to only select files in a directory + # Turning off cone mode ensures that files in the project root are not included during checkout + - name: Checks out the repo + uses: actions/checkout@v4 + with: + sparse-checkout: 'etl_jobs/src' + sparse-checkout-cone-mode: false + ``` + +1. Get branch list and store it in a GitHub multi-line environment variable: + + ```bash + - name: Get branch list + run: | + { + echo 'PR_BRANCHES<> $GITHUB_ENV + ``` + +1. If you use Scala for Databricks notebooks then this is the step to build Scala job: + + ```bash + ``` + +1. 
If you use Scala for Databricks notebooks then this is the step to build Scala job: + + ```bash + ``` diff --git a/01_standalone_examples/databricks-ci-cd/databricks-notebooks/Create Sample Delta Tables.py b/01_standalone_examples/databricks-ci-cd/databricks-notebooks/Create Sample Delta Tables.py new file mode 100644 index 000000000..de7f433bf --- /dev/null +++ b/01_standalone_examples/databricks-ci-cd/databricks-notebooks/Create Sample Delta Tables.py @@ -0,0 +1,34 @@ +# Databricks notebook source +#dbutils.widgets.text("data_source_storage_namespace", "s3://treeverse-ort-simulation-bucket/amit/data-source/delta-tables") + +dataSourceStorageNamespace = getArgument('data_source_storage_namespace') + +# COMMAND ---------- + +data = [ + (100,'intelligence'), + (200,'music'), + (300,'entertainment'), + (400,'professional athlete'), +] +columns = ["category_id", "category"] + +df = spark.createDataFrame(data=data, schema = columns) +df.write.format("delta").mode("overwrite").save(f"{dataSourceStorageNamespace}/category_raw") +df.display() + +# COMMAND ---------- + +data = [ + ('James','Bond','England',100), + ('Robbie','Williams','England',200), + ('Hulk','Hogan','USA',300), + ('Mister','T','USA',300), + ('Rafael','Nadal','Spain',400), + ('Paul','Haver','Belgium',200), +] +columns = ["firstname", "lastname", "country", "category"] + +df = spark.createDataFrame(data=data, schema = columns) +df.write.format("delta").mode("overwrite").save(f"{dataSourceStorageNamespace}/famous_people_raw") +df.display() \ No newline at end of file diff --git a/01_standalone_examples/databricks-ci-cd/databricks-notebooks/Create lakeFS Repo and Import Data.py b/01_standalone_examples/databricks-ci-cd/databricks-notebooks/Create lakeFS Repo and Import Data.py new file mode 100644 index 000000000..c2ed72f2f --- /dev/null +++ b/01_standalone_examples/databricks-ci-cd/databricks-notebooks/Create lakeFS Repo and Import Data.py @@ -0,0 +1,66 @@ +# Databricks notebook source +#dbutils.widgets.text("databricks_secret_scope", "demos") +#dbutils.widgets.text("lakefs_end_point", "https://treeverse.us-east-1.lakefscloud.io") +#dbutils.widgets.text("lakefs_repo", "amit-databricks-ci-cd-repo") +#dbutils.widgets.text("lakefs_repo_storage_namespace", "s3://treeverse-ort-simulation-bucket/amit") +#dbutils.widgets.text("lakefs_branch", "test") +#dbutils.widgets.text("data_source_storage_namespace", "s3://treeverse-ort-simulation-bucket/amit/data-source/delta-tables") + +databricksSecretScope = getArgument('databricks_secret_scope') +lakefsEndPoint = getArgument('lakefs_end_point') +repo_name = getArgument('lakefs_repo') +storageNamespace = getArgument('lakefs_repo_storage_namespace') + '/' + repo_name +newBranch = getArgument('lakefs_branch') +importSource = getArgument('data_source_storage_namespace') + +lakefsAccessKey = dbutils.secrets.get(databricksSecretScope, 'lakefs_access_key_id') +lakefsSecretKey = dbutils.secrets.get(databricksSecretScope, 'lakefs_secret_access_key') +sourceBranch = "main" +importDestination = "" + +# COMMAND ---------- + +import lakefs +from lakefs.client import Client + +clt = Client( + host=lakefsEndPoint, + username=lakefsAccessKey, + password=lakefsSecretKey, +) + +print("Verifying lakeFS credentials") +print(clt.version) +print("lakeFS credentials verified") + +# COMMAND ---------- + +repo = lakefs.Repository(repo_name, client=clt).create(storage_namespace=storageNamespace, default_branch=sourceBranch, exist_ok=True) +branchMain = repo.branch(sourceBranch) +print(repo) + +# COMMAND ---------- + 
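+# Import the source Delta tables into the repository's main branch. lakeFS import is a
+# metadata-only operation (objects are referenced in place in the source bucket, not copied);
+# the loop below polls the importer until it completes or reports an error.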
+import time + +importer = branchMain.import_data(commit_message="import objects", metadata={"key": "value"}) \ + .prefix(importSource, destination=importDestination) + +importer.start() +status = importer.status() +print(status) + +while not status.completed and status.error is None: + time.sleep(2) + status = importer.status() + print(status) + +if status.error: + raise Exception(status.error) + +print(f"\nImported a total of {status.ingested_objects} objects into branch {sourceBranch}") + +# COMMAND ---------- + +branchNew = repo.branch(newBranch).create(source_reference=sourceBranch) +print(f"{newBranch} ref:", branchNew.get_commit().id) \ No newline at end of file diff --git a/01_standalone_examples/databricks-ci-cd/databricks-notebooks/ETL Job.py b/01_standalone_examples/databricks-ci-cd/databricks-notebooks/ETL Job.py new file mode 100644 index 000000000..4c6ca8b34 --- /dev/null +++ b/01_standalone_examples/databricks-ci-cd/databricks-notebooks/ETL Job.py @@ -0,0 +1,27 @@ +# Databricks notebook source +#dbutils.widgets.text("environment", "dev") +#dbutils.widgets.text("data_source_storage_namespace", "s3://treeverse-ort-simulation-bucket/amit/data-source/delta-tables") +#dbutils.widgets.text("lakefs_repo", "amit-databricks-ci-cd-repo") +#dbutils.widgets.text("lakefs_branch", "test") + +ENVIRONMENT = getArgument('environment') + +if ENVIRONMENT == "prod": + DATA_SOURCE = getArgument('data_source_storage_namespace') +elif ENVIRONMENT == "dev": + DATA_SOURCE = f"lakefs://{getArgument('lakefs_repo')}/{getArgument('lakefs_branch')}/delta-tables" +print(DATA_SOURCE) + +# COMMAND ---------- + +df = spark.read.format("delta").load(f"{DATA_SOURCE}/famous_people_raw") +df.write.format("delta").partitionBy("country").save(f"{DATA_SOURCE}/famous_people") +df.display() + +# COMMAND ---------- + +# from pyspark.sql.functions import col +# df = spark.read.format("delta").load(f"{DATA_SOURCE}/category_raw") +# df_not_music = df.filter(col("category") != "music") +# df_not_music.write.format("delta").mode("overwrite").save(f"{DATA_SOURCE}/category_raw") +# df_not_music.display() \ No newline at end of file diff --git a/01_standalone_examples/databricks-ci-cd/databricks-notebooks/Run Validations.py b/01_standalone_examples/databricks-ci-cd/databricks-notebooks/Run Validations.py new file mode 100644 index 000000000..54d84d96a --- /dev/null +++ b/01_standalone_examples/databricks-ci-cd/databricks-notebooks/Run Validations.py @@ -0,0 +1,55 @@ +# Databricks notebook source +#dbutils.widgets.text("databricks_secret_scope", "demos") +#dbutils.widgets.text("lakefs_end_point", "https://treeverse.us-east-1.lakefscloud.io") +#dbutils.widgets.text("lakefs_repo", "amit-databricks-ci-cd-repo") +#dbutils.widgets.text("lakefs_branch", "test") + +databricksSecretScope = getArgument('databricks_secret_scope') +lakefsEndPoint = getArgument('lakefs_end_point') +repo_name = getArgument('lakefs_repo') +newBranch = getArgument('lakefs_branch') + +lakefsAccessKey = dbutils.secrets.get(databricksSecretScope, 'lakefs_access_key_id') +lakefsSecretKey = dbutils.secrets.get(databricksSecretScope, 'lakefs_secret_access_key') +sourceBranch = "main" +DATA_SOURCE = f"lakefs://{repo_name}/{newBranch}/delta-tables" + +# COMMAND ---------- + +import lakefs +from lakefs.client import Client + +clt = Client( + host=lakefsEndPoint, + username=lakefsAccessKey, + password=lakefsSecretKey, +) + +print("Verifying lakeFS credentials") +print(clt.version) +print("lakeFS credentials verified") + +# COMMAND ---------- + +repo = 
lakefs.Repository(repo_name, client=clt) +branchNew = repo.branch(newBranch) +branchNew.commit(message='Commit ETL job changes') + +# COMMAND ---------- + +df_category = spark.read.format("delta").load(f"{DATA_SOURCE}/category_raw") +df_category.display() + +# COMMAND ---------- + +df_famous_people = spark.read.format("delta").load(f"{DATA_SOURCE}/famous_people") +df_famous_people.groupby("category").count().display() + +# COMMAND ---------- + +# Check number of categories +number_of_categories = df_famous_people.groupby("category").count().count() +if number_of_categories == df_category.count(): + dbutils.notebook.exit("Success") +else: + dbutils.notebook.exit(f"Referential integrity issue. Number of categories in 'famous_people' table are {number_of_categories} while number of categories in parent 'category_raw' table are {df_category.count()}.") \ No newline at end of file From 6ba0fdadadd300911440192a67c8cd49dfcea751 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 9 May 2024 15:17:58 -0700 Subject: [PATCH 2/3] Fixed issues --- .../databricks-ci-cd/README.md | 69 ++++++++++++------- README.md | 1 + 2 files changed, 44 insertions(+), 26 deletions(-) diff --git a/01_standalone_examples/databricks-ci-cd/README.md b/01_standalone_examples/databricks-ci-cd/README.md index 3ec81babd..ef36dd8dc 100644 --- a/01_standalone_examples/databricks-ci-cd/README.md +++ b/01_standalone_examples/databricks-ci-cd/README.md @@ -18,61 +18,78 @@ Start by ⭐️ starring [lakeFS open source](https://go.lakefs.io/oreilly-cours 1. Create Databricks secret scope e.g. **demos** or use an existing secret scope. Add following secrets in that secret scope by following [Secret management docs](https://docs.databricks.com/en/security/secrets/index.html): - lakefs_access_key_id e.g. 'AKIAIOSFOLKFSSAMPLES' + lakefs_access_key_id e.g. 'AKIAIOSFOLKFSSAMPLES' - lakefs_secret_access_key e.g. 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' + lakefs_secret_access_key e.g. 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY' + + You can use following Databricks commands to create secrets: + ```bash + databricks secrets put-secret --json '{ + "scope": "demos", + "key": "lakefs_access_key_id", + "string_value": "AKIAIOSFOLKFSSAMPLES" + }' + + databricks secrets put-secret --json '{ + "scope": "demos", + "key": "lakefs_secret_access_key", + "string_value": "wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY" + }' + ``` 1. Create a Git repository. It can be named **lakeFS-samples-ci-cd**. 1. Clone this repository: ```bash - git clone https://github.com/treeverse/lakeFS-samples && cd lakeFS-samples/01_standalone_examples/aws-databricks/ci-cd-demo + git clone https://github.com/treeverse/lakeFS-samples && cd lakeFS-samples/01_standalone_examples/databricks-ci-cd ``` 1. Create folders **.github/workflows** and **databricks-notebooks** in your Git repo. -1. Upload **pr_commit_run_databricks_etl_job.yml** file in **lakeFS-samples/01_standalone_examples/aws-databricks/ci-cd-demo/.github/workflows** folder to **.github/workflows** folder in your Git repo. +1. Upload **pr_commit_run_databricks_etl_job.yml** file in **lakeFS-samples/01_standalone_examples/databricks-ci-cd/.github/workflows** folder to **.github/workflows** folder in your Git repo. -1. Upload all files in **lakeFS-samples/01_standalone_examples/aws-databricks/ci-cd-demo/databricks-notebooks** folder to **databricks-notebooks** folder in your Git repo. +1. 
Upload all files in **lakeFS-samples/01_standalone_examples/databricks-ci-cd/databricks-notebooks** folder to **databricks-notebooks** folder in your Git repo. -1. Add following secrets in your Git repo by following [Creating secrets for a repository docs](https://docs.github.com/en/actions/security-guides/using-secrets-in-github-actions#creating-secrets-for-a-repository). This is the Databricks token created in 1st step above. +1. Add following secrets in your Git repo by following [Creating secrets for a repository docs](https://docs.github.com/en/actions/security-guides/using-secrets-in-github-actions#creating-secrets-for-a-repository). This is the Databricks token created in 1st step above. If you copy & paste the secret name then verify that there are no spaces before and after the secret name. - DATABRICKS_TOKEN + DATABRICKS_TOKEN 1. Add following variables in your Git repo by following [Creating configuration variables for a repository docs](https://docs.github.com/en/actions/learn-github-actions/variables#creating-configuration-variables-for-a-repository): * Variable to store your [Databricks host name or URL](https://docs.databricks.com/en/workspace/workspace-details.html#workspace-instance-names-urls-and-ids) e.g. https://cust-success.cloud.databricks.com - DATABRICKS_HOST + DATABRICKS_HOST * Variable to store your [Databricks Cluster ID](https://docs.databricks.com/en/workspace/workspace-details.html#cluster-url-and-id) e.g. 1115-164516-often242 - DATABRICKS_CLUSTER_ID + DATABRICKS_CLUSTER_ID * Variable to store your [Databricks Workspace Folder path](https://docs.databricks.com/en/workspace/workspace-details.html#folder-id) e.g. /Shared/lakefs_demos/ci_cd_demo or /Users/me@example.com/MyFolder/lakefs_demos/ci_cd_demo - DATABRICKS_WORKSPACE_NOTEBOOK_PATH + DATABRICKS_WORKSPACE_NOTEBOOK_PATH * Variable to store your Databricks Secret Scope created in 2nd step e.g. demos - DATABRICKS_SECRET_SCOPE + DATABRICKS_SECRET_SCOPE * Variable to store your lakeFS End Point e.g. https://company.region.lakefscloud.io - LAFEFS_END_POINT + LAKEFS_END_POINT * Variable to store your lakeFS repository name (which will be created by this demo) e.g. databricks-ci-cd-repo - LAKFES_REPO_NAME + LAKFES_REPO_NAME * Variable to store the storage namespace for the lakeFS repo. It is a location in the underlying storage where data for lakeFS repository will be stored. e.g. s3://example - LAKEFS_REPO_STORAGE_NAMESPACE + LAKEFS_REPO_STORAGE_NAMESPACE * Variable to store the storage namespace where Delta tables created by this demo will be stored e.g. s3://data-source/delta-tables. Do NOT use the same storage namespace as above. - DATA_SOURCE_STORAGE_NAMESPACE + If it is not there then create Databricks [External Location](https://docs.databricks.com/en/sql/language-manual/sql-ref-external-locations.html) to write to s3://data-source URL and you should have **READ FILES** and **WRITES FILES** [premissions on and External Location](https://docs.databricks.com/en/connect/unity-catalog/manage-external-locations.html#grant-permissions-on-an-external-location) + + DATA_SOURCE_STORAGE_NAMESPACE ## Demo Instructions @@ -95,6 +112,17 @@ Start by ⭐️ starring [lakeFS open source](https://go.lakefs.io/oreilly-cours ## Additional Useful GitHub Action Code +1. Code to run the Action workflow only if any file changes in a specific folder e.g. databricks-notebooks. 
So, changing README file will not run the workflow: + + ```bash + name: Run Databricks ETL jobs in an isolated environment by using lakeFS + + on: + pull_request: + paths: + - 'databricks-notebooks/**' + ``` + 1. If you use Scala for Databricks notebooks then this is the step to build Scala job: ```bash @@ -174,17 +202,6 @@ Start by ⭐️ starring [lakeFS open source](https://go.lakefs.io/oreilly-cours ] ``` -1. Code to run the workflow only if any file changes in a specific folder (etl_jobs in this case) : - - ```bash - name: Run Scala job for isolated testing by using lakeFS - - on: - pull_request: - paths: - - 'etl_jobs/**' - ``` - 1. Code to checkout a folder from the repo instead of full repo: ```bash diff --git a/README.md b/README.md index ac8694601..45b4e2693 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,7 @@ Under the [standalone_examples](./01_standalone_examples/) folder are a set of e * [Airflow (2)](./01_standalone_examples/airflow-02/) - lakeFS + Airflow * [Azure Databricks](./01_standalone_examples/azure-databricks/) * [AWS Databricks](./01_standalone_examples/aws-databricks/) +* [Databricks CI/CD](./01_standalone_examples/databricks-ci-cd/) * [AWS Glue and Athena](./01_standalone_examples/aws-glue-athena/) * [AWS Glue and Trino](./01_standalone_examples/aws-glue-trino/) * [lakeFS + Dagster](./01_standalone_examples/dagster-integration/) From 14c8c89f11156b3d3b5c5d9e52d1e6a645bcdc3d Mon Sep 17 00:00:00 2001 From: iddoavn <87393827+iddoavn@users.noreply.github.com> Date: Fri, 10 May 2024 11:15:59 -0400 Subject: [PATCH 3/3] Update notebooks_to_exclude.txt Added ./R.ipynb to excluded files --- .github/workflows/notebooks_to_exclude.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/notebooks_to_exclude.txt b/.github/workflows/notebooks_to_exclude.txt index c98398875..9303a65aa 100644 --- a/.github/workflows/notebooks_to_exclude.txt +++ b/.github/workflows/notebooks_to_exclude.txt @@ -31,6 +31,7 @@ # and not designed to run end-to-end ./R-client.ipynb ./R-weather.ipynb +./R.ipynb # Ignore any temporary notebook caches etc .ipynb_checkpoints