diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index ae2be43aa..fc44c9fe5 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -15,3 +15,8 @@ updates:
     schedule:
       interval: "weekly"
     rebase-strategy: "disabled"
+  - package-ecosystem: "docker"
+    directory: "/docker"
+    schedule:
+      interval: "weekly"
+    rebase-strategy: "disabled"
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index 9b2774f17..cdbdaa13f 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -13,8 +13,8 @@
 # This will only run manually. Run this workflow only after the
 # version bump workflow is completed and related changes are reviewed and merged.
 #
-
-name: Release to GitHub and PyPI
+name: "Release to GitHub, PyPI, and Docker"
+run-name: "Release ${{ inputs.version_number }} to GitHub, PyPI, and Docker"
 
 on:
   workflow_dispatch:
@@ -56,6 +56,11 @@ on:
       type: boolean
       default: true
       required: false
+    only_docker:
+      description: "Only release Docker image, skip GitHub & PyPI"
+      type: boolean
+      default: false
+      required: false
 
 permissions:
   contents: write # this is the permission that allows creating a new release
@@ -66,7 +71,7 @@ defaults:
 jobs:
   log-inputs:
-    name: Log Inputs
+    name: "Log Inputs"
     runs-on: ubuntu-latest
     steps:
       - name: "[DEBUG] Print Variables"
         run: |
@@ -79,6 +84,7 @@ jobs:
           echo AWS S3 bucket name: ${{ inputs.s3_bucket_name }}
           echo Package test command: ${{ inputs.package_test_command }}
           echo Test run: ${{ inputs.test_run }}
+          echo Only Docker: ${{ inputs.only_docker }}
 
   # The Spark repository uses CircleCI to run integration tests.
   # Because of this, the process of version bumps will be manual
@@ -87,27 +93,21 @@ jobs:
   # We are passing `env_setup_script_path` as an empty string
   # so that the integration tests stage will be skipped.
   audit-version-and-changelog:
-    name: Bump package version, Generate changelog
-
+    name: "Bump package version, Generate changelog"
     uses: dbt-labs/dbt-spark/.github/workflows/release-prep.yml@main
-
     with:
       sha: ${{ inputs.sha }}
       version_number: ${{ inputs.version_number }}
       target_branch: ${{ inputs.target_branch }}
       env_setup_script_path: ""
       test_run: ${{ inputs.test_run }}
-
     secrets: inherit
 
   log-outputs-audit-version-and-changelog:
     name: "[Log output] Bump package version, Generate changelog"
-    if: ${{ !failure() && !cancelled() }}
-
+    if: ${{ !failure() && !cancelled() && !inputs.only_docker }}
     needs: [audit-version-and-changelog]
-
     runs-on: ubuntu-latest
-
     steps:
       - name: Print variables
         run: |
@@ -115,12 +115,10 @@ jobs:
           echo Changelog path: ${{ needs.audit-version-and-changelog.outputs.changelog_path }}
 
   build-test-package:
-    name: Build, Test, Package
-    if: ${{ !failure() && !cancelled() }}
+    name: "Build, Test, Package"
+    if: ${{ !failure() && !cancelled() && !inputs.only_docker }}
     needs: [audit-version-and-changelog]
-
     uses: dbt-labs/dbt-release/.github/workflows/build.yml@main
-
     with:
       sha: ${{ needs.audit-version-and-changelog.outputs.final_sha }}
       version_number: ${{ inputs.version_number }}
@@ -129,19 +127,15 @@ jobs:
       s3_bucket_name: ${{ inputs.s3_bucket_name }}
       package_test_command: ${{ inputs.package_test_command }}
       test_run: ${{ inputs.test_run }}
-
     secrets:
       AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
       AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
 
   github-release:
-    name: GitHub Release
-    if: ${{ !failure() && !cancelled() }}
-
+    name: "GitHub Release"
+    if: ${{ !failure() && !cancelled() && !inputs.only_docker }}
     needs: [audit-version-and-changelog, build-test-package]
-
     uses: dbt-labs/dbt-release/.github/workflows/github-release.yml@main
-
     with:
       sha: ${{ needs.audit-version-and-changelog.outputs.final_sha }}
       version_number: ${{ inputs.version_number }}
@@ -149,35 +143,43 @@ jobs:
       test_run: ${{ inputs.test_run }}
 
   pypi-release:
-    name: PyPI Release
-
+    name: "PyPI Release"
+    if: ${{ !failure() && !cancelled() && !inputs.only_docker }}
     needs: [github-release]
-
     uses: dbt-labs/dbt-release/.github/workflows/pypi-release.yml@main
-
     with:
       version_number: ${{ inputs.version_number }}
       test_run: ${{ inputs.test_run }}
-
     secrets:
       PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
       TEST_PYPI_API_TOKEN: ${{ secrets.TEST_PYPI_API_TOKEN }}
 
+  docker-release:
+    name: "Docker Release"
+    # We cannot release to Docker on a test run: the Docker release resolves the
+    # GitHub tag for the version, and draft (test) releases don't actually tag the
+    # commit, so there would be nothing to release.
+    if: ${{ !failure() && !cancelled() && (!inputs.test_run || inputs.only_docker) }}
+    needs: [github-release]
+    permissions:
+      packages: write
+    uses: dbt-labs/dbt-release/.github/workflows/release-docker.yml@main
+    with:
+      version_number: ${{ inputs.version_number }}
+      dockerfile: "docker/Dockerfile"
+      test_run: ${{ inputs.test_run }}
+
   slack-notification:
     name: Slack Notification
     if: ${{ failure() && (!inputs.test_run || inputs.nightly_release) }}
-
     needs:
       [
-
         audit-version-and-changelog,
-
         build-test-package,
         github-release,
         pypi-release,
+        docker-release,
       ]
-
     uses: dbt-labs/dbt-release/.github/workflows/slack-post-notification.yml@main
     with:
       status: "failure"
-
     secrets:
       SLACK_WEBHOOK_URL: ${{ secrets.SLACK_DEV_CORE_ALERTS }}
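The new `only_docker` input lets you re-release just the Docker image for a version that already has a GitHub release. A minimal dispatch sketch, assuming the GitHub CLI and only the input names visible in the diff above; `1.8.0` is a placeholder version, and any other inputs keep their defaults:
```shell
# kick off a Docker-only release for an already-tagged version
gh workflow run release.yml \
  -f version_number=1.8.0 \
  -f only_docker=true
```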
diff --git a/Makefile b/Makefile
index af3a51541..ff4c0fc1b 100644
--- a/Makefile
+++ b/Makefile
@@ -38,3 +38,7 @@ help: ## Show this help message.
 	@echo
 	@echo 'targets:'
 	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
+
+.PHONY: docker-prod
+docker-prod:
+	docker build -f docker/Dockerfile -t dbt-spark .
diff --git a/docker-compose.yml b/docker-compose.yml
index ad083eaf4..cd3e1c776 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -2,7 +2,9 @@ version: "3.7"
 services:
 
   dbt-spark3-thrift:
-    build: docker/
+    build:
+      context: ./docker
+      dockerfile: spark.Dockerfile
     ports:
      - "10000:10000"
      - "4040:4040"
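The new `docker-prod` target is a thin wrapper around `docker build` for the production image. A quick smoke test, assuming you run it from the repo root (the `dbt` entrypoint comes from the production Dockerfile below):
```shell
make docker-prod                      # builds the image tagged "dbt-spark"
docker run --rm dbt-spark --version   # entrypoint is `dbt`, so this runs `dbt --version`
```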
diff --git a/docker/Dockerfile b/docker/Dockerfile
index bb4d378ed..ef4574ddd 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,30 +1,42 @@
-ARG OPENJDK_VERSION=8
-FROM eclipse-temurin:${OPENJDK_VERSION}-jre
-
-ARG BUILD_DATE
-ARG SPARK_VERSION=3.3.2
-ARG HADOOP_VERSION=3
-
-LABEL org.label-schema.name="Apache Spark ${SPARK_VERSION}" \
-      org.label-schema.build-date=$BUILD_DATE \
-      org.label-schema.version=$SPARK_VERSION
-
-ENV SPARK_HOME /usr/spark
-ENV PATH="/usr/spark/bin:/usr/spark/sbin:${PATH}"
-
-RUN apt-get update && \
-    apt-get install -y wget netcat procps libpostgresql-jdbc-java && \
-    wget -q "http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \
-    tar xzf "spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \
-    rm "spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \
-    mv "spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" /usr/spark && \
-    ln -s /usr/share/java/postgresql-jdbc4.jar /usr/spark/jars/postgresql-jdbc4.jar && \
-    apt-get remove -y wget && \
-    apt-get autoremove -y && \
-    apt-get clean
-
-COPY entrypoint.sh /scripts/
-RUN chmod +x /scripts/entrypoint.sh
-
-ENTRYPOINT ["/scripts/entrypoint.sh"]
-CMD ["--help"]
+# this image gets published to GHCR for production use
+ARG py_version=3.11.2
+
+FROM python:$py_version-slim-bullseye as base
+
+RUN apt-get update \
+  && apt-get dist-upgrade -y \
+  && apt-get install -y --no-install-recommends \
+    build-essential=12.9 \
+    ca-certificates=20210119 \
+    gcc=4:10.2.1-1 \
+    git=1:2.30.2-1+deb11u2 \
+    libpq-dev=13.14-0+deb11u1 \
+    libsasl2-dev=2.1.27+dfsg-2.1+deb11u1 \
+    make=4.3-4.1 \
+    openssh-client=1:8.4p1-5+deb11u3 \
+    python-dev-is-python3=3.9.2-1 \
+    software-properties-common=0.96.20.2-2.1 \
+    unixodbc-dev=2.3.6-0.1+b1 \
+  && apt-get clean \
+  && rm -rf \
+    /var/lib/apt/lists/* \
+    /tmp/* \
+    /var/tmp/*
+
+ENV PYTHONIOENCODING=utf-8
+ENV LANG=C.UTF-8
+
+RUN python -m pip install --upgrade "pip==24.0" "setuptools==69.2.0" "wheel==0.43.0" --no-cache-dir
+
+
+FROM base as dbt-spark
+
+ARG commit_ref=main
+ARG extras=all
+
+HEALTHCHECK CMD dbt --version || exit 1
+
+WORKDIR /usr/app/dbt/
+ENTRYPOINT ["dbt"]
+
+RUN python -m pip install --no-cache-dir "dbt-spark[${extras}] @ git+https://github.com/dbt-labs/dbt-spark@${commit_ref}"
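The base stage pins its interpreter through the `py_version` build arg, so the Python version can be swapped without editing the file. A sketch of overriding it, with the caveat that the tag must stay on a `slim-bullseye` base or the version-pinned apt packages above will fail to resolve; `3.10.7` is only an example tag:
```shell
docker build -f docker/Dockerfile \
  --target dbt-spark \
  --build-arg py_version=3.10.7 \
  --tag dbt-spark .
```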
diff --git a/docker/README.md b/docker/README.md
new file mode 100644
index 000000000..42ca5e227
--- /dev/null
+++ b/docker/README.md
@@ -0,0 +1,70 @@
+# Docker for dbt
+`Dockerfile` is suitable for building dbt Docker images locally or in CI/CD to automate populating a container registry.
+
+## Building an image:
+This Dockerfile can create images for the following target: `dbt-spark`
+
+In order to build a new image, run the following docker command:
+```shell
+docker build --tag <my_image_name> --target dbt-spark <path/to/dockerfile>
+```
+---
+> **Note:** Docker must be configured to use [BuildKit](https://docs.docker.com/develop/develop-images/build_enhancements/) in order for images to build properly!
+
+---
+
+By default the image will be populated with the latest version of `dbt-spark` on `main`.
+If you need to use a different version you can specify it by git ref using the `--build-arg` flag:
+```shell
+docker build --tag <my_image_name> \
+  --target dbt-spark \
+  --build-arg commit_ref=<commit_ref> \
+  <path/to/dockerfile>
+```
+
+### Examples:
+To build an image named "my-dbt" that supports Spark using the latest releases:
+```shell
+cd dbt-spark/docker
+docker build --tag my-dbt --target dbt-spark .
+```
+
+To build an image named "my-other-dbt" that supports Spark, pinned to adapter version 1.0.0b1:
+```shell
+cd dbt-spark/docker
+docker build \
+  --tag my-other-dbt \
+  --target dbt-spark \
+  --build-arg commit_ref=v1.0.0b1 \
+  .
+```
+
+## Special cases
+There are a few special cases worth noting:
+* The `dbt-spark` database adapter can be installed with three different pip extras: `PyHive`, `ODBC`, and the default `all`.
+If you wish to override the default you can use the `--build-arg` flag with the value of `extras=<extras_name>`.
+See the [docs](https://docs.getdbt.com/reference/warehouse-profiles/spark-profile) for more information.
+```shell
+docker build --tag my_dbt \
+  --target dbt-spark \
+  --build-arg commit_ref=v1.0.0b1 \
+  --build-arg extras=PyHive \
+  <path/to/dockerfile>
+```
+
+## Running an image in a container:
+The `ENTRYPOINT` for this Dockerfile is the command `dbt` so you can bind-mount your project to `/usr/app` and use dbt as normal:
+```shell
+docker run \
+  --network=host \
+  --mount type=bind,source=path/to/project,target=/usr/app \
+  --mount type=bind,source=path/to/profiles.yml,target=/root/.dbt/profiles.yml \
+  my-dbt \
+  ls
+```
+---
+**Notes:**
+* Bind-mount sources _must_ be absolute paths.
+* You may need to make adjustments to the docker networking settings depending on the specifics of your data warehouse/database host.
+
+---
diff --git a/docker/spark.Dockerfile b/docker/spark.Dockerfile
new file mode 100644
index 000000000..bb4d378ed
--- /dev/null
+++ b/docker/spark.Dockerfile
@@ -0,0 +1,30 @@
+ARG OPENJDK_VERSION=8
+FROM eclipse-temurin:${OPENJDK_VERSION}-jre
+
+ARG BUILD_DATE
+ARG SPARK_VERSION=3.3.2
+ARG HADOOP_VERSION=3
+
+LABEL org.label-schema.name="Apache Spark ${SPARK_VERSION}" \
+      org.label-schema.build-date=$BUILD_DATE \
+      org.label-schema.version=$SPARK_VERSION
+
+ENV SPARK_HOME /usr/spark
+ENV PATH="/usr/spark/bin:/usr/spark/sbin:${PATH}"
+
+RUN apt-get update && \
+    apt-get install -y wget netcat procps libpostgresql-jdbc-java && \
+    wget -q "http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \
+    tar xzf "spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \
+    rm "spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" && \
+    mv "spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" /usr/spark && \
+    ln -s /usr/share/java/postgresql-jdbc4.jar /usr/spark/jars/postgresql-jdbc4.jar && \
+    apt-get remove -y wget && \
+    apt-get autoremove -y && \
+    apt-get clean
+
+COPY entrypoint.sh /scripts/
+RUN chmod +x /scripts/entrypoint.sh
+
+ENTRYPOINT ["/scripts/entrypoint.sh"]
+CMD ["--help"]
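The renamed `spark.Dockerfile` continues to back the local test environment wired up in `docker-compose.yml`. As a quick check, assuming Docker Compose v2 (use `docker-compose` on older installs), the Thrift server should come up on `localhost:10000`:
```shell
docker compose up -d dbt-spark3-thrift    # service name from docker-compose.yml above
docker compose logs -f dbt-spark3-thrift  # watch for the Thrift server to start listening
```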