From 4bb727b3d4b67adc66cbacd235cbae71b557b34c Mon Sep 17 00:00:00 2001
From: rdsharma26 <65777064+rdsharma26@users.noreply.github.com>
Date: Wed, 10 Apr 2024 17:18:47 -0400
Subject: [PATCH] Added a dockerfile for building and testing the package (#195)

* Added a dockerfile for building and testing the package

This dockerfile can be used to setup and run the tests in the Python Deequ package.
This way, we do not need to install any dependencies in our local workspaces.

Right now, it only builds against Spark version 3.3.
Will be adding other versions in a future PR.

Verified that the docker run output is the same as that of the PR workflow.

* Locked Poetry version to 1.7.1
---
Review note: the Dockerfile hunk below was revised after the original commit
(apt-get update/install merged into one layer, absolute WORKDIR, exec-form
CMD, pip --no-cache-dir). The blob id on its "index" line and the commit id
in the "From" line still refer to the original, unrevised content.

 .github/workflows/base.yml |  2 +-
 Dockerfile                 | 35 +++++++++++++++++++++++++++++++++++
 README.md                  | 12 +++++++++++-
 3 files changed, 47 insertions(+), 2 deletions(-)
 create mode 100644 Dockerfile

diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml
index 1bce4a0..ea5f8f0 100644
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -33,7 +33,7 @@ jobs:
           SPARK_VERSION: ${{matrix.PYSPARK_VERSION}}
         run: |
           pip install --upgrade pip
-          pip install poetry
+          pip install poetry==1.7.1
           poetry install
           poetry add pyspark==$SPARK_VERSION
           poetry run python -m pytest -s tests
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..a7a236a
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,35 @@
+FROM ubuntu:22.04
+
+# Build-time only: keeps apt non-interactive without leaking into the runtime env.
+ARG DEBIAN_FRONTEND=noninteractive
+
+# Refresh the package index in the same layer as the installs that depend on
+# it, so a stale cached package index is never reused by a later install layer.
+RUN apt-get update \
+    && apt-get install -y software-properties-common \
+    && add-apt-repository -y ppa:deadsnakes/ppa \
+    && apt-get update \
+    && apt-get install -y \
+        openjdk-11-jdk \
+        python3-pip \
+        python3.8 \
+        python3.8-distutils \
+    && rm -rf /var/lib/apt/lists/*
+
+# Repoint python3 at the Python 3.8 interpreter installed above
+RUN rm /usr/bin/python3 && ln -s /usr/bin/python3.8 /usr/bin/python3
+RUN python3 --version
+RUN pip3 --version
+RUN java -version
+RUN pip install --no-cache-dir poetry==1.7.1
+
+COPY . /python-deequ
+WORKDIR /python-deequ
+
+RUN poetry lock --no-update
+RUN poetry install
+RUN poetry add pyspark==3.3
+
+ENV SPARK_VERSION=3.3
+# Exec form: pytest runs without an intermediate /bin/sh -c wrapper.
+CMD ["poetry", "run", "python", "-m", "pytest", "-s", "tests"]
diff --git a/README.md b/README.md
index 5befb26..cd7eb35 100644
--- a/README.md
+++ b/README.md
@@ -244,4 +244,14 @@ Take a look at tests in `tests/dataquality` and `tests/jobs`
 
 ```bash
 $ poetry run pytest
-```
\ No newline at end of file
+```
+
+## Running Tests Locally (Docker)
+
+If you have issues installing the dependencies listed above, another way to run the tests and verify your changes is through Docker. There is a Dockerfile that will install the required dependencies and run the tests in a container.
+
+```
+docker build . -t spark-3.3-docker-test
+docker run spark-3.3-docker-test
+```
+