From 12dda6ee3eeac8dd28ca7f7e46327896b0ae20bc Mon Sep 17 00:00:00 2001
From: Tianhao-Gu
Date: Wed, 22 May 2024 16:04:22 -0500
Subject: [PATCH 1/8] add delta spark packages to support delta lake

---
 .gitignore                                    |   3 +-
 Dockerfile                                    |  22 +-
 Pipfile                                       |   3 +
 Pipfile.lock                                  | 188 ++++++++++++++++--
 docker-compose.yaml                           |  49 ++++-
 .../{entrypoint.sh => notebook_entrypoint.sh} |   0
 src/spark/utils.py                            |  50 ++++-
 7 files changed, 283 insertions(+), 32 deletions(-)
 rename scripts/{entrypoint.sh => notebook_entrypoint.sh} (100%)

diff --git a/.gitignore b/.gitignore
index 9e74b31..478ea88 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 .DS_Store
 .idea
 .coverage
-*_pycache__
\ No newline at end of file
+*_pycache__
+cdm_shared_workspace/
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index 67f8ad3..4646c08 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -6,9 +6,23 @@ USER root
 RUN apt-get update && apt-get install -y \
     # GCC required to resolve error during JupyterLab installation: psutil could not be installed from sources because gcc is not installed.
-    gcc \
+    gcc curl \
     && rm -rf /var/lib/apt/lists/*
 
+# Install jars to support delta lake spark operations
+ENV HADOOP_AWS_VER=3.3.4
+RUN curl -O https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/${HADOOP_AWS_VER}/hadoop-aws-${HADOOP_AWS_VER}.jar \
+    && mv hadoop-aws-${HADOOP_AWS_VER}.jar /opt/bitnami/spark/jars
+
+# NOTE: ensure Delta Spark jars match the python pip delta-spark version specified in the Pipfile
+ENV DELTA_SPARK_VER=3.2.0
+ENV SCALA_VER=2.12
+RUN curl -O https://repo1.maven.org/maven2/io/delta/delta-spark_${SCALA_VER}/${DELTA_SPARK_VER}/delta-spark_${SCALA_VER}-${DELTA_SPARK_VER}.jar \
+    && mv delta-spark_${SCALA_VER}-${DELTA_SPARK_VER}.jar /opt/bitnami/spark/jars
+
+Run curl -O https://repo1.maven.org/maven2/io/delta/delta-storage/${DELTA_SPARK_VER}/delta-storage-${DELTA_SPARK_VER}.jar \
+    && mv delta-storage-${DELTA_SPARK_VER}.jar /opt/bitnami/spark/jars
+
 # install pipenv
 RUN pip3 install pipenv
 
@@ -19,10 +33,8 @@ RUN pipenv sync --system
 COPY ./src/ /src
 ENV PYTHONPATH "${PYTHONPATH}:/src"
 
-COPY scripts/entrypoint.sh /opt/
-RUN chmod a+x /opt/entrypoint.sh
+COPY ./scripts/ /opt/scripts/
+RUN chmod a+x /opt/scripts/*
 
 # Switch back to the original user
 USER ${ORI_USER}
-
-ENTRYPOINT ["/opt/entrypoint.sh"]
diff --git a/Pipfile b/Pipfile
index 738c9a9..89d9445 100644
--- a/Pipfile
+++ b/Pipfile
@@ -7,6 +7,9 @@ name = "pypi"
 jupyterlab= "==4.2.0"
 pyspark= "==3.5.1"
 boto3 = "==1.34.109"
+minio = "==7.2.7"
+delta-spark = "==3.2.0" # should match JAR version (DELTA_SPARK_VER) specified in the Dockerfile
+pandas = "==2.2.2"
 
 [dev-packages]
 pytest = "==8.2.1"
diff --git a/Pipfile.lock b/Pipfile.lock
index 7d88c7b..78cb8bf 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
 {
     "_meta": {
         "hash": {
-            "sha256": "215bb476408ae146af5f348b694815589f6b9ab391570cc62e81da06e1332633"
+            "sha256": "ae5920cceec21a61e079a143f2d03f35e3571c8345af2c3bde32106c16ec663b"
         },
         "pipfile-spec": 6,
         "requires": {
@@ -132,11 +132,11 @@
         },
         "botocore": {
             "hashes": [
-                "sha256:647059a81acbfab85c694b9b57b0ef200dde071449fb8837f10aef9c6472730d",
-                "sha256:804821252597821f7223cb3bfca2a2a513ae0bb9a71e8e22605aff6866e13e71"
+                "sha256:1edf3a825ec0a5edf238b2d42ad23305de11d5a71bb27d6f9a58b7e8862df1b6",
+                "sha256:b2c98c40ecf0b1facb9e61ceb7dfa28e61ae2456490554a16c8dbf99f20d6a18"
             ],
             "markers": "python_version >= '3.8'",
             "version": "==1.34.110"
         },
         "certifi": {
             "hashes": [
@@ -352,6 +352,14 @@
            "markers": "python_version >= '2.7' and 
python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==0.7.1" }, + "delta-spark": { + "hashes": [ + "sha256:641967828e47c64805f8c746513da80bea24b5f19b069cdcf64561cd3692e11d", + "sha256:c4ff3fa7218e58a702cb71eb64384b0005c4d6f0bbdd0fe0b38a53564d946e09" + ], + "index": "pypi", + "version": "==3.2.0" + }, "executing": { "hashes": [ "sha256:35afe2ce3affba8ee97f2d69927fa823b08b472b7b994e36a52a964b93d16147", @@ -406,6 +414,14 @@ "markers": "python_version >= '3.5'", "version": "==3.7" }, + "importlib-metadata": { + "hashes": [ + "sha256:30962b96c0c223483ed6cc7280e7f0199feb01a0e40cfae4d4450fc6fab1f570", + "sha256:b78938b926ee8d5f020fc4772d487045805a55ddbad2ecf21c6d60938dc7fcd2" + ], + "markers": "python_version >= '3.8'", + "version": "==7.1.0" + }, "ipykernel": { "hashes": [ "sha256:1181e653d95c6808039c509ef8e67c4126b3b3af7781496c7cbfb5ed938a27da", @@ -550,11 +566,11 @@ }, "jupyterlab-server": { "hashes": [ - "sha256:097b5ac709b676c7284ac9c5e373f11930a561f52cd5a86e4fc7e5a9c8a8631d", - "sha256:f5e26156e5258b24d532c84e7c74cc212e203bff93eb856f81c24c16daeecc75" + "sha256:15cbb349dc45e954e09bacf81b9f9bcb10815ff660fb2034ecd7417db3a7ea27", + "sha256:54aa2d64fd86383b5438d9f0c032f043c4d8c0264b8af9f60bd061157466ea43" ], "markers": "python_version >= '3.8'", - "version": "==2.27.1" + "version": "==2.27.2" }, "markupsafe": { "hashes": [ @@ -630,6 +646,14 @@ "markers": "python_version >= '3.8'", "version": "==0.1.7" }, + "minio": { + "hashes": [ + "sha256:473d5d53d79f340f3cd632054d0c82d2f93177ce1af2eac34a235bea55708d98", + "sha256:59d1f255d852fe7104018db75b3bebbd987e538690e680f7c5de835e422de837" + ], + "index": "pypi", + "version": "==7.2.7" + }, "mistune": { "hashes": [ "sha256:71481854c30fdbc938963d3605b72501f5c10a9320ecd412c121c163a1c7d205", @@ -678,6 +702,48 @@ "markers": "python_version >= '3.7'", "version": "==0.2.4" }, + "numpy": { + "hashes": [ + "sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b", + "sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818", + "sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20", + "sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0", + "sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010", + "sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a", + "sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea", + "sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c", + "sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71", + "sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110", + "sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be", + "sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a", + "sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a", + "sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5", + "sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed", + "sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd", + "sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c", + "sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e", + "sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0", + "sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c", + "sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a", + 
"sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b", + "sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0", + "sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6", + "sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2", + "sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a", + "sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30", + "sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218", + "sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5", + "sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07", + "sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2", + "sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4", + "sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764", + "sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef", + "sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3", + "sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f" + ], + "markers": "python_version == '3.11'", + "version": "==1.26.4" + }, "overrides": { "hashes": [ "sha256:55158fa3d93b98cc75299b1e67078ad9003ca27945c76162c1c0766d6f91820a", @@ -694,6 +760,41 @@ "markers": "python_version >= '3.7'", "version": "==24.0" }, + "pandas": { + "hashes": [ + "sha256:001910ad31abc7bf06f49dcc903755d2f7f3a9186c0c040b827e522e9cef0863", + "sha256:0ca6377b8fca51815f382bd0b697a0814c8bda55115678cbc94c30aacbb6eff2", + "sha256:0cace394b6ea70c01ca1595f839cf193df35d1575986e484ad35c4aeae7266c1", + "sha256:1cb51fe389360f3b5a4d57dbd2848a5f033350336ca3b340d1c53a1fad33bcad", + "sha256:2925720037f06e89af896c70bca73459d7e6a4be96f9de79e2d440bd499fe0db", + "sha256:3e374f59e440d4ab45ca2fffde54b81ac3834cf5ae2cdfa69c90bc03bde04d76", + "sha256:40ae1dffb3967a52203105a077415a86044a2bea011b5f321c6aa64b379a3f51", + "sha256:43498c0bdb43d55cb162cdc8c06fac328ccb5d2eabe3cadeb3529ae6f0517c32", + "sha256:4abfe0be0d7221be4f12552995e58723c7422c80a659da13ca382697de830c08", + "sha256:58b84b91b0b9f4bafac2a0ac55002280c094dfc6402402332c0913a59654ab2b", + "sha256:640cef9aa381b60e296db324337a554aeeb883ead99dc8f6c18e81a93942f5f4", + "sha256:66b479b0bd07204e37583c191535505410daa8df638fd8e75ae1b383851fe921", + "sha256:696039430f7a562b74fa45f540aca068ea85fa34c244d0deee539cb6d70aa288", + "sha256:6d2123dc9ad6a814bcdea0f099885276b31b24f7edf40f6cdbc0912672e22eee", + "sha256:8635c16bf3d99040fdf3ca3db669a7250ddf49c55dc4aa8fe0ae0fa8d6dcc1f0", + "sha256:873d13d177501a28b2756375d59816c365e42ed8417b41665f346289adc68d24", + "sha256:8e5a0b00e1e56a842f922e7fae8ae4077aee4af0acb5ae3622bd4b4c30aedf99", + "sha256:8e90497254aacacbc4ea6ae5e7a8cd75629d6ad2b30025a4a8b09aa4faf55151", + "sha256:9057e6aa78a584bc93a13f0a9bf7e753a5e9770a30b4d758b8d5f2a62a9433cd", + "sha256:90c6fca2acf139569e74e8781709dccb6fe25940488755716d1d354d6bc58bce", + "sha256:92fd6b027924a7e178ac202cfbe25e53368db90d56872d20ffae94b96c7acc57", + "sha256:9dfde2a0ddef507a631dc9dc4af6a9489d5e2e740e226ad426a05cabfbd7c8ef", + "sha256:9e79019aba43cb4fda9e4d983f8e88ca0373adbb697ae9c6c43093218de28b54", + "sha256:a77e9d1c386196879aa5eb712e77461aaee433e54c68cf253053a73b7e49c33a", + "sha256:c7adfc142dac335d8c1e0dcbd37eb8617eac386596eb9e1a1b77791cf2498238", + "sha256:d187d355ecec3629624fccb01d104da7d7f391db0311145817525281e2804d23", + "sha256:ddf818e4e6c7c6f4f7c8a12709696d193976b591cc7dc50588d3d1a6b5dc8772", + 
"sha256:e9b79011ff7a0f4b1d6da6a61aa1aa604fb312d6647de5bad20013682d1429ce", + "sha256:eee3a87076c0756de40b05c5e9a6069c035ba43e8dd71c379e68cab2c20f16ad" + ], + "index": "pypi", + "version": "==2.2.2" + }, "pandocfilters": { "hashes": [ "sha256:002b4a555ee4ebc03f8b66307e287fa492e4a77b4ea14d3f934328297bb4939e", @@ -794,6 +895,44 @@ "markers": "python_version >= '3.8'", "version": "==2.22" }, + "pycryptodome": { + "hashes": [ + "sha256:06d6de87c19f967f03b4cf9b34e538ef46e99a337e9a61a77dbe44b2cbcf0690", + "sha256:09609209ed7de61c2b560cc5c8c4fbf892f8b15b1faf7e4cbffac97db1fffda7", + "sha256:210ba1b647837bfc42dd5a813cdecb5b86193ae11a3f5d972b9a0ae2c7e9e4b4", + "sha256:2a1250b7ea809f752b68e3e6f3fd946b5939a52eaeea18c73bdab53e9ba3c2dd", + "sha256:2ab6ab0cb755154ad14e507d1df72de9897e99fd2d4922851a276ccc14f4f1a5", + "sha256:3427d9e5310af6680678f4cce149f54e0bb4af60101c7f2c16fdf878b39ccccc", + "sha256:3cd3ef3aee1079ae44afaeee13393cf68b1058f70576b11439483e34f93cf818", + "sha256:405002eafad114a2f9a930f5db65feef7b53c4784495dd8758069b89baf68eab", + "sha256:417a276aaa9cb3be91f9014e9d18d10e840a7a9b9a9be64a42f553c5b50b4d1d", + "sha256:4401564ebf37dfde45d096974c7a159b52eeabd9969135f0426907db367a652a", + "sha256:49a4c4dc60b78ec41d2afa392491d788c2e06edf48580fbfb0dd0f828af49d25", + "sha256:5601c934c498cd267640b57569e73793cb9a83506f7c73a8ec57a516f5b0b091", + "sha256:6e0e4a987d38cfc2e71b4a1b591bae4891eeabe5fa0f56154f576e26287bfdea", + "sha256:76658f0d942051d12a9bd08ca1b6b34fd762a8ee4240984f7c06ddfb55eaf15a", + "sha256:76cb39afede7055127e35a444c1c041d2e8d2f1f9c121ecef573757ba4cd2c3c", + "sha256:8d6b98d0d83d21fb757a182d52940d028564efe8147baa9ce0f38d057104ae72", + "sha256:9b3ae153c89a480a0ec402e23db8d8d84a3833b65fa4b15b81b83be9d637aab9", + "sha256:a60fedd2b37b4cb11ccb5d0399efe26db9e0dd149016c1cc6c8161974ceac2d6", + "sha256:ac1c7c0624a862f2e53438a15c9259d1655325fc2ec4392e66dc46cdae24d044", + "sha256:acae12b9ede49f38eb0ef76fdec2df2e94aad85ae46ec85be3648a57f0a7db04", + "sha256:acc2614e2e5346a4a4eab6e199203034924313626f9620b7b4b38e9ad74b7e0c", + "sha256:acf6e43fa75aca2d33e93409f2dafe386fe051818ee79ee8a3e21de9caa2ac9e", + "sha256:baee115a9ba6c5d2709a1e88ffe62b73ecc044852a925dcb67713a288c4ec70f", + "sha256:c18b381553638414b38705f07d1ef0a7cf301bc78a5f9bc17a957eb19446834b", + "sha256:d29daa681517f4bc318cd8a23af87e1f2a7bad2fe361e8aa29c77d652a065de4", + "sha256:d5954acfe9e00bc83ed9f5cb082ed22c592fbbef86dc48b907238be64ead5c33", + "sha256:ec0bb1188c1d13426039af8ffcb4dbe3aad1d7680c35a62d8eaf2a529b5d3d4f", + "sha256:ec1f93feb3bb93380ab0ebf8b859e8e5678c0f010d2d78367cf6bc30bfeb148e", + "sha256:f0e6d631bae3f231d3634f91ae4da7a960f7ff87f2865b2d2b831af1dfb04e9a", + "sha256:f35d6cee81fa145333137009d9c8ba90951d7d77b67c79cbe5f03c7eb74d8fe2", + "sha256:f47888542a0633baff535a04726948e876bf1ed880fddb7c10a736fa99146ab3", + "sha256:fb3b87461fa35afa19c971b0a2b7456a7b1db7b4eba9a8424666104925b78128" + ], + "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", + "version": "==3.20.0" + }, "pygments": { "hashes": [ "sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199", @@ -825,6 +964,13 @@ "markers": "python_version >= '3.6'", "version": "==2.0.7" }, + "pytz": { + "hashes": [ + "sha256:2a29735ea9c18baf14b448846bde5a48030ed267578472d8955cd0e7443a9812", + "sha256:328171f4e3623139da4983451950b28e95ac706e13f3f2630a879749e7a8b319" + ], + "version": "==2024.1" + }, "pyyaml": { "hashes": [ "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5", @@ -986,11 +1132,11 @@ }, "requests": { 
"hashes": [ - "sha256:f2c3881dddb70d056c5bd7600a4fae312b2a300e39be6a118d30b90bd27262b5", - "sha256:fa5490319474c82ef1d2c9bc459d3652e3ae4ef4c4ebdd18a21145a47ca4b6b8" + "sha256:dd951ff5ecf3e3b3aa26b40703ba77495dab41da839ae72ef3c8e5d8e2433289", + "sha256:fc06670dd0ed212426dfeb94fc1b983d917c4f9847c863f313c9dfaaffb7c23c" ], "markers": "python_version >= '3.8'", - "version": "==2.32.0" + "version": "==2.32.2" }, "rfc3339-validator": { "hashes": [ @@ -1214,9 +1360,17 @@ "sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0", "sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a" ], - "markers": "python_version < '3.12'", + "markers": "python_version >= '3.8'", "version": "==4.11.0" }, + "tzdata": { + "hashes": [ + "sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd", + "sha256:9068bc196136463f5245e51efda838afa15aaeca9903f49050dfa2679db4d252" + ], + "markers": "python_version >= '2'", + "version": "==2024.1" + }, "uri-template": { "hashes": [ "sha256:0e00f8eb65e18c7de20d595a14336e9f337ead580c70934141624b6d1ffdacc7", @@ -1229,7 +1383,7 @@ "sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d", "sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19" ], - "markers": "python_version >= '3.10'", + "markers": "python_version >= '3.8'", "version": "==2.2.1" }, "wcwidth": { @@ -1260,6 +1414,14 @@ ], "markers": "python_version >= '3.8'", "version": "==1.8.0" + }, + "zipp": { + "hashes": [ + "sha256:6278d9ddbcfb1f1089a88fde84481528b07b0e10474e09dcfe53dad4069fa059", + "sha256:dce197b859eb796242b0622af1b8beb0a722d52aa2f57133ead08edd5bf5374e" + ], + "markers": "python_version >= '3.8'", + "version": "==3.18.2" } }, "develop": { @@ -1483,7 +1645,7 @@ "sha256:83f085bd5ca59c80295fc2a82ab5dac679cbe02b9f33f7d83af68e241bea51b0", "sha256:c1f94d72897edaf4ce775bb7558d5b79d8126906a14ea5ed1635921406c0387a" ], - "markers": "python_version < '3.12'", + "markers": "python_version >= '3.8'", "version": "==4.11.0" }, "wcwidth": { diff --git a/docker-compose.yaml b/docker-compose.yaml index 3dd4717..b5b639b 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -5,7 +5,9 @@ version: '3' services: spark-master: - image: bitnami/spark:3.5.1 + build: + context: . + dockerfile: Dockerfile container_name: spark-master ports: - "8080:8080" @@ -51,10 +53,48 @@ services: environment: - SPARK_MASTER_URL=spark://spark-master:7077 + minio: + image: minio/minio + container_name: spark-minio + expose: + - "9000" + ports: + - "9000:9000" + # MinIO Console is available at http://localhost:9001 + - "9001:9001" + environment: + MINIO_ROOT_USER: minio + MINIO_ROOT_PASSWORD: minio123 + MINIO_ACCESS_KEY: minio + MINIO_SECRET_KEY: minio123 + healthcheck: + test: timeout 5s bash -c ':> /dev/tcp/127.0.0.1/9000' || exit 1 + interval: 1s + timeout: 10s + retries: 5 + # Note there is no bucket by default + command: server /data --console-address ":9001" + + minio-create-bucket: + image: minio/mc + depends_on: + minio: + condition: service_healthy + entrypoint: > + bash -c " + mc alias set minio http://minio:9000 minio minio123 && + if ! mc ls minio/delta-lake 2>/dev/null; then + mc mb minio/delta-lake && echo 'Bucket delta-lake created' + else + echo 'bucket delta-lake already exists' + fi + " + notebook: build: context: . 
      dockerfile: Dockerfile
+    entrypoint: /opt/scripts/notebook_entrypoint.sh
     container_name: spark-notebook
     ports:
       - "4041:4041"
     depends_on:
       - spark-master
     environment:
       - NOTEBOOK_PORT=4041
       - SPARK_MASTER_URL=spark://spark-master:7077
-      - SPARK_DRIVER_HOST=spark-notebook
\ No newline at end of file
+      - SPARK_DRIVER_HOST=spark-notebook
+      - MINIO_URL=http://minio:9000
+      - MINIO_ACCESS_KEY=minio
+      - MINIO_SECRET_KEY=minio123
+    volumes:
+      - ./cdm_shared_workspace:/cdm_shared_workspace
\ No newline at end of file
diff --git a/scripts/entrypoint.sh b/scripts/notebook_entrypoint.sh
similarity index 100%
rename from scripts/entrypoint.sh
rename to scripts/notebook_entrypoint.sh
diff --git a/src/spark/utils.py b/src/spark/utils.py
index d1c7137..2dcfe65 100644
--- a/src/spark/utils.py
+++ b/src/spark/utils.py
@@ -4,12 +4,15 @@
 from pyspark.sql import SparkSession
 
 
-def get_spark_session(app_name: str, local: bool = False) -> SparkSession:
+def get_spark_session(app_name: str,
+                      local: bool = False,
+                      delta_lake: bool = False) -> SparkSession:
     """
     Helper to get and manage the `SparkSession` and keep all of our spark configuration params in one place.
 
     :param app_name: The name of the application
-    :param local: Whether to run the spark session locally or not
+    :param local: Whether to run the spark session locally or not
+    :param delta_lake: Whether to build the spark session with Delta Lake support
 
     :return: A `SparkSession` object
     """
@@ -19,14 +22,39 @@ def get_spark_session(app_name: str, local: bool = False) -> SparkSession:
 
     spark_conf = SparkConf()
 
-    spark_conf.setAll(
-        [
-            (
-                "spark.master",
-                os.environ.get("SPARK_MASTER_URL", "spark://spark-master:7077"),
-            ),
-            ("spark.app.name", app_name),
-        ]
-    )
+    if delta_lake:
+
+        jars_dir = "/opt/bitnami/spark/jars/"
+        jar_files = [os.path.join(jars_dir, f) for f in os.listdir(jars_dir) if f.endswith(".jar")]
+        jars = ",".join(jar_files)
+
+        spark_conf.setAll(
+            [
+                (
+                    "spark.master",
+                    os.environ.get("SPARK_MASTER_URL", "spark://spark-master:7077"),
+                ),
+                ("spark.app.name", app_name),
+                ("spark.hadoop.fs.s3a.endpoint", os.environ.get("MINIO_URL")),
+                ("spark.hadoop.fs.s3a.access.key", os.environ.get("MINIO_ACCESS_KEY")),
+                ("spark.hadoop.fs.s3a.secret.key", os.environ.get("MINIO_SECRET_KEY")),
+                ("spark.jars", jars),
+                ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
+                ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
+                ("spark.hadoop.fs.s3a.path.style.access", "true"),
+                ("spark.hadoop.fs.s3a.connection.ssl.enabled", "false"),
+                ("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
+            ]
+        )
+    else:
+        spark_conf.setAll(
+            [
+                (
+                    "spark.master",
+                    os.environ.get("SPARK_MASTER_URL", "spark://spark-master:7077"),
+                ),
+                ("spark.app.name", app_name),
+            ]
+        )
 
     return SparkSession.builder.config(conf=spark_conf).getOrCreate()

From b5d2f8b12615ceee03e05712e6211044067f32e2 Mon Sep 17 00:00:00 2001
From: Tianhao-Gu
Date: Wed, 22 May 2024 16:07:10 -0500
Subject: [PATCH 2/8] only run chmod on sh files

---
 Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile b/Dockerfile
index 4646c08..b2333b2 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -34,7 +34,7 @@ COPY ./src/ /src
 ENV PYTHONPATH "${PYTHONPATH}:/src"
 
 COPY ./scripts/ /opt/scripts/
-RUN chmod a+x /opt/scripts/*
+RUN chmod a+x /opt/scripts/*.sh
 
 # Switch back to the original user
 USER ${ORI_USER}

From c7952c2bba8d8069bdc2d777fbae3e7e8d992fad Mon Sep 17 00:00:00 2001
From: Tianhao-Gu
Date: Wed, 22 May 2024 16:32:05 -0500
Subject: [PATCH 3/8] update readme

---
 README.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b006c7c..da4a02f 100644
--- a/README.md
+++ b/README.md
@@ -79,7 +79,10 @@ In addition, the environment variable `SPARK_MASTER_URL` should also be configur
 ```python
 from spark.utils import get_spark_session
 
-spark = get_spark_session('TestApp')
+spark = get_spark_session(app_name)
+
+# To build a Spark session for Delta Lake operations, set the delta_lake parameter to True
+spark = get_spark_session(app_name, delta_lake=True)
 ```
 
 #### Manually Configuring SparkSession/SparkContext

From 65aba5093b151397628f5f4d7b5f3188e615ecad Mon Sep 17 00:00:00 2001
From: Tianhao-Gu
Date: Wed, 22 May 2024 17:46:21 -0500
Subject: [PATCH 4/8] address comments

---
 Dockerfile          | 2 +-
 docker-compose.yaml | 1 +
 src/spark/utils.py  | 1 -
 3 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index b2333b2..52301a5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -20,7 +20,7 @@ ENV SCALA_VER=2.12
 RUN curl -O https://repo1.maven.org/maven2/io/delta/delta-spark_${SCALA_VER}/${DELTA_SPARK_VER}/delta-spark_${SCALA_VER}-${DELTA_SPARK_VER}.jar \
     && mv delta-spark_${SCALA_VER}-${DELTA_SPARK_VER}.jar /opt/bitnami/spark/jars
 
-Run curl -O https://repo1.maven.org/maven2/io/delta/delta-storage/${DELTA_SPARK_VER}/delta-storage-${DELTA_SPARK_VER}.jar \
+RUN curl -O https://repo1.maven.org/maven2/io/delta/delta-storage/${DELTA_SPARK_VER}/delta-storage-${DELTA_SPARK_VER}.jar \
     && mv delta-storage-${DELTA_SPARK_VER}.jar /opt/bitnami/spark/jars
 
 # install pipenv
diff --git a/docker-compose.yaml b/docker-compose.yaml
index b5b639b..d01be28 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -68,6 +68,7 @@ services:
       MINIO_ACCESS_KEY: minio
       MINIO_SECRET_KEY: minio123
     healthcheck:
+      # Check if the server is healthy by sending an HTTP request on port 9000
       test: timeout 5s bash -c ':> /dev/tcp/127.0.0.1/9000' || exit 1
       interval: 1s
       timeout: 10s
diff --git a/src/spark/utils.py b/src/spark/utils.py
index 2dcfe65..108696b 100644
--- a/src/spark/utils.py
+++ b/src/spark/utils.py
@@ -42,7 +42,6 @@ def get_spark_session(app_name: str,
                 ("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension"),
                 ("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog"),
                 ("spark.hadoop.fs.s3a.path.style.access", "true"),
-                ("spark.hadoop.fs.s3a.connection.ssl.enabled", "false"),
                 ("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem"),
             ]
         )

From 2db6e46c28d41e9deb5a8519d9f41ec95877225c Mon Sep 17 00:00:00 2001
From: Tianhao-Gu
Date: Wed, 22 May 2024 17:55:02 -0500
Subject: [PATCH 5/8] add todo to use Gradle for JAR

---
 Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Dockerfile b/Dockerfile
index 52301a5..b43afbc 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -9,6 +9,7 @@ RUN apt-get update && apt-get install -y \
     gcc curl \
     && rm -rf /var/lib/apt/lists/*
 
+# TODO: use Gradle to build the jar
 # Install jars to support delta lake spark operations
 ENV HADOOP_AWS_VER=3.3.4

From 9de9dd02af97fdf4a9b774d1f2125f17fa446526 Mon Sep 17 00:00:00 2001
From: Tianhao-Gu
Date: Wed, 22 May 2024 18:27:08 -0500
Subject: [PATCH 6/8] using rancher dir path

---
 .gitignore          | 2 +-
 docker-compose.yaml | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/.gitignore b/.gitignore
index 478ea88..9d06e6c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,4 +2,4 @@
 .idea
 .coverage
 *_pycache__
-cdm_shared_workspace/
\ No newline at end of file
+cdr/cdm/jupyter/
\ No newline at end of file
diff --git a/docker-compose.yaml b/docker-compose.yaml
index d01be28..652e648 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -68,7 +68,7 @@ services:
       MINIO_ACCESS_KEY: minio
       MINIO_SECRET_KEY: minio123
     healthcheck:
-      # Check if the server is healthy by sending an HTTP request on port 9000
+      # reference: https://github.com/rodrigobdz/docker-compose-healthchecks?tab=readme-ov-file#minio-release2023-11-01t18-37-25z-and-older
       test: timeout 5s bash -c ':> /dev/tcp/127.0.0.1/9000' || exit 1
       interval: 1s
       timeout: 10s
@@ -101,6 +101,7 @@
       - "4041:4041"
     depends_on:
       - spark-master
+      - minio-create-bucket
     environment:
       - NOTEBOOK_PORT=4041
       - SPARK_MASTER_URL=spark://spark-master:7077
@@ -109,4 +110,4 @@
       - MINIO_ACCESS_KEY=minio
       - MINIO_SECRET_KEY=minio123
     volumes:
-      - ./cdm_shared_workspace:/cdm_shared_workspace
\ No newline at end of file
+      - ./cdr/cdm/jupyter:/cdm_shared_workspace
\ No newline at end of file

From 23c934a4014cd8aef3527cdd4569dc6bc0891714 Mon Sep 17 00:00:00 2001
From: Tianhao-Gu
Date: Wed, 22 May 2024 19:26:51 -0500
Subject: [PATCH 7/8] using mode to control entrypoint script

---
 Dockerfile            | 3 +++
 docker-compose.yaml   | 2 +-
 scripts/entrypoint.sh | 7 +++++++
 3 files changed, 11 insertions(+), 1 deletion(-)
 create mode 100644 scripts/entrypoint.sh

diff --git a/Dockerfile b/Dockerfile
index b43afbc..6bbb5f1 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -39,3 +39,6 @@ RUN chmod a+x /opt/scripts/*.sh
 
 # Switch back to the original user
 USER ${ORI_USER}
+
+ENTRYPOINT ["/opt/scripts/entrypoint.sh"]
+CMD ["/opt/bitnami/scripts/spark/run.sh"]
diff --git a/docker-compose.yaml b/docker-compose.yaml
index 652e648..a74d845 100644
--- a/docker-compose.yaml
+++ b/docker-compose.yaml
@@ -95,7 +95,6 @@ services:
     build:
       context: .
      dockerfile: Dockerfile
-    entrypoint: /opt/scripts/notebook_entrypoint.sh
     container_name: spark-notebook
     ports:
       - "4041:4041"
@@ -109,5 +108,6 @@
       - MINIO_URL=http://minio:9000
       - MINIO_ACCESS_KEY=minio
       - MINIO_SECRET_KEY=minio123
+      - SPARK_MODE=notebook
     volumes:
       - ./cdr/cdm/jupyter:/cdm_shared_workspace
\ No newline at end of file
diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh
new file mode 100644
index 0000000..93b1a4c
--- /dev/null
+++ b/scripts/entrypoint.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+
+if [ "$SPARK_MODE" = "notebook" ]; then
+  exec /opt/scripts/notebook_entrypoint.sh "$@"
+else
+  exec /opt/bitnami/scripts/spark/entrypoint.sh "$@"
+fi
\ No newline at end of file

From 796a8d463ffafc2959bddb69333e0e1115a6a882 Mon Sep 17 00:00:00 2001
From: Tianhao-Gu
Date: Thu, 23 May 2024 15:57:34 -0500
Subject: [PATCH 8/8] move last CMD into entrypoint

---
 Dockerfile            | 1 -
 scripts/entrypoint.sh | 5 ++++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 6bbb5f1..7ddc1de 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -41,4 +41,3 @@ RUN chmod a+x /opt/scripts/*.sh
 USER ${ORI_USER}
 
 ENTRYPOINT ["/opt/scripts/entrypoint.sh"]
-CMD ["/opt/bitnami/scripts/spark/run.sh"]
diff --git a/scripts/entrypoint.sh b/scripts/entrypoint.sh
index 93b1a4c..35ab63c 100644
--- a/scripts/entrypoint.sh
+++ b/scripts/entrypoint.sh
@@ -3,5 +3,8 @@
 if [ "$SPARK_MODE" = "notebook" ]; then
   exec /opt/scripts/notebook_entrypoint.sh "$@"
 else
-  exec /opt/bitnami/scripts/spark/entrypoint.sh "$@"
+  # In the bitnami/spark Dockerfile, the entrypoint is set to /opt/bitnami/scripts/spark/entrypoint.sh, followed
+  # by CMD ["/opt/bitnami/scripts/spark/run.sh"], meaning the entrypoint expects the run.sh script as an argument.
+  # reference: https://github.com/bitnami/containers/blob/main/bitnami/spark/3.5/debian-12/Dockerfile#L69
+  exec /opt/bitnami/scripts/spark/entrypoint.sh "$@" /opt/bitnami/scripts/spark/run.sh
 fi
\ No newline at end of file
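
---

A minimal way to exercise the whole series from the notebook container once the compose stack is up — a sketch, not part of the patches: it assumes the `minio-create-bucket` service has already created the `delta-lake` bucket, and the `smoke_test` path under that bucket is an arbitrary name chosen for illustration.

```python
from spark.utils import get_spark_session

# Build a session with the Delta Lake extension, catalog, and s3a/MinIO
# settings configured in src/spark/utils.py (relies on the MINIO_URL,
# MINIO_ACCESS_KEY, and MINIO_SECRET_KEY env vars set in docker-compose.yaml).
spark = get_spark_session("DeltaSmokeTest", delta_lake=True)

# Round-trip a small Delta table through the MinIO bucket created at startup.
path = "s3a://delta-lake/smoke_test"  # hypothetical table location
spark.range(10).write.format("delta").mode("overwrite").save(path)
spark.read.format("delta").load(path).show()
```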