Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add integration for openhouse-java-iceberg-1.5-runtime and openhouse-spark-3.5-runtime #221

Merged
merged 9 commits into from
Nov 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/build-tag-publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
run: pip install -r scripts/python/requirements.txt

- name: Run Integration Tests
run: python scripts/python/integration_test.py ./tables-test-fixtures/src/main/resources/dummy.token
run: python scripts/python/integration_test.py ./tables-test-fixtures/tables-test-fixtures-iceberg-1.2/src/main/resources/dummy.token

- name: Stop Docker Containers
run: docker compose -f infra/recipes/docker-compose/oh-only/docker-compose.yml down
Expand Down
8 changes: 4 additions & 4 deletions apps/spark/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ dependencies {
implementation project(':client:secureclient')
implementation project(':services:common')
implementation project(':cluster:storage')
compileOnly (project(path: ':integrations:spark:openhouse-spark-runtime_2.12')) {
compileOnly (project(path: ':integrations:spark:spark-3.1:openhouse-spark-runtime_2.12')) {
exclude group: 'io.netty'
exclude group: 'org.apache.hadoop', module: 'hadoop-common'
exclude group: 'org.apache.hadoop', module: 'hadoop-client'
Expand Down Expand Up @@ -82,7 +82,7 @@ dependencies {
implementation 'io.opentelemetry:opentelemetry-semconv:1.14.0-alpha'
implementation 'org.apache.commons:commons-lang3:3.12.0'

testImplementation (project(path: ':integrations:spark:openhouse-spark-runtime_2.12', configuration: 'shadow')) {
testImplementation (project(path: ':integrations:spark:spark-3.1:openhouse-spark-runtime_2.12', configuration: 'shadow')) {
exclude group: 'io.netty'
exclude group: 'org.apache.hadoop', module: 'hadoop-common'
exclude group: 'org.apache.hadoop', module: 'hadoop-client'
Expand All @@ -92,7 +92,7 @@ dependencies {
testImplementation 'org.mockito:mockito-inline:4.11.0'
testImplementation 'org.powermock:powermock-module-junit4:2.0.9'
testImplementation 'org.powermock:powermock-api-mockito2:2.0.9'
testImplementation(project(':tables-test-fixtures_2.12')) {
testImplementation(project(':tables-test-fixtures:tables-test-fixtures_2.12')) {
exclude group: "io.netty"
}
testRuntimeOnly("org.eclipse.jetty:jetty-server:11.0.2")
Expand All @@ -101,7 +101,7 @@ dependencies {

// Need spark runtime to be built before this test for this project to run successfully because compileOnly and
// testImplementation dependencies are not triggering it.
test.dependsOn ':integrations:spark:openhouse-spark-runtime_2.12:build'
test.dependsOn ':integrations:spark:spark-3.1:openhouse-spark-runtime_2.12:build'

shadowJar {
zip64 = true
Expand Down
15 changes: 8 additions & 7 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -57,18 +57,19 @@ allprojects {
}
}

configurations.all {
resolutionStrategy {
force 'com.fasterxml.jackson:jackson-bom:2.13.4'
force 'com.fasterxml.jackson.core:jackson-databind:2.13.4'
force 'org.apache.orc:orc-core:1.8.3'
force 'com.google.guava:guava:31.1-jre'
if (it.path != ':integrations:spark:spark-3.5:openhouse-spark-3.5-itest') {
configurations.all {
jiang95-dev marked this conversation as resolved.
Show resolved Hide resolved
resolutionStrategy {
force 'com.fasterxml.jackson:jackson-bom:2.13.4'
force 'com.fasterxml.jackson.core:jackson-databind:2.13.4'
force 'org.apache.orc:orc-core:1.8.3'
force 'com.google.guava:guava:31.1-jre'
}
}
}

plugins.withType(JavaPlugin) {
dependencies {

testImplementation "org.assertj:assertj-core:3.24.2" //assertions library
testImplementation "org.junit.jupiter:junit-jupiter-api:" + junit_version
testImplementation "org.junit.jupiter:junit-jupiter-params:" + junit_version
Expand Down
2 changes: 1 addition & 1 deletion docs/development/intellij-setup.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ To ensure they are properly indexed by IDE, you need to:
### Fixing missing classes errors
Intellij doesn't work nicely with shadow plugin, this is a known issue tracked here: [IntelliJ IDEA unable to resolve classes of project dependency shadow jar #264](https://github.com/johnrengelman/shadow/issues/264)

Modules such as `:integrations:spark:openhouse-spark-itest` `:integrations:java:openhouse-java-itest` face this issue.
Modules such as `:integrations:spark:spark-3.1:openhouse-spark-itest` `:integrations:java:iceberg-1.2:openhouse-java-itest` face this issue.

To fix this issue please follow instructions as follows: [define-library](https://www.jetbrains.com/help/idea/library.html#define-library)
![](ide-setup-for-shadow-jars.gif)
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Multi-stage build: stage 1 builds Apache Livy (with a local patch) against
# Spark 3.5 / Hadoop 3; stage 2 assembles the runtime image on a Hadoop base.
FROM openjdk:11.0.11-jdk-slim-buster AS builder

# Base tooling plus the Python stack needed by PySpark.
# apt lists are removed afterwards to keep the layer small.
RUN apt-get update && apt-get install -y \
    git curl vim zip software-properties-common ssh net-tools ca-certificates \
    # Add Dependencies for PySpark \
    python3 python3-pip python3-numpy python3-matplotlib python3-scipy python3-pandas python3-simpy \
    && rm -rf /var/lib/apt/lists/*

# Make "python" resolve to python3 for Spark's scripts.
RUN update-alternatives --install "/usr/bin/python" "python" "$(which python3)" 1

# Fix the value of PYTHONHASHSEED
# Note: this is needed when you use Python 3.3 or greater
ENV SPARK_VERSION=3.5.2 \
    LIVY_VERSION=0.8.0-incubating-SNAPSHOT \
    LIVY_HOME=/opt/livy \
    HADOOP_VERSION=3 \
    SPARK_HOME=/opt/spark \
    MAVEN_VERSION=3.9.4 \
    PYTHONHASHSEED=1

# install apache spark
RUN curl --no-verbose -o apache-spark.tgz \
    "https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz" \
    && mkdir -p /opt/spark \
    && tar -xf apache-spark.tgz -C /opt/spark --strip-components=1 \
    && rm apache-spark.tgz

# install maven to build apache livy
RUN curl --no-verbose -o maven.tgz https://dlcdn.apache.org/maven/maven-3/${MAVEN_VERSION}/binaries/apache-maven-${MAVEN_VERSION}-bin.tar.gz \
    && mkdir -p /opt/maven \
    && tar -xf maven.tgz -C /opt/maven --strip-components=1 \
    && rm maven.tgz

# build with patch and install apache livy
# NOTE(review): Livy is built with the -Pspark-3.0 profile while the image
# ships Spark 3.5; presumably the applied patch bridges that gap — confirm.
ARG ARCHIVE_FILENAME="apache-livy-${LIVY_VERSION}-bin"
COPY /infra/recipes/docker-compose/common/spark/livy_spark3_hadoop3.patch /
RUN git clone https://github.com/apache/incubator-livy \
    && cd incubator-livy \
    && git checkout 4d8a912699683b973eee76d4e91447d769a0cb0d \
    && git apply /livy_spark3_hadoop3.patch \
    && rm /livy_spark3_hadoop3.patch \
    && /opt/maven/bin/mvn clean package -B -V -e -Pspark-3.0 -Pthriftserver -DskipTests -DskipITs -Dmaven.javadoc.skip=true \
    && unzip -qq ./assembly/target/${ARCHIVE_FILENAME}.zip -d /opt \
    && mv "/opt/${ARCHIVE_FILENAME}" "${LIVY_HOME}" \
    && rm -rf "/incubator-livy"


# Final image: Hadoop namenode base with Spark and Livy copied in from the
# builder stage.
FROM bde2020/hadoop-namenode:2.0.0-hadoop3.2.1-java8

ENV LIVY_HOME=/opt/livy \
    SPARK_HOME=/opt/spark

COPY --from=builder /opt/livy /opt/livy
COPY --from=builder /opt/spark /opt/spark

WORKDIR $SPARK_HOME

ENV HADOOP_HOME=/opt/hadoop-3.2.1
ENV HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
# Spark master/worker ports, log locations, and the master URL workers join.
ENV SPARK_MASTER_PORT=7077 \
    SPARK_MASTER_WEBUI_PORT=8080 \
    SPARK_LOG_DIR=$SPARK_HOME/logs
ENV SPARK_MASTER_LOG=$SPARK_LOG_DIR/spark-master.out \
    SPARK_WORKER_LOG=$SPARK_LOG_DIR/spark-worker.out \
    SPARK_WORKER_WEBUI_PORT=8080 \
    SPARK_WORKER_PORT=7000 \
    SPARK_MASTER="spark://spark-master:7077"
ENV OTEL_EXPORTER_OTLP_ENDPOINT="http://localhost:4318" \
    OTEL_EXPORTER_OTLP_METRICS_PROTOCOL="http"

# 8080: Spark web UI, 7077: Spark master, 7000: Spark worker, 8998: Livy REST.
EXPOSE 8080 7077 7000 8998

# Symlink master/worker logs to stdout so `docker logs` captures them.
RUN mkdir -p $SPARK_LOG_DIR && \
    touch $SPARK_MASTER_LOG && \
    touch $SPARK_WORKER_LOG && \
    ln -sf /dev/stdout $SPARK_MASTER_LOG && \
    ln -sf /dev/stdout $SPARK_WORKER_LOG

WORKDIR $LIVY_HOME

RUN mkdir -p "${LIVY_HOME}/logs"

COPY /infra/recipes/docker-compose/common/spark/start-spark.sh /
COPY /build/openhouse-spark-3.5-runtime_2.12/libs/openhouse-spark-3.5-runtime_2.12-uber.jar $SPARK_HOME/openhouse-spark-runtime_2.12-latest-all.jar
COPY /build/openhouse-spark-apps_2.12/libs/openhouse-spark-apps_2.12-uber.jar $SPARK_HOME/openhouse-spark-apps_2.12-latest-all.jar
COPY /build/dummytokens/libs/dummytokens*.jar /dummytokens.jar
# Generate dummy auth tokens consumed by the docker-compose test recipes.
RUN java -jar /dummytokens.jar -d /var/config/

ARG OH_USERNAME=openhouse
ARG OH_GROUPNAME=$OH_USERNAME
ARG OH_USER_ID=1000
ARG OH_GROUP_ID=$OH_USER_ID
ENV OH_USER_HOME=/home/$OH_USERNAME

# Create an openhouse user as there's no reason to run as root user
RUN groupadd --force -g $OH_GROUP_ID $OH_USERNAME \
    && useradd -d $OH_USER_HOME -m $OH_USERNAME -u $OH_USER_ID -g $OH_GROUP_ID

RUN chown -R $OH_USERNAME:$OH_GROUPNAME /opt

ARG TABLE_OWNER_USERNAME=u_tableowner
ARG TABLE_OWNER_GROUPNAME=$TABLE_OWNER_USERNAME
ARG TABLE_OWNER_USER_ID=1001
ARG TABLE_OWNER_GROUP_ID=$TABLE_OWNER_USER_ID
ENV TABLE_OWNER_USER_HOME=/home/$TABLE_OWNER_USERNAME

# Create a separate table-owner user (distinct from the openhouse user) for
# tests that exercise table-ownership behavior
RUN groupadd --force -g $TABLE_OWNER_GROUP_ID $TABLE_OWNER_USERNAME \
    && useradd -d $TABLE_OWNER_USER_HOME -m $TABLE_OWNER_USERNAME -u $TABLE_OWNER_USER_ID -g $TABLE_OWNER_GROUP_ID


USER $OH_USERNAME

WORKDIR /opt/spark

CMD ["/bin/bash", "/start-spark.sh"]
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ ext {
}

dependencies {
testImplementation(project(path: ':integrations:java:openhouse-java-runtime', configuration: 'shadow'))
testImplementation(project(path: ':integrations:java:iceberg-1.2:openhouse-java-runtime', configuration: 'shadow'))

testImplementation "com.squareup.okhttp3:okhttp:" + ok_http3_version
testImplementation "com.squareup.okhttp3:mockwebserver:" + ok_http3_version
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,9 @@
import org.junit.jupiter.api.Test;

/**
* This class tests out packaging done in :integrations:java:openhouse-java-runtime. The goal of
* this test is to ensure all of `tableclient` functionality, such as configuring {@link ApiClient},
* {@link TableApi}, making REST call can be satisfied by the singular jar
* This class tests out packaging done in :integrations:java:iceberg-1.2:openhouse-java-runtime. The
* goal of this test is to ensure all of `tableclient` functionality, such as configuring {@link
* ApiClient}, {@link TableApi}, making REST call can be satisfied by the singular jar
* `openhouse-java-runtime.jar`. These tests do not test complete functionality, rather it tests
* various interfaces and their integration.
*/
Expand Down
24 changes: 24 additions & 0 deletions integrations/java/iceberg-1.5/openhouse-java-itest/build.gradle
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
// Build script for the Iceberg 1.5 flavor of the OpenHouse Java client
// integration tests. It reuses the test sources of the iceberg-1.2 itest
// module and runs them against the shaded iceberg-1.5 runtime jar, so both
// runtime flavors are verified by a single test suite.
plugins {
id 'openhouse.java-minimal-conventions'
id 'openhouse.maven-publish'
}

// Iceberg version this test module is pinned to.
ext {
icebergVersion = '1.5.2'
}

// Borrow the iceberg-1.2 itest sources verbatim instead of duplicating them.
sourceSets {
test {
java {
srcDirs += project(':integrations:java:iceberg-1.2:openhouse-java-itest').sourceSets.test.java.srcDirs
}
}
}

dependencies {
// Run the shared tests against the shaded (uber) iceberg-1.5 runtime jar.
testImplementation(project(path: ':integrations:java:iceberg-1.5:openhouse-java-iceberg-1.5-runtime', configuration: 'shadow'))

testImplementation "com.squareup.okhttp3:okhttp:4.9.3"
testImplementation "com.squareup.okhttp3:mockwebserver:4.9.3"
testImplementation "org.apache.iceberg:iceberg-bundled-guava:" + icebergVersion
}
94 changes: 94 additions & 0 deletions integrations/java/iceberg-1.5/openhouse-java-runtime/build.gradle
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
// Build script for openhouse-java-iceberg-1.5-runtime: packages the OpenHouse
// Java client (secureclient + tableclient) as a single shaded "uber" jar built
// against Iceberg 1.5, relocating bundled third-party classes so the jar can
// be dropped into consumer classpaths without version conflicts.
plugins {
id 'openhouse.java-minimal-conventions'
id 'openhouse.maven-publish'
id 'com.github.johnrengelman.shadow' version '7.1.2'
}

configurations {
// Dependencies that should be bundled INTO the uber jar (see shadowJar's
// `configurations` below); everything excluded here is expected to be
// provided by the consumer instead.
fatJarPackagedDependencies {
// Following exclusions are not needed during runtime
// and often cause conflict with existing classpath.
exclude(group: 'org.slf4j') // logging libraries
exclude(group: 'org.apache.log4j')
exclude(group: 'org.apache.logging.log4j')
exclude(group: 'org.mapstruct')
exclude(group: 'io.micrometer') // not used in client
exclude(group: 'ch.qos.logback')
}
shadow.extendsFrom implementation
}

// Version pins for this runtime flavor.
// NOTE(review): sparkVersion is not referenced elsewhere in this script —
// presumably read by a convention plugin; confirm before removing.
ext {
icebergVersion = '1.5.2'
sparkVersion = '3.5.2'
springVersion = '2.7.8'
hadoopVersion = '2.10.0'
}

// Reuse the iceberg-1.2 runtime sources/resources so both Iceberg flavors
// share one codebase and differ only in the dependency versions below.
sourceSets {
main {
java {
srcDirs += project(':integrations:java:iceberg-1.2:openhouse-java-runtime').sourceSets.main.java.srcDirs
}
resources {
srcDirs += project(':integrations:java:iceberg-1.2:openhouse-java-runtime').sourceSets.main.resources.srcDirs
}
}
}

dependencies {
// compileOnly: needed to compile the shared sources, but supplied at runtime
// by the consumer (or bundled via fatJarPackagedDependencies below).
compileOnly project(':client:secureclient')
compileOnly project(':client:tableclient')
compileOnly("org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:" + icebergVersion)
compileOnly ("org.springframework.boot:spring-boot-starter-webflux:" + springVersion)

implementation 'org.apache.commons:commons-lang3:3.12.0'
// secureclient (and its transitive deps) are what actually get shaded in.
fatJarPackagedDependencies(project(':client:secureclient'))
implementation("org.apache.iceberg:iceberg-core:" + icebergVersion)
implementation("org.apache.hadoop:hadoop-client:" + hadoopVersion) {
exclude group: 'junit', module: 'junit'
exclude group: 'javax', module: 'servlet-api'
exclude group: "io.netty"
exclude group: 'com.zaxxer', module: 'HikariCP-java7'
exclude group: 'org.apache.commons', module: 'commons-lang3'
}
}

// Following codeblock completely relocates contents of the jar
// except for source code written in the module. As a result,
// we remove chances of classpath conflicts during runtime/compiletime.
shadowJar {
dependencies {
exclude("org/springframework/http/codec/CodecConfigurer.properties")
exclude("javax/**")
exclude("okio/**")
exclude("kotlin/**")

// Relocations: bundled io.*/org.*/com.* packages move under
// com.linkedin.openhouse.relocated.*; the excludes keep JDK-provided and
// consumer-provided packages (sax, hadoop, iceberg, slf4j) at their
// original coordinates, and generated client packages get stable
// com.linkedin.openhouse.gen.* names.
relocate ('io.', 'com.linkedin.openhouse.relocated.io.')
relocate('org','com.linkedin.openhouse.relocated.org') {
exclude 'org.xml.sax.**'
exclude 'org.apache.hadoop.**'
exclude 'org.apache.iceberg.**'
exclude 'org.slf4j.**'
}
relocate('reactor', 'com.linkedin.openhouse.relocated.reactor')
relocate('com.linkedin.openhouse.jobs.client', 'com.linkedin.openhouse.gen.job.client')
relocate('com.linkedin.openhouse.tables.client', 'com.linkedin.openhouse.gen.tables.client')
relocate('com.linkedin.openhouse.client.ssl', 'com.linkedin.openhouse.gen.client.ssl')
relocate('com.linkedin.openhouse.housetables.client', 'com.linkedin.openhouse.gen.housetables.client')
relocate('com', 'com.linkedin.openhouse.relocated.com') {
exclude 'com.linkedin.openhouse.**'
}
}
// Jackson file not needed in newer version
exclude 'module-info.class'
// service file not needed for client
exclude 'log4j2.springboot'

configurations = [project.configurations.fatJarPackagedDependencies]
mergeServiceFiles()
archiveClassifier.set('uber')
zip64 true
}

// Keep the plain (non-shaded) jar alongside the uber jar.
jar.enabled=true
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ dependencies {

testImplementation 'com.google.code.gson:gson:2.8.9'

testImplementation(project(path: ':integrations:spark:openhouse-spark-runtime_2.12', configuration: 'shadow')) {
testImplementation(project(path: ':integrations:spark:spark-3.1:openhouse-spark-runtime_2.12', configuration: 'shadow')) {
exclude group: 'org.apache.commons', module: 'commons-lang3'
}

Expand All @@ -16,7 +16,7 @@ dependencies {
exclude group: "io.netty"
}

testImplementation project(':tables-test-fixtures_2.12')
testImplementation project(':tables-test-fixtures:tables-test-fixtures_2.12')
testImplementation 'org.junit.platform:junit-platform-runner:1.11.0'
testRuntimeOnly 'org.junit.platform:junit-platform-launcher:1.11.0'
// Required to test /tables mockserver
Expand Down
Loading
Loading